diff --git "a/training.log" "b/training.log" --- "a/training.log" +++ "b/training.log" @@ -1,16 +1,16 @@ -[2023-12-06 04:40:04,481] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-06 04:40:06,410] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. -[2023-12-06 04:40:06,410] [INFO] [runner.py:570:main] cmd = /home/t-sokumar/miniconda3/envs/ft/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 2,4,4 --model_name_or_path meta-llama/Llama-2-7b-hf --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 10 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --gradient_checkpointing --zero_stage 3 --deepspeed --lora_dim 128 --lora_module_name layers. --output_dir ./output_step1_llama2_7b_lora -[2023-12-06 04:40:09,034] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-06 04:40:11,238] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]} -[2023-12-06 04:40:11,239] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0 -[2023-12-06 04:40:11,239] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3]}) -[2023-12-06 04:40:11,239] [INFO] [launch.py:163:main] dist_world_size=4 -[2023-12-06 04:40:11,239] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 -[2023-12-06 04:40:14,677] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-06 04:40:14,691] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-06 04:40:14,691] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-06 04:40:14,713] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:26,818] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:28,764] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2023-12-06 08:40:28,764] [INFO] [runner.py:570:main] cmd = /home/t-sokumar/miniconda3/envs/ft/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 2,4,4 --model_name_or_path meta-llama/Llama-2-7b-hf --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 300 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --gradient_checkpointing --zero_stage 3 --deepspeed --lora_dim 128 --lora_module_name layers. 
--output_dir ./output_step1_llama2_7b_lora +[2023-12-06 08:40:31,316] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:33,360] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]} +[2023-12-06 08:40:33,361] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0 +[2023-12-06 08:40:33,361] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3]}) +[2023-12-06 08:40:33,361] [INFO] [launch.py:163:main] dist_world_size=4 +[2023-12-06 08:40:33,361] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 +[2023-12-06 08:40:36,779] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:36,781] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:36,795] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-06 08:40:36,796] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations @@ -19,16 +19,16 @@ warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. 
Please import deepspeed modules directly from transformers.integrations warnings.warn( -[2023-12-06 04:40:16,288] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-06 04:40:16,288] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2023-12-06 04:40:16,558] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-06 04:40:16,642] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-06 04:40:16,658] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-06 04:40:20,136] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 6.74B - Loading checkpoint shards: 0%| | 0/2 [00:00 -[2023-12-06 04:40:29,778] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False -[2023-12-06 04:40:29,778] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 3 optimizer -[2023-12-06 04:40:29,894] [INFO] [utils.py:795:see_memory_usage] Stage 3 initialize beginning -[2023-12-06 04:40:29,895] [INFO] [utils.py:796:see_memory_usage] MA 4.17 GB Max_MA 4.39 GB CA 7.75 GB Max_CA 8 GB -[2023-12-06 04:40:29,895] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 13.59 GB, percent = 5.4% -[2023-12-06 04:40:29,897] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 -[2023-12-06 04:40:29,897] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 -[2023-12-06 04:40:29,998] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] -[2023-12-06 04:40:29,999] [INFO] [utils.py:796:see_memory_usage] MA 4.17 GB Max_MA 4.17 GB CA 7.75 GB Max_CA 8 GB -[2023-12-06 04:40:29,999] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 13.59 GB, percent = 5.4% +[2023-12-06 08:40:51,217] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.4, git-hash=unknown, git-branch=unknown +[2023-12-06 08:40:51,217] [INFO] [comm.py:662:init_distributed] Distributed backend already initialized +[2023-12-06 08:40:51,257] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2023-12-06 08:40:51,258] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2023-12-06 08:40:51,259] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2023-12-06 08:40:51,296] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2023-12-06 08:40:51,297] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2023-12-06 08:40:51,297] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2023-12-06 08:40:51,297] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 3 optimizer +[2023-12-06 08:40:51,415] [INFO] [utils.py:795:see_memory_usage] Stage 3 initialize beginning +[2023-12-06 08:40:51,416] [INFO] [utils.py:796:see_memory_usage] MA 4.17 GB Max_MA 4.4 GB CA 7.8 GB Max_CA 8 GB +[2023-12-06 08:40:51,416] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 19.56 GB, percent = 7.8% +[2023-12-06 08:40:51,418] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 +[2023-12-06 08:40:51,418] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 +[2023-12-06 08:40:51,530] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2023-12-06 08:40:51,530] 
[INFO] [utils.py:796:see_memory_usage] MA 4.17 GB Max_MA 4.17 GB CA 7.8 GB Max_CA 8 GB +[2023-12-06 08:40:51,531] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 19.55 GB, percent = 7.8% Parameter Offload: Total persistent parameters: 266240 in 65 params -[2023-12-06 04:40:30,335] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [end] -[2023-12-06 04:40:30,335] [INFO] [utils.py:796:see_memory_usage] MA 3.35 GB Max_MA 4.23 GB CA 7.75 GB Max_CA 8 GB -[2023-12-06 04:40:30,336] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 13.6 GB, percent = 5.4% -[2023-12-06 04:40:30,446] [INFO] [utils.py:795:see_memory_usage] Before creating fp16 partitions -[2023-12-06 04:40:30,447] [INFO] [utils.py:796:see_memory_usage] MA 3.35 GB Max_MA 3.35 GB CA 7.75 GB Max_CA 8 GB -[2023-12-06 04:40:30,447] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 13.57 GB, percent = 5.4% -[2023-12-06 04:40:31,148] [INFO] [utils.py:795:see_memory_usage] After creating fp16 partitions: 3 -[2023-12-06 04:40:31,149] [INFO] [utils.py:796:see_memory_usage] MA 3.35 GB Max_MA 3.35 GB CA 4.38 GB Max_CA 8 GB -[2023-12-06 04:40:31,149] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 13.64 GB, percent = 5.4% -[2023-12-06 04:40:31,292] [INFO] [utils.py:795:see_memory_usage] Before creating fp32 partitions -[2023-12-06 04:40:31,293] [INFO] [utils.py:796:see_memory_usage] MA 3.35 GB Max_MA 3.35 GB CA 4.38 GB Max_CA 4 GB -[2023-12-06 04:40:31,293] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 11.18 GB, percent = 4.4% -[2023-12-06 04:40:31,429] [INFO] [utils.py:795:see_memory_usage] After creating fp32 partitions -[2023-12-06 04:40:31,430] [INFO] [utils.py:796:see_memory_usage] MA 3.9 GB Max_MA 4.05 GB CA 5.07 GB Max_CA 5 GB -[2023-12-06 04:40:31,430] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 11.19 GB, percent = 4.4% -[2023-12-06 04:40:31,541] [INFO] [utils.py:795:see_memory_usage] Before initializing optimizer states -[2023-12-06 04:40:31,541] [INFO] [utils.py:796:see_memory_usage] MA 3.9 GB Max_MA 3.9 GB CA 5.07 GB Max_CA 5 GB -[2023-12-06 04:40:31,542] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 11.19 GB, percent = 4.4% -[2023-12-06 04:40:31,686] [INFO] [utils.py:795:see_memory_usage] After initializing optimizer states -[2023-12-06 04:40:31,687] [INFO] [utils.py:796:see_memory_usage] MA 4.98 GB Max_MA 5.28 GB CA 6.46 GB Max_CA 6 GB -[2023-12-06 04:40:31,687] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 11.15 GB, percent = 4.4% -[2023-12-06 04:40:31,688] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized -[2023-12-06 04:40:32,196] [INFO] [utils.py:795:see_memory_usage] After initializing ZeRO optimizer -[2023-12-06 04:40:32,196] [INFO] [utils.py:796:see_memory_usage] MA 6.19 GB Max_MA 6.67 GB CA 8.14 GB Max_CA 8 GB -[2023-12-06 04:40:32,196] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 11.18 GB, percent = 4.4% -[2023-12-06 04:40:32,197] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam -[2023-12-06 04:40:32,197] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler -[2023-12-06 04:40:32,197] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = -[2023-12-06 04:40:32,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[9.65e-06, 0.0005, 9.65e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-06 04:40:32,198] [INFO] 
[config.py:979:print] DeepSpeedEngine configuration: -[2023-12-06 04:40:32,198] [INFO] [config.py:983:print] activation_checkpointing_config { +[2023-12-06 08:40:51,844] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2023-12-06 08:40:51,845] [INFO] [utils.py:796:see_memory_usage] MA 3.36 GB Max_MA 4.24 GB CA 7.8 GB Max_CA 8 GB +[2023-12-06 08:40:51,845] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 19.56 GB, percent = 7.8% +[2023-12-06 08:40:51,966] [INFO] [utils.py:795:see_memory_usage] Before creating fp16 partitions +[2023-12-06 08:40:51,967] [INFO] [utils.py:796:see_memory_usage] MA 3.36 GB Max_MA 3.36 GB CA 7.8 GB Max_CA 8 GB +[2023-12-06 08:40:51,967] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 19.58 GB, percent = 7.8% +[2023-12-06 08:40:52,745] [INFO] [utils.py:795:see_memory_usage] After creating fp16 partitions: 3 +[2023-12-06 08:40:52,746] [INFO] [utils.py:796:see_memory_usage] MA 3.36 GB Max_MA 3.36 GB CA 4.85 GB Max_CA 8 GB +[2023-12-06 08:40:52,746] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 19.63 GB, percent = 7.8% +[2023-12-06 08:40:52,894] [INFO] [utils.py:795:see_memory_usage] Before creating fp32 partitions +[2023-12-06 08:40:52,895] [INFO] [utils.py:796:see_memory_usage] MA 3.36 GB Max_MA 3.36 GB CA 4.85 GB Max_CA 5 GB +[2023-12-06 08:40:52,895] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 17.67 GB, percent = 7.0% +[2023-12-06 08:40:53,026] [INFO] [utils.py:795:see_memory_usage] After creating fp32 partitions +[2023-12-06 08:40:53,027] [INFO] [utils.py:796:see_memory_usage] MA 3.9 GB Max_MA 4.05 GB CA 5.54 GB Max_CA 6 GB +[2023-12-06 08:40:53,027] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 16.79 GB, percent = 6.7% +[2023-12-06 08:40:53,186] [INFO] [utils.py:795:see_memory_usage] Before initializing optimizer states +[2023-12-06 08:40:53,186] [INFO] [utils.py:796:see_memory_usage] MA 3.9 GB Max_MA 3.9 GB CA 5.54 GB Max_CA 6 GB +[2023-12-06 08:40:53,186] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 16.74 GB, percent = 6.7% +[2023-12-06 08:40:53,314] [INFO] [utils.py:795:see_memory_usage] After initializing optimizer states +[2023-12-06 08:40:53,314] [INFO] [utils.py:796:see_memory_usage] MA 4.99 GB Max_MA 5.29 GB CA 6.93 GB Max_CA 7 GB +[2023-12-06 08:40:53,314] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 16.77 GB, percent = 6.7% +[2023-12-06 08:40:53,315] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized +[2023-12-06 08:40:53,712] [INFO] [utils.py:795:see_memory_usage] After initializing ZeRO optimizer +[2023-12-06 08:40:53,713] [INFO] [utils.py:796:see_memory_usage] MA 6.19 GB Max_MA 6.68 GB CA 8.61 GB Max_CA 9 GB +[2023-12-06 08:40:53,713] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 16.76 GB, percent = 6.7% +[2023-12-06 08:40:53,713] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2023-12-06 08:40:53,713] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2023-12-06 08:40:53,713] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2023-12-06 08:40:53,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[9.65e-06, 0.0005, 9.65e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:40:53,715] [INFO] [config.py:979:print] DeepSpeedEngine configuration: +[2023-12-06 08:40:53,715] [INFO] [config.py:983:print] activation_checkpointing_config { 
"partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, @@ -110,10 +110,10 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "synchronize_checkpoint_boundary": false, "profile": false } -[2023-12-06 04:40:32,198] [INFO] [config.py:983:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] amp_enabled .................. False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] amp_params ................... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] autotuning_config ............ { +[2023-12-06 08:40:53,715] [INFO] [config.py:983:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2023-12-06 08:40:53,715] [INFO] [config.py:983:print] amp_enabled .................. False +[2023-12-06 08:40:53,715] [INFO] [config.py:983:print] amp_params ................... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, @@ -138,31 +138,31 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] bfloat16_enabled ............. False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] checkpoint_parallel_write_pipeline False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] checkpoint_tag_validation_enabled True -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] checkpoint_tag_validation_fail False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] comms_config ................. -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] communication_data_type ...... None -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] curriculum_enabled_legacy .... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] curriculum_params_legacy ..... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] data_efficiency_config ....... 
{'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] data_efficiency_enabled ...... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] dataloader_drop_last ......... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] disable_allgather ............ False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] dump_state ................... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] eigenvalue_enabled ........... False -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] eigenvalue_gas_boundary_resolution 1 -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] eigenvalue_layer_name ........ bert.encoder.layer -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] eigenvalue_layer_num ......... 0 -[2023-12-06 04:40:32,199] [INFO] [config.py:983:print] eigenvalue_max_iter .......... 100 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] eigenvalue_stability ......... 1e-06 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] eigenvalue_tol ............... 0.01 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] eigenvalue_verbose ........... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] elasticity_enabled ........... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] flops_profiler_config ........ { +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] bfloat16_enabled ............. False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] checkpoint_parallel_write_pipeline False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] checkpoint_tag_validation_enabled True +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] checkpoint_tag_validation_fail False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] comms_config ................. +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] communication_data_type ...... None +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] curriculum_enabled_legacy .... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] curriculum_params_legacy ..... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] data_efficiency_enabled ...... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] dataloader_drop_last ......... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] disable_allgather ............ False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] dump_state ................... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_enabled ........... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_gas_boundary_resolution 1 +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_layer_name ........ bert.encoder.layer +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_layer_num ......... 0 +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_max_iter .......... 100 +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_stability ......... 1e-06 +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_tol ............... 0.01 +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] eigenvalue_verbose ........... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] elasticity_enabled ........... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] flops_profiler_config ........ { "enabled": false, "recompute_fwd_factor": 0.0, "profile_step": 1, @@ -171,23 +171,23 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "detailed": true, "output_file": null } -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] fp16_auto_cast ............... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] fp16_enabled ................. 
True -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] fp16_master_weights_and_gradients False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] global_rank .................. 0 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] grad_accum_dtype ............. None -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] gradient_accumulation_steps .. 1 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] gradient_clipping ............ 1.0 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] gradient_predivide_factor .... 1.0 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] initial_dynamic_scale ........ 65536 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] load_universal_checkpoint .... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] loss_scale ................... 0 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] memory_breakdown ............. False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] mics_hierarchial_params_gather False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] mics_shard_size .............. -1 -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step1_tensorboard/ds_tensorboard_logs/', job_name='step1_model_tensorboard') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] nebula_config ................ { +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] fp16_auto_cast ............... False +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] fp16_enabled ................. True +[2023-12-06 08:40:53,716] [INFO] [config.py:983:print] fp16_master_weights_and_gradients False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] global_rank .................. 0 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] grad_accum_dtype ............. None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] gradient_accumulation_steps .. 1 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] gradient_clipping ............ 1.0 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] gradient_predivide_factor .... 1.0 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] initial_dynamic_scale ........ 65536 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] load_universal_checkpoint .... False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] loss_scale ................... 0 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] memory_breakdown ............. False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] mics_hierarchial_params_gather False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] mics_shard_size .............. -1 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='step1_tensorboard/ds_tensorboard_logs/', job_name='step1_model_tensorboard') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, @@ -195,32 +195,32 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "enable_nebula_load": true, "load_path": null } -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] optimizer_legacy_fusion ...... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] optimizer_name ............... None -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] optimizer_params ............. None -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] pld_enabled .................. False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] pld_params ................... False -[2023-12-06 04:40:32,200] [INFO] [config.py:983:print] prescale_gradients ........... False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] scheduler_name ............... None -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] scheduler_params ............. None -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] seq_parallel_communication_data_type torch.float32 -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] sparse_attention ............. None -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] sparse_gradients_enabled ..... False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] steps_per_print .............. 10 -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] train_batch_size ............. 16 -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] train_micro_batch_size_per_gpu 4 -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] use_data_before_expert_parallel_ False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] use_node_local_storage ....... False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] wall_clock_breakdown ......... False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] weight_quantization_config ... None -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] world_size ................... 4 -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] zero_allow_untested_optimizer False -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] zero_config .................. 
stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] zero_enabled ................. True -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] zero_force_ds_cpu_optimizer .. True -[2023-12-06 04:40:32,201] [INFO] [config.py:983:print] zero_optimization_stage ...... 3 -[2023-12-06 04:40:32,201] [INFO] [config.py:969:print_user_config] json = { +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] optimizer_legacy_fusion ...... False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] optimizer_name ............... None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] optimizer_params ............. None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] pld_enabled .................. False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] pld_params ................... False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] prescale_gradients ........... False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] scheduler_name ............... None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] scheduler_params ............. None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] seq_parallel_communication_data_type torch.float32 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] sparse_attention ............. None +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] sparse_gradients_enabled ..... False +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] steps_per_print .............. 10 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] train_batch_size ............. 16 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] train_micro_batch_size_per_gpu 4 +[2023-12-06 08:40:53,717] [INFO] [config.py:983:print] use_data_before_expert_parallel_ False +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] use_node_local_storage ....... False +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] wall_clock_breakdown ......... 
False +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] weight_quantization_config ... None +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] world_size ................... 4 +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] zero_allow_untested_optimizer False +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] zero_enabled ................. True +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] zero_force_ds_cpu_optimizer .. True +[2023-12-06 08:40:53,718] [INFO] [config.py:983:print] zero_optimization_stage ...... 3 +[2023-12-06 08:40:53,718] [INFO] [config.py:969:print_user_config] json = { "train_batch_size": 16, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, @@ -259,9 +259,9 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params } } ***** Running training ***** -***** Evaluating perplexity, Epoch 0/10 ***** +***** Evaluating perplexity, Epoch 0/300 ***** ppl: 6.489692211151123, loss: 1.8702150583267212 -Beginning of Epoch 1/10, Total Micro Batches 6 +Beginning of Epoch 1/300, Total Micro Batches 6 /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. 
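A note on reading the evaluation lines in this log: every reported "ppl" is consistent with being exp("loss"), so either number can be sanity-checked from the other. A minimal check in Python, using the epoch-0 values printed above:

    import math

    # Epoch-0 evaluation values from the log above.
    loss = 1.8702150583267212
    print(math.exp(loss))  # 6.489692211151123, matching the logged "ppl"

The engine-config lines are similarly self-consistent: the printed train_batch_size of 16 equals train_micro_batch_size_per_gpu (4) x gradient_accumulation_steps (1) x world_size (4), which is the identity DeepSpeed enforces between its batch-size settings.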
@@ -270,120 +270,3394 @@ Beginning of Epoch 1/10, Total Micro Batches 6
 warnings.warn(
 /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
 warnings.warn(
-Model Parameters: 6.927 B, Latency: 2.79s, TFLOPs: 7.49, Samples/sec: 1.43, Time/seq 0.70s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.73s, TFLOPs: 7.66, Samples/sec: 1.46, Time/seq 0.68s, Batch Size: 4, Sequence Length: 512
 Invalidate trace cache @ step 0: expected module 6, but got module 0
-Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.63s, TFLOPs: 7.95, Samples/sec: 1.52, Time/seq 0.66s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.24, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
-***** Evaluating perplexity, Epoch 1/10 *****
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.50s, TFLOPs: 8.39, Samples/sec: 1.60, Time/seq 0.62s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.08, Samples/sec: 2.50, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 1/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 3.473783493041992, loss: 1.2452443838119507
-Beginning of Epoch 2/10, Total Micro Batches 6
-Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-[2023-12-06 04:40:57,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.003572573259918e-06, 0.00046650635094610973, 9.003572573259918e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-06 04:40:57,343] [INFO] [timer.py:260:stop] epoch=1/micro_step=4/global_step=10, RunningAvgSamplesPerSec=7.236305009057328, CurrSamplesPerSec=7.733712400366698, MemAllocated=6.45GB, MaxMemAllocated=8.41GB
-Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.04, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
-***** Evaluating perplexity, Epoch 2/10 *****
+ppl: 3.461667537689209, loss: 1.2417503595352173
+Beginning of Epoch 2/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:41:18,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.649265129129589e-06, 0.0004999619237890978, 9.649265129129589e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:41:18,670] [INFO] [timer.py:260:stop] epoch=1/micro_step=4/global_step=10, RunningAvgSamplesPerSec=7.313074180302858, CurrSamplesPerSec=7.549363220513197, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.12s, TFLOPs: 9.87, Samples/sec: 1.89, Time/seq 0.53s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.23, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.28, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 2/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.778276801109314, loss: 0.5756447911262512
-Beginning of Epoch 3/10, Total Micro Batches 6
-Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.11s, TFLOPs: 9.91, Samples/sec: 1.89, Time/seq 0.53s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+ppl: 1.7053332328796387, loss: 0.5337604880332947
+Beginning of Epoch 3/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.19, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.20, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.20, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 3/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.1227624416351318, loss: 0.11579209566116333
+Beginning of Epoch 4/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:41:40,555] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.647060740367139e-06, 0.0004998477067547739, 9.647060740367139e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:41:40,555] [INFO] [timer.py:260:stop] epoch=3/micro_step=2/global_step=20, RunningAvgSamplesPerSec=7.5466607101028265, CurrSamplesPerSec=6.802800396720166, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
-***** Evaluating perplexity, Epoch 3/10 *****
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.21, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 4/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.1743606328964233, loss: 0.16072379052639008
-Beginning of Epoch 4/10, Total Micro Batches 6
-Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
-[2023-12-06 04:41:19,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[7.237500000000001e-06, 0.000375, 7.237500000000001e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-06 04:41:19,316] [INFO] [timer.py:260:stop] epoch=3/micro_step=2/global_step=20, RunningAvgSamplesPerSec=7.491063054773498, CurrSamplesPerSec=6.840971579160688, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
-Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+ppl: 1.0218873023986816, loss: 0.02165115252137184
+Beginning of Epoch 5/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.19, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.39, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
-***** Evaluating perplexity, Epoch 4/10 *****
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.20, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:42:01,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.643387505190818e-06, 0.0004996573836886434, 9.643387505190818e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:42:01,296] [INFO] [timer.py:260:stop] epoch=4/micro_step=6/global_step=30, RunningAvgSamplesPerSec=7.6881372502942025, CurrSamplesPerSec=10.357282762309842, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 5/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0426948070526123, loss: 0.04180852696299553
-Beginning of Epoch 5/10, Total Micro Batches 6
-Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+ppl: 1.0088937282562256, loss: 0.008854405023157597
+Beginning of Epoch 6/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.87, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.19, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
-[2023-12-06 04:41:40,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[4.825e-06, 0.00025, 4.825e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-06 04:41:40,116] [INFO] [timer.py:260:stop] epoch=4/micro_step=6/global_step=30, RunningAvgSamplesPerSec=7.642425087608659, CurrSamplesPerSec=9.97939015094399, MemAllocated=6.26GB, MaxMemAllocated=8.42GB
-Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.04, Samples/sec: 2.49, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
-***** Evaluating perplexity, Epoch 5/10 *****
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 6/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0130127668380737, loss: 0.012928837910294533
-Beginning of Epoch 6/10, Total Micro Batches 6
-Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+ppl: 1.005681037902832, loss: 0.00566498190164566
+Beginning of Epoch 7/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:42:23,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.638246542503653e-06, 0.0004993910125649561, 9.638246542503653e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:42:23,690] [INFO] [timer.py:260:stop] epoch=6/micro_step=4/global_step=40, RunningAvgSamplesPerSec=7.654353973240889, CurrSamplesPerSec=7.775919039157762, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.22, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.62, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 7/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004724383354187, loss: 0.004713195841759443
+Beginning of Epoch 8/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 8/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0049450397491455, loss: 0.004932814743369818
+Beginning of Epoch 9/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.17, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:42:45,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.631639418292673e-06, 0.0004990486745229364, 9.631639418292673e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:42:45,496] [INFO] [timer.py:260:stop] epoch=8/micro_step=2/global_step=50, RunningAvgSamplesPerSec=7.6794833477470705, CurrSamplesPerSec=6.961140082064217, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.20, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.86, Samples/sec: 2.65, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 9/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046290159225464, loss: 0.004618301521986723
+Beginning of Epoch 10/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:43:06,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.62356814515192e-06, 0.0004986304738420684, 9.62356814515192e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:43:06,092] [INFO] [timer.py:260:stop] epoch=9/micro_step=6/global_step=60, RunningAvgSamplesPerSec=7.735846348248568, CurrSamplesPerSec=10.569243814580418, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.81, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 10/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0050643682479858, loss: 0.005051491782069206
+Beginning of Epoch 11/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.75, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 11/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045534372329712, loss: 0.004543079063296318
+Beginning of Epoch 12/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.18, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.45s, TFLOPs: 8.53, Samples/sec: 1.63, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:43:28,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.61403518166938e-06, 0.0004981365379103306, 9.61403518166938e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:43:28,440] [INFO] [timer.py:260:stop] epoch=11/micro_step=4/global_step=70, RunningAvgSamplesPerSec=7.711586917836205, CurrSamplesPerSec=7.807169990829241, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.05s, TFLOPs: 10.21, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 12/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046911239624023, loss: 0.004680057987570763
+Beginning of Epoch 13/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.27s, TFLOPs: 9.20, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.10, Samples/sec: 2.50, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 13/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045222043991089, loss: 0.004511988256126642
+Beginning of Epoch 14/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:43:50,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.603043431678079e-06, 0.0004975670171853926, 9.603043431678079e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:43:50,300] [INFO] [timer.py:260:stop] epoch=13/micro_step=2/global_step=80, RunningAvgSamplesPerSec=7.71746344203027, CurrSamplesPerSec=6.824883799072671, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.38, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 14/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004562497138977, loss: 0.004552107769995928
+Beginning of Epoch 15/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.45s, TFLOPs: 8.56, Samples/sec: 1.64, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:44:11,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.590596243371541e-06, 0.0004969220851487844, 9.590596243371541e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:44:11,167] [INFO] [timer.py:260:stop] epoch=14/micro_step=6/global_step=90, RunningAvgSamplesPerSec=7.738349896525623, CurrSamplesPerSec=10.374236952757853, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 15/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004675269126892, loss: 0.004664290230721235
+Beginning of Epoch 16/300, Total Micro Batches
6 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.41s, TFLOPs: 8.69, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.31, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 -***** Evaluating perplexity, Epoch 6/10 ***** +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.14, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 16/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045578479766846, loss: 0.0045474860817193985 +Beginning of Epoch 17/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:44:33,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.576697408283905e-06, 0.000496201938253052, 9.576697408283905e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:44:33,735] [INFO] [timer.py:260:stop] epoch=16/micro_step=4/global_step=100, RunningAvgSamplesPerSec=7.712840377583818, CurrSamplesPerSec=7.771893665353572, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 17/300 ***** Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.006258487701416, loss: 0.0062390221282839775 -Beginning of Epoch 7/10, Total Micro Batches 6 +ppl: 1.0045911073684692, loss: 0.0045806230045855045 +Beginning of Epoch 18/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.87, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.34, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 18/300 ***** +Invalidate trace cache 
@ step 0: expected module 0, but got module 6 +ppl: 1.00454843044281, loss: 0.004538155160844326 +Beginning of Epoch 19/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:44:55,710] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.56135116013498e-06, 0.000495406795861916, 9.56135116013498e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:44:55,710] [INFO] [timer.py:260:stop] epoch=18/micro_step=2/global_step=110, RunningAvgSamplesPerSec=7.712778325518424, CurrSamplesPerSec=6.801926789108827, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.47, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 19/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045993328094482, loss: 0.004588749259710312 +Beginning of Epoch 20/300, Total Micro Batches 6 Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:45:16,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.544562173540613e-06, 0.0004945369001834514, 9.544562173540613e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:45:16,505] [INFO] [timer.py:260:stop] epoch=19/micro_step=6/global_step=120, RunningAvgSamplesPerSec=7.7310410822149755, CurrSamplesPerSec=10.292071259575462, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 20/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004582405090332, loss: 0.004571957513689995 +Beginning of Epoch 21/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, 
TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 21/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045524835586548, loss: 0.004542169161140919 +Beginning of Epoch 22/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:45:38,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.526335562588761e-06, 0.0004935925161963088, 9.526335562588761e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:45:38,965] [INFO] [timer.py:260:stop] epoch=21/micro_step=4/global_step=130, RunningAvgSamplesPerSec=7.715671437100243, CurrSamplesPerSec=7.7525401810989845, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 22/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045561790466309, loss: 0.0045457929372787476 +Beginning of Epoch 23/300, Total Micro Batches 6 Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -[2023-12-06 04:42:02,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[2.4125000000000015e-06, 0.00012500000000000006, 2.4125000000000015e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-06 04:42:02,526] [INFO] [timer.py:260:stop] epoch=6/micro_step=4/global_step=40, RunningAvgSamplesPerSec=7.620713816433709, CurrSamplesPerSec=7.736858876659631, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 23/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004577398300171, loss: 0.004566950257867575 +Beginning of Epoch 24/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:46:00,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.506676879281683e-06, 0.0004925739315689991, 
9.506676879281683e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:46:00,878] [INFO] [timer.py:260:stop] epoch=23/micro_step=2/global_step=140, RunningAvgSamplesPerSec=7.716823881247703, CurrSamplesPerSec=6.829584580000407, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.46, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 24/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004574179649353, loss: 0.004563763737678528 +Beginning of Epoch 25/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:46:21,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.485592111844755e-06, 0.0004914814565722671, 9.485592111844755e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:46:21,702] [INFO] [timer.py:260:stop] epoch=24/micro_step=6/global_step=150, RunningAvgSamplesPerSec=7.7303952923462775, CurrSamplesPerSec=10.190762891206791, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 25/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045571327209473, loss: 0.004546836018562317 +Beginning of Epoch 26/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 26/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045771598815918, loss: 0.004566721618175507 +Beginning of Epoch 
27/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:46:44,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.46308768290239e-06, 0.0004903154239845797, 9.46308768290239e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:46:44,010] [INFO] [timer.py:260:stop] epoch=26/micro_step=4/global_step=160, RunningAvgSamplesPerSec=7.721293729470686, CurrSamplesPerSec=7.756451394046652, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 27/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004559874534607, loss: 0.004549544304609299 +Beginning of Epoch 28/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.02, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 28/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045775175094604, loss: 0.004567084368318319 +Beginning of Epoch 29/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:47:05,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[9.439170447521647e-06, 0.0004890761889907589, 9.439170447521647e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:47:05,933] [INFO] [timer.py:260:stop] epoch=28/micro_step=2/global_step=170, RunningAvgSamplesPerSec=7.721824898114301, CurrSamplesPerSec=6.827850205154377, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, 
Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 29/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045764446258545, loss: 0.004565949086099863 +Beginning of Epoch 30/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:47:26,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[9.413847691124117e-06, 0.0004877641290737884, 9.413847691124117e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:47:26,789] [INFO] [timer.py:260:stop] epoch=29/micro_step=6/global_step=180, RunningAvgSamplesPerSec=7.732138400045645, CurrSamplesPerSec=9.939005964096935, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.61s, TFLOPs: 12.99, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 30/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045665502548218, loss: 0.0045561408624053 +Beginning of Epoch 31/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.61s, TFLOPs: 13.01, Samples/sec: 2.49, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 31/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045784711837769, loss: 0.004568074829876423 +Beginning of Epoch 32/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:47:49,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[9.387127127266705e-06, 0.00048637964389982923, 9.387127127266705e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:47:49,334] [INFO] [timer.py:260:stop] epoch=31/micro_step=4/global_step=190, RunningAvgSamplesPerSec=7.719527014140443, CurrSamplesPerSec=7.775375775639015, MemAllocated=6.45GB, 
MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 32/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045814514160156, loss: 0.004570980090647936 +Beginning of Epoch 33/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.22, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 -***** Evaluating perplexity, Epoch 7/10 ***** +***** Evaluating perplexity, Epoch 33/300 ***** Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0054712295532227, loss: 0.005456325598061085 -Beginning of Epoch 8/10, Total Micro Batches 6 +ppl: 1.004576563835144, loss: 0.004566078074276447 +Beginning of Epoch 34/300, Total Micro Batches 6 Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:48:11,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[9.35901689529201e-06, 0.0004849231551964771, 9.35901689529201e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:48:11,270] [INFO] [timer.py:260:stop] epoch=33/micro_step=2/global_step=200, RunningAvgSamplesPerSec=7.719885340711891, CurrSamplesPerSec=6.823927488416617, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 34/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046138763427734, loss: 0.0046032196842134 +Beginning of 
Epoch 35/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:48:32,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[9.329525557848999e-06, 0.00048339510662430044, 9.329525557848999e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:48:32,009] [INFO] [timer.py:260:stop] epoch=34/micro_step=6/global_step=210, RunningAvgSamplesPerSec=7.730956536671431, CurrSamplesPerSec=10.260605442539866, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 35/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046511888504028, loss: 0.004640465602278709 +Beginning of Epoch 36/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 36/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045183897018433, loss: 0.004508160054683685 +Beginning of Epoch 37/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:48:54,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[9.29866209828475e-06, 0.00048179596364169685, 9.29866209828475e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:48:54,346] [INFO] [timer.py:260:stop] epoch=36/micro_step=4/global_step=220, RunningAvgSamplesPerSec=7.723880800310185, CurrSamplesPerSec=7.756035444214196, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 
6.927 B, Latency: 1.59s, TFLOPs: 13.14, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 -***** Evaluating perplexity, Epoch 8/10 ***** +Model Parameters: 6.927 B, Latency: 1.70s, TFLOPs: 12.33, Samples/sec: 2.36, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 37/300 ***** Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.005515694618225, loss: 0.0055005596950650215 -Beginning of Epoch 9/10, Total Micro Batches 6 +ppl: 1.0045980215072632, loss: 0.0045874640345573425 +Beginning of Epoch 38/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.44s, TFLOPs: 8.59, Samples/sec: 1.64, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.47, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 38/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046025514602661, loss: 0.004591932520270348 +Beginning of Epoch 39/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:49:16,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[9.266435917908027e-06, 0.00048012621336311016, 9.266435917908027e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:49:16,455] [INFO] [timer.py:260:stop] epoch=38/micro_step=2/global_step=230, RunningAvgSamplesPerSec=7.721206555476635, CurrSamplesPerSec=6.922949496687584, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 39/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046082735061646, loss: 0.0045977248810231686 +Beginning of Epoch 40/300, Total Micro Batches 6 Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 -[2023-12-06 04:42:24,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.464274267400833e-07, 3.3493649053890325e-05, 6.464274267400833e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-06 04:42:24,503] [INFO] [timer.py:260:stop] epoch=8/micro_step=2/global_step=50, RunningAvgSamplesPerSec=7.638560733715662, CurrSamplesPerSec=6.81366785411214, 
MemAllocated=6.45GB, MaxMemAllocated=8.42GB -Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:49:37,201] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[9.23285683312555e-06, 0.0004783863644106502, 9.23285683312555e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:49:37,202] [INFO] [timer.py:260:stop] epoch=39/micro_step=6/global_step=240, RunningAvgSamplesPerSec=7.730687607823271, CurrSamplesPerSec=10.27432718935212, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 40/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045539140701294, loss: 0.004543534945696592 +Beginning of Epoch 41/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 41/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046100616455078, loss: 0.004599445033818483 +Beginning of Epoch 42/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.19, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 -***** Evaluating perplexity, Epoch 9/10 ***** +[2023-12-06 08:49:59,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[9.197935072451837e-06, 0.00047657694675916254, 9.197935072451837e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:49:59,595] [INFO] [timer.py:260:stop] epoch=41/micro_step=4/global_step=250, RunningAvgSamplesPerSec=7.723593951538276, CurrSamplesPerSec=7.7597061275196495, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch 
Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 42/300 ***** Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0054373741149902, loss: 0.005422617308795452 -Beginning of Epoch 10/10, Total Micro Batches 6 -Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +ppl: 1.0045968294143677, loss: 0.004586254246532917 +Beginning of Epoch 43/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.40s, TFLOPs: 8.72, Samples/sec: 1.67, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 43/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004574179649353, loss: 0.004563635215163231 +Beginning of Epoch 44/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:50:21,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[9.161681273393482e-06, 0.00047469851157479177, 9.161681273393482e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:50:21,544] [INFO] [timer.py:260:stop] epoch=43/micro_step=2/global_step=260, RunningAvgSamplesPerSec=7.7233416967818265, CurrSamplesPerSec=6.893272746284852, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.66, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 44/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046074390411377, loss: 0.0045968955382704735 +Beginning of Epoch 45/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 
512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:50:42,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[9.124106479208876e-06, 0.00047275163104709196, 9.124106479208876e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:50:42,254] [INFO] [timer.py:260:stop] epoch=44/micro_step=6/global_step=270, RunningAvgSamplesPerSec=7.732285490944579, CurrSamplesPerSec=10.52392237350693, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.76, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 45/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004591703414917, loss: 0.004581090062856674 +Beginning of Epoch 46/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.13, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 46/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045998096466064, loss: 0.004589286167174578 +Beginning of Epoch 47/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:51:04,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[9.085222135544324e-06, 0.00047073689821473173, 9.085222135544324e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:51:04,586] [INFO] [timer.py:260:stop] epoch=46/micro_step=4/global_step=280, RunningAvgSamplesPerSec=7.726744022013679, CurrSamplesPerSec=7.739571411239187, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.65s, TFLOPs: 12.71, Samples/sec: 2.43, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 47/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045881271362305, loss: 0.004577603656798601 +Beginning of Epoch 48/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 
0.59s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 48/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045946836471558, loss: 0.004584057256579399 +Beginning of Epoch 49/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:51:26,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[9.045040086947585e-06, 0.00046865492678484894, 9.045040086947585e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:51:26,660] [INFO] [timer.py:260:stop] epoch=48/micro_step=2/global_step=290, RunningAvgSamplesPerSec=7.725687137109765, CurrSamplesPerSec=6.773815142675455, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.86, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 49/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046005249023438, loss: 0.004589908290654421 +Beginning of Epoch 50/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.48s, TFLOPs: 8.44, Samples/sec: 1.61, Time/seq 0.62s, Batch Size: 4, Sequence Length: 512 Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 -[2023-12-06 04:42:45,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-06 04:42:45,284] [INFO] [timer.py:260:stop] epoch=9/micro_step=6/global_step=60, RunningAvgSamplesPerSec=7.688756169911607, CurrSamplesPerSec=10.190957881216764, MemAllocated=6.26GB, MaxMemAllocated=8.42GB -Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 -***** Evaluating perplexity, Epoch 10/10 ***** +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 
08:51:47,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[9.003572573259918e-06, 0.00046650635094610973, 9.003572573259918e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:51:47,670] [INFO] [timer.py:260:stop] epoch=49/micro_step=6/global_step=300, RunningAvgSamplesPerSec=7.729938503705482, CurrSamplesPerSec=9.697741244970594, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.65s, TFLOPs: 12.68, Samples/sec: 2.42, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 50/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045802593231201, loss: 0.004569769371300936 +Beginning of Epoch 51/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.60, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 51/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045608282089233, loss: 0.004550450015813112 +Beginning of Epoch 52/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:52:09,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[8.960832225887693e-06, 0.0004642918251755281, 8.960832225887693e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:52:09,983] [INFO] [timer.py:260:stop] epoch=51/micro_step=4/global_step=310, RunningAvgSamplesPerSec=7.725273020765501, CurrSamplesPerSec=7.768393090354041, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 52/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045989751815796, loss: 0.004588347859680653 +Beginning of Epoch 53/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, 
Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.47, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 53/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046355724334717, loss: 0.004624801222234964 +Beginning of Epoch 54/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:52:31,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[8.916832063954755e-06, 0.00046201202403910646, 8.916832063954755e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:52:31,796] [INFO] [timer.py:260:stop] epoch=53/micro_step=2/global_step=320, RunningAvgSamplesPerSec=7.726893844353771, CurrSamplesPerSec=6.886313868538238, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 54/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004626750946045, loss: 0.0046160537749528885 +Beginning of Epoch 55/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 08:52:52,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[8.871585490336672e-06, 0.00045966764198635603, 8.871585490336672e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 08:52:52,578] [INFO] [timer.py:260:stop] epoch=54/micro_step=6/global_step=330, RunningAvgSamplesPerSec=7.733257366057002, CurrSamplesPerSec=10.142689854841157, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.26, Samples/sec: 2.53, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 55/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045695304870605, loss: 0.004559140652418137 +Beginning of Epoch 56/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.79, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 56/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046120882034302, loss: 0.0046015214174985886
+Beginning of Epoch 57/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:53:14,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[8.825106287578076e-06, 0.00045725939313876043, 8.825106287578076e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:53:14,904] [INFO] [timer.py:260:stop] epoch=56/micro_step=4/global_step=340, RunningAvgSamplesPerSec=7.728807316699491, CurrSamplesPerSec=7.751265969137173, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.78, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 57/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046089887619019, loss: 0.004598350264132023
+Beginning of Epoch 58/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.78, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 58/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046242475509644, loss: 0.004613569937646389
+Beginning of Epoch 59/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:53:36,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[8.777408613694386e-06, 0.00045478801107224796, 8.777408613694386e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:53:36,727] [INFO] [timer.py:260:stop] epoch=58/micro_step=2/global_step=350, RunningAvgSamplesPerSec=7.729995506776423, CurrSamplesPerSec=6.735520894118148, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.81, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.25, Samples/sec: 2.53, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 59/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045959949493408, loss: 0.004585481248795986
+Beginning of Epoch 60/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:53:57,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[8.728506997859123e-06, 0.0004522542485937369, 8.728506997859123e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:53:57,504] [INFO] [timer.py:260:stop] epoch=59/micro_step=6/global_step=360, RunningAvgSamplesPerSec=7.7357617666285, CurrSamplesPerSec=10.30205978673101, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.47, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 60/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046159029006958, loss: 0.004605256021022797
+Beginning of Epoch 61/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.43s, TFLOPs: 8.62, Samples/sec: 1.65, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.80, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 61/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004603624343872, loss: 0.004593097139149904
+Beginning of Epoch 62/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:54:19,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[8.678416335978189e-06, 0.0004496588775118232, 8.678416335978189e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:54:19,920] [INFO] [timer.py:260:stop] epoch=61/micro_step=4/global_step=370, RunningAvgSamplesPerSec=7.730564259591224, CurrSamplesPerSec=7.726621049164178, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.76, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 62/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004612922668457, loss: 0.00460231164470315
+Beginning of Epoch 63/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.03, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.77, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 63/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046029090881348, loss: 0.004592340439558029
+Beginning of Epoch 64/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.20, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:54:41,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[8.627151886152434e-06, 0.00044700268840168044, 8.627151886152434e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:54:41,744] [INFO] [timer.py:260:stop] epoch=63/micro_step=2/global_step=380, RunningAvgSamplesPerSec=7.73153014756928, CurrSamplesPerSec=6.60439125150855, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.42s, TFLOPs: 8.64, Samples/sec: 1.65, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.27, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 64/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046172142028809, loss: 0.0046065268106758595
+Beginning of Epoch 65/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:55:02,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[8.574729264029886e-06, 0.0004442864903642427, 8.574729264029886e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:55:02,639] [INFO] [timer.py:260:stop] epoch=64/micro_step=6/global_step=390, RunningAvgSamplesPerSec=7.7356853070608365, CurrSamplesPerSec=10.2514346293855, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 65/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004612922668457, loss: 0.0046022627502679825
+Beginning of Epoch 66/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 66/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004618763923645, loss: 0.004608134739100933
+Beginning of Epoch 67/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:55:25,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[8.52116443804907e-06, 0.0004415111107797445, 8.52116443804907e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:55:25,125] [INFO] [timer.py:260:stop] epoch=66/micro_step=4/global_step=400, RunningAvgSamplesPerSec=7.730168118722821, CurrSamplesPerSec=7.741511504085444, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
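The [timer.py] entries report CurrSamplesPerSec for the step just timed alongside RunningAvgSamplesPerSec across the steps seen so far. A simplified sketch of that bookkeeping (assumed behavior for illustration, not DeepSpeed's exact implementation):

# Fold successive per-step throughput readings into a running mean, the way
# the RunningAvgSamplesPerSec figure evolves slowly while CurrSamplesPerSec
# swings between ~6.6 and ~10.5 depending on the micro-step.
class ThroughputTracker:
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, curr_samples_per_sec):
        self.total += curr_samples_per_sec
        self.count += 1
        return self.total / self.count  # running average so far

tracker = ThroughputTracker()
for curr in (6.886313868538238, 10.142689854841157):  # CurrSamplesPerSec values above
    running = tracker.update(curr)
print(running)  # mean of the readings folded in so far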
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 67/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046539306640625, loss: 0.004643190652132034
+Beginning of Epoch 68/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 68/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.00455641746521, loss: 0.004545976873487234
+Beginning of Epoch 69/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:55:47,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[8.466473724574875e-06, 0.000438677395055693, 8.466473724574875e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:55:47,005] [INFO] [timer.py:260:stop] epoch=68/micro_step=2/global_step=410, RunningAvgSamplesPerSec=7.73049565471172, CurrSamplesPerSec=6.874488924673851, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 69/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004663348197937, loss: 0.004652484320104122
+Beginning of Epoch 70/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.75, Samples/sec: 1.67, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:56:07,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[8.410673782928428e-06, 0.00043578620636934855, 8.410673782928428e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:56:07,889] [INFO] [timer.py:260:stop] epoch=69/micro_step=6/global_step=420, RunningAvgSamplesPerSec=7.734440614487563, CurrSamplesPerSec=10.324140421147755, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 70/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045949220657349, loss: 0.004584323149174452
+Beginning of Epoch 71/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.10, Samples/sec: 2.50, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 71/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046530961990356, loss: 0.0046423012390732765
+Beginning of Epoch 72/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:56:30,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[8.353781610312498e-06, 0.0004328384254047926, 8.353781610312498e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:56:30,369] [INFO] [timer.py:260:stop] epoch=71/micro_step=4/global_step=430, RunningAvgSamplesPerSec=7.729658300561802, CurrSamplesPerSec=7.72086334312254, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 72/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045661926269531, loss: 0.004555823281407356
+Beginning of Epoch 73/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.65, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 73/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004664421081543, loss: 0.004653565119951963
+Beginning of Epoch 74/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:56:52,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[8.295814536633993e-06, 0.0004298349500846628, 8.295814536633993e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:56:52,212] [INFO] [timer.py:260:stop] epoch=73/micro_step=2/global_step=440, RunningAvgSamplesPerSec=7.730302019046364, CurrSamplesPerSec=6.80046553222321, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.66, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 74/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045828819274902, loss: 0.004572412930428982
+Beginning of Epoch 75/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:57:12,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[8.236790219225093e-06, 0.00042677669529663686, 8.236790219225093e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:57:12,951] [INFO] [timer.py:260:stop] epoch=74/micro_step=6/global_step=450, RunningAvgSamplesPerSec=7.735244694973908, CurrSamplesPerSec=10.270756162978891, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 75/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046590566635132, loss: 0.004648192785680294
+Beginning of Epoch 76/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.45s, TFLOPs: 8.55, Samples/sec: 1.63, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.65, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 76/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045865774154663, loss: 0.004576060455292463
+Beginning of Epoch 77/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:57:35,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[8.176726637464662e-06, 0.00042366459261474935, 8.176726637464662e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:57:35,409] [INFO] [timer.py:260:stop] epoch=76/micro_step=4/global_step=460, RunningAvgSamplesPerSec=7.7307404106406645, CurrSamplesPerSec=7.772841549257438, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 77/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046485662460327, loss: 0.004637811332941055
+Beginning of Epoch 78/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 78/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045924186706543, loss: 0.004581847228109837
+Beginning of Epoch 79/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:57:57,263] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[8.115642087301557e-06, 0.00042049959001562464, 8.115642087301557e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:57:57,263] [INFO] [timer.py:260:stop] epoch=78/micro_step=2/global_step=470, RunningAvgSamplesPerSec=7.7312689081524, CurrSamplesPerSec=6.854719668271303, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.66, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 79/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046530961990356, loss: 0.004642223007977009
+Beginning of Epoch 80/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:58:17,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[8.053555175681491e-06, 0.0004172826515897146, 8.053555175681491e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:58:17,935] [INFO] [timer.py:260:stop] epoch=79/micro_step=6/global_step=480, RunningAvgSamplesPerSec=7.736359684613006, CurrSamplesPerSec=10.532969096646454, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.77, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 80/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045965909957886, loss: 0.00458594411611557
+Beginning of Epoch 81/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.86, Samples/sec: 2.65, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
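The lr=[...] triples in the [logging.py] entries track three optimizer parameter groups under the cosine schedule: the first and third decay from the configured 9.65e-6, while the middle group (presumably the LoRA parameters; its 5e-4 base is inferred from the logged values, not printed in the log) decays proportionally. With zero warmup and 1800 total updates (300 epochs x 6 steps per epoch), the logged values can be reproduced with a standard cosine decay (a sketch under those inferred assumptions):

import math

def cosine_lr(base_lr, step, total_steps=1800):
    # Cosine decay with zero warmup; 1800 = 300 epochs * 6 steps/epoch (inferred).
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * step / total_steps))

print(cosine_lr(9.65e-06, 480))  # ~8.0536e-06, matching the step=480 entry above
print(cosine_lr(5.0e-04, 480))   # ~4.1728e-04, matching the middle (LoRA) group

Steps skipped by the loss scaler do not appear to advance the schedule: later entries such as step=610 with skipped=4 match an effective step of 606.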
+***** Evaluating perplexity, Epoch 81/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046485662460327, loss: 0.004637773614376783
+Beginning of Epoch 82/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.18, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:58:40,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[7.990484814879197e-06, 0.00041401475724762684, 7.990484814879197e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:58:40,140] [INFO] [timer.py:260:stop] epoch=81/micro_step=4/global_step=490, RunningAvgSamplesPerSec=7.734012262310286, CurrSamplesPerSec=7.757737176223722, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.76, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 82/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045998096466064, loss: 0.00458930479362607
+Beginning of Epoch 83/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.18, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 83/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046367645263672, loss: 0.004625976085662842
+Beginning of Epoch 84/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:59:01,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[7.926450216737553e-06, 0.0004106969024216348, 7.926450216737553e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:59:01,904] [INFO] [timer.py:260:stop] epoch=83/micro_step=2/global_step=500, RunningAvgSamplesPerSec=7.735160075046996, CurrSamplesPerSec=6.924852572232615, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:59:06,011] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.02s, TFLOPs: 10.35, Samples/sec: 1.98, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:59:09,612] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 84/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046782493591309, loss: 0.004667269065976143
+Beginning of Epoch 85/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:59:22,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=2, lr=[7.874541385357906e-06, 0.00040800732566621275, 7.874541385357906e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:59:22,641] [INFO] [timer.py:260:stop] epoch=84/micro_step=6/global_step=510, RunningAvgSamplesPerSec=7.739838648685208, CurrSamplesPerSec=10.381235556557963, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 85/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045396089553833, loss: 0.004529269877821207
+Beginning of Epoch 86/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.38, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 86/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046840906143188, loss: 0.004673165734857321
+Beginning of Epoch 87/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
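The two loss_scaler.py messages above show fp16 dynamic loss scaling with hysteresis: the first overflow burns the hysteresis budget and keeps the scale at 2097152, the second halves it to 1048576, and both skipped updates surface as skipped=2 in the step=510 entry. A simplified sketch of that logic (illustrative only, not DeepSpeed's exact code):

class LossScaler:
    def __init__(self, scale=2097152, hysteresis=2, min_scale=1):
        self.scale = scale
        self.hysteresis = hysteresis
        self.min_scale = min_scale

    def on_overflow(self):
        # The step is skipped either way; the scale drops only once
        # the hysteresis budget has been used up.
        if self.hysteresis > 1:
            self.hysteresis -= 1
        else:
            self.scale = max(self.scale // 2, self.min_scale)

scaler = LossScaler()
scaler.on_overflow()  # "...but hysteresis is 2. Reducing hysteresis to 1"
scaler.on_overflow()  # "Attempted loss scale: 2097152, reducing to 1048576"
print(scaler.scale)   # 1048576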
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 08:59:45,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=2, lr=[7.8088205076002e-06, 0.00040460209883938855, 7.8088205076002e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 08:59:45,044] [INFO] [timer.py:260:stop] epoch=86/micro_step=4/global_step=520, RunningAvgSamplesPerSec=7.7362152561994995, CurrSamplesPerSec=7.719307383169164, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 87/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045883655548096, loss: 0.004577841609716415
+Beginning of Epoch 88/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.68s, TFLOPs: 12.49, Samples/sec: 2.39, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 88/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.00471031665802, loss: 0.004699287936091423
+Beginning of Epoch 89/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.86, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:00:07,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=2, lr=[7.74219072921096e-06, 0.00040114977871559375, 7.74219072921096e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:00:07,167] [INFO] [timer.py:260:stop] epoch=88/micro_step=2/global_step=530, RunningAvgSamplesPerSec=7.7347723500544605, CurrSamplesPerSec=6.751126489440309, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.15s, TFLOPs: 9.73, Samples/sec: 1.86, Time/seq 0.54s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.27, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 89/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045890808105469, loss: 0.004578555002808571
+Beginning of Epoch 90/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:00:28,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=2, lr=[7.67467234626614e-06, 0.00039765141690498135, 7.67467234626614e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:00:28,045] [INFO] [timer.py:260:stop] epoch=89/micro_step=6/global_step=540, RunningAvgSamplesPerSec=7.737836864549019, CurrSamplesPerSec=10.248986003658603, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 90/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046651363372803, loss: 0.004654349759221077
+Beginning of Epoch 91/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.17, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.73, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 91/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045976638793945, loss: 0.004587167873978615
+Beginning of Epoch 92/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:00:50,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=2, lr=[7.606285925519253e-06, 0.0003941080790424483, 7.606285925519253e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:00:50,315] [INFO] [timer.py:260:stop] epoch=91/micro_step=4/global_step=550, RunningAvgSamplesPerSec=7.735301053459147, CurrSamplesPerSec=7.746646324113118, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.69, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 92/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046519041061401, loss: 0.004641080275177956
+Beginning of Epoch 93/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.27s, TFLOPs: 9.20, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.77, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 93/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046228170394897, loss: 0.004612160846590996
+Beginning of Epoch 94/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:01:12,088] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=2, lr=[7.5370522981365305e-06, 0.00039052084446303264, 7.5370522981365305e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:01:12,088] [INFO] [timer.py:260:stop] epoch=93/micro_step=2/global_step=560, RunningAvgSamplesPerSec=7.736211548800385, CurrSamplesPerSec=6.899197228677871, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.68, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 94/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004636526107788, loss: 0.004625811707228422
+Beginning of Epoch 95/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:01:32,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=2, lr=[7.466992553351555e-06, 0.00038689080587313757, 7.466992553351555e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:01:32,794] [INFO] [timer.py:260:stop] epoch=94/micro_step=6/global_step=570, RunningAvgSamplesPerSec=7.74018098619295, CurrSamplesPerSec=10.368972961413974, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 95/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046260356903076, loss: 0.004615375772118568
+Beginning of Epoch 96/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 96/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046312808990479, loss: 0.004620506428182125
+Beginning of Epoch 97/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:01:55,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=2, lr=[7.396128032041273e-06, 0.0003832190690176825, 7.396128032041273e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:01:55,186] [INFO] [timer.py:260:stop] epoch=96/micro_step=4/global_step=580, RunningAvgSamplesPerSec=7.737026556461911, CurrSamplesPerSec=7.703175584870436, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 97/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046290159225464, loss: 0.004618259612470865
+Beginning of Epoch 98/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.41s, TFLOPs: 8.68, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
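The per-step throughput entries are internally consistent: Samples/sec is Batch Size / Latency and Time/seq is Latency / Batch Size, both per device. A quick check against a typical entry:

# Rederive the per-device throughput figures from one "Model Parameters" line.
batch_size, latency = 4, 2.07
print(round(batch_size / latency, 2))  # 1.93 -> "Samples/sec: 1.93"
print(round(latency / batch_size, 2))  # 0.52 -> "Time/seq 0.52s"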
Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.63, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 98/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046305656433105, loss: 0.004619892220944166 +Beginning of Epoch 99/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:02:17,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=2, lr=[7.3244803202253545e-06, 0.00037950675234328256, 7.3244803202253545e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:02:17,123] [INFO] [timer.py:260:stop] epoch=98/micro_step=2/global_step=590, RunningAvgSamplesPerSec=7.736792950442204, CurrSamplesPerSec=6.885870131789023, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 99/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004628300666809, loss: 0.004617627710103989 +Beginning of Epoch 100/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:02:37,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=2, lr=[7.252071242490885e-06, 0.00037575498665755883, 7.252071242490885e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:02:37,818] [INFO] [timer.py:260:stop] epoch=99/micro_step=6/global_step=600, RunningAvgSamplesPerSec=7.740748930731568, CurrSamplesPerSec=10.573253674153621, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.82, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 100/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046334266662598, loss: 0.00462276441976428 +Beginning of Epoch 101/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.20, Samples/sec: 1.76, 
Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:02:50,674] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1 +Model Parameters: 6.927 B, Latency: 1.47s, TFLOPs: 14.22, Samples/sec: 2.72, Time/seq 0.37s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 101/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046533346176147, loss: 0.004642527550458908 +Beginning of Epoch 102/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.18, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:02:55,943] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576 +Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.77, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:03:00,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=4, lr=[7.193610611193055e-06, 0.0003727259384037852, 7.193610611193055e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:03:00,097] [INFO] [timer.py:260:stop] epoch=101/micro_step=4/global_step=610, RunningAvgSamplesPerSec=7.738371958234215, CurrSamplesPerSec=7.712091744072357, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.60, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 102/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0045593976974487, loss: 0.004549030214548111 +Beginning of Epoch 103/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.43s, TFLOPs: 8.61, Samples/sec: 1.64, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 103/300 ***** +Invalidate trace 
cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046483278274536, loss: 0.004637556150555611 +Beginning of Epoch 104/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:03:22,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=4, lr=[7.119886808764079e-06, 0.00036890605226756886, 7.119886808764079e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:03:22,040] [INFO] [timer.py:260:stop] epoch=103/micro_step=2/global_step=620, RunningAvgSamplesPerSec=7.7380804022560135, CurrSamplesPerSec=6.865503722790595, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 104/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004673957824707, loss: 0.004663079511374235 +Beginning of Epoch 105/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.17, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:03:42,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=4, lr=[7.045463961582085e-06, 0.00036504994619596294, 7.045463961582085e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:03:42,785] [INFO] [timer.py:260:stop] epoch=104/micro_step=6/global_step=630, RunningAvgSamplesPerSec=7.741420023872607, CurrSamplesPerSec=10.411168855386556, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 105/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046485662460327, loss: 0.00463775172829628 +Beginning of Epoch 106/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 
+Beginning of Epoch 105/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.17, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:03:42,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=4, lr=[7.045463961582085e-06, 0.00036504994619596294, 7.045463961582085e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:03:42,785] [INFO] [timer.py:260:stop] epoch=104/micro_step=6/global_step=630, RunningAvgSamplesPerSec=7.741420023872607, CurrSamplesPerSec=10.411168855386556, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 105/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046485662460327, loss: 0.00463775172829628
+Beginning of Epoch 106/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.35, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 106/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045989751815796, loss: 0.004588374402374029
+Beginning of Epoch 107/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.01, Samples/sec: 1.91, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:04:05,215] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=4, lr=[6.970364739567275e-06, 0.00036115879479623185, 6.970364739567275e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:04:05,215] [INFO] [timer.py:260:stop] epoch=106/micro_step=4/global_step=640, RunningAvgSamplesPerSec=7.738269453247794, CurrSamplesPerSec=7.745550157470127, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 107/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046361684799194, loss: 0.004625363275408745
+Beginning of Epoch 108/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.65, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 108/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046449899673462, loss: 0.004634275566786528
+Beginning of Epoch 109/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:04:27,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=4, lr=[6.8946120186701795e-06, 0.0003572337833507865, 6.8946120186701795e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:04:27,049] [INFO] [timer.py:260:stop] epoch=108/micro_step=2/global_step=650, RunningAvgSamplesPerSec=7.7386411825827945, CurrSamplesPerSec=6.906317220927473, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.38, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 109/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046260356903076, loss: 0.004615365993231535
+Beginning of Epoch 110/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:04:47,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=4, lr=[6.818228873903415e-06, 0.0003532761074561355, 6.818228873903415e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:04:47,819] [INFO] [timer.py:260:stop] epoch=109/micro_step=6/global_step=660, RunningAvgSamplesPerSec=7.741687012433769, CurrSamplesPerSec=10.365526372532605, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 110/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046348571777344, loss: 0.004624121822416782
+Beginning of Epoch 111/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.06, Samples/sec: 2.50, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 111/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046298503875732, loss: 0.004619171377271414
+Beginning of Epoch 112/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:05:10,267] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=4, lr=[6.7412385723128166e-06, 0.00034928697265869515, 6.7412385723128166e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:05:10,267] [INFO] [timer.py:260:stop] epoch=111/micro_step=4/global_step=670, RunningAvgSamplesPerSec=7.738559912483301, CurrSamplesPerSec=7.727474278739586, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 112/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046329498291016, loss: 0.004622204229235649
+Beginning of Epoch 113/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 113/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004636287689209, loss: 0.004625522997230291
+Beginning of Epoch 114/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:05:32,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=4, lr=[6.6636645658900744e-06, 0.0003452675940875686, 6.6636645658900744e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:05:32,109] [INFO] [timer.py:260:stop] epoch=113/micro_step=2/global_step=680, RunningAvgSamplesPerSec=7.738873557859264, CurrSamplesPerSec=6.886807133502467, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 114/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046257972717285, loss: 0.004615142475813627
+Beginning of Epoch 115/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:05:52,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=4, lr=[6.5855304844290145e-06, 0.00034121919608440487, 6.5855304844290145e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:05:52,855] [INFO] [timer.py:260:stop] epoch=114/micro_step=6/global_step=690, RunningAvgSamplesPerSec=7.741891019895047, CurrSamplesPerSec=10.279133358351267, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.44, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 115/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004639983177185, loss: 0.004629264585673809
+Beginning of Epoch 116/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 116/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046319961547852, loss: 0.0046212757006287575
+Beginning of Epoch 117/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:06:15,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=4, lr=[6.506860128327759e-06, 0.0003371430118304538, 6.506860128327759e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:06:15,308] [INFO] [timer.py:260:stop] epoch=116/micro_step=4/global_step=700, RunningAvgSamplesPerSec=7.73885344740097, CurrSamplesPerSec=7.74706037314427, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 117/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004638671875, loss: 0.0046278624795377254
+Beginning of Epoch 118/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.12, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 118/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046296119689941, loss: 0.004618949722498655
+Beginning of Epoch 119/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:06:37,209] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+[2023-12-06 09:06:37,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=5, lr=[6.435618095802945e-06, 0.0003334517148084427, 6.435618095802945e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:06:37,210] [INFO] [timer.py:260:stop] epoch=118/micro_step=2/global_step=710, RunningAvgSamplesPerSec=7.738792905576164, CurrSamplesPerSec=7.006813565487513, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:06:41,301] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
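The pair of OVERFLOW messages above is DeepSpeed's dynamic fp16 loss scaler at work: a step whose scaled gradients contain inf/NaN is skipped (which is why the skipped counter rises from 4 to 5 here), hysteresis absorbs the first overflow, and the next overflow halves the scale from 2097152 (2^21) to 1048576. A schematic sketch of that policy, not DeepSpeed's actual implementation:

    class DynamicLossScaler:
        """Schematic fp16 loss-scale policy: skip the step on overflow,
        halve the scale once hysteresis is used up."""

        def __init__(self, scale: float = 2.0 ** 21, hysteresis: int = 2):
            self.scale = scale            # 2**21 == 2097152, as in the log
            self.hysteresis = hysteresis

        def step_allowed(self, overflow: bool) -> bool:
            if not overflow:
                return True               # apply the optimizer step
            if self.hysteresis > 1:
                self.hysteresis -= 1      # "Reducing hysteresis to 1"
            else:
                self.scale /= 2           # "reducing to 1048576"
            return False                  # skip this optimizer step

(The real scaler also grows the scale back after a window of overflow-free steps, which is why the attempted scale is again 2097152 at later overflows below.)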
+Model Parameters: 6.927 B, Latency: 2.02s, TFLOPs: 10.35, Samples/sec: 1.98, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.35, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 119/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004667043685913, loss: 0.004656112287193537
+Beginning of Epoch 120/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:06:57,999] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=6, lr=[6.3639786673631884e-06, 0.0003297398273245175, 6.3639786673631884e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:06:58,000] [INFO] [timer.py:260:stop] epoch=119/micro_step=6/global_step=720, RunningAvgSamplesPerSec=7.741544046753895, CurrSamplesPerSec=10.276004274008145, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 120/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046265125274658, loss: 0.004615819547325373
+Beginning of Epoch 121/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.39, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 121/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046632289886475, loss: 0.004652450326830149
+Beginning of Epoch 122/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:07:20,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=6, lr=[6.283934722870896e-06, 0.00032559247268761115, 6.283934722870896e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:07:20,517] [INFO] [timer.py:260:stop] epoch=121/micro_step=4/global_step=730, RunningAvgSamplesPerSec=7.738337227896884, CurrSamplesPerSec=7.740463215337426, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 122/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004673719406128, loss: 0.0046628424897789955
+Beginning of Epoch 123/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.34, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 123/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046193599700928, loss: 0.004608681425452232
+Beginning of Epoch 124/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:07:42,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=6, lr=[6.203446372728998e-06, 0.0003214220918512434, 6.203446372728998e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:07:42,443] [INFO] [timer.py:260:stop] epoch=123/micro_step=2/global_step=740, RunningAvgSamplesPerSec=7.738183103439789, CurrSamplesPerSec=6.853928574747845, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.46, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 124/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004615068435669, loss: 0.00460436986759305
+Beginning of Epoch 125/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:08:03,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=6, lr=[6.122538134468657e-06, 0.0003172299551538164, 6.122538134468657e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:08:03,366] [INFO] [timer.py:260:stop] epoch=124/micro_step=6/global_step=750, RunningAvgSamplesPerSec=7.740101105336991, CurrSamplesPerSec=9.610036421666495, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.67s, TFLOPs: 12.56, Samples/sec: 2.40, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 125/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004639983177185, loss: 0.0046291775070130825
+Beginning of Epoch 126/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.67s, TFLOPs: 12.57, Samples/sec: 2.40, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 126/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046472549438477, loss: 0.00463646138086915
+Beginning of Epoch 127/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:08:25,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=6, lr=[6.041234653523025e-06, 0.0003130173395607785, 6.041234653523025e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:08:25,979] [INFO] [timer.py:260:stop] epoch=126/micro_step=4/global_step=760, RunningAvgSamplesPerSec=7.736636449908393, CurrSamplesPerSec=7.74423534565308, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.44, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 127/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046297311782837, loss: 0.0046190437860786915
+Beginning of Epoch 128/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 128/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004638910293579, loss: 0.004628215916454792
+Beginning of Epoch 129/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:08:47,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=6, lr=[5.9595606957199966e-06, 0.00030878552827564746, 5.9595606957199966e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:08:47,936] [INFO] [timer.py:260:stop] epoch=128/micro_step=2/global_step=770, RunningAvgSamplesPerSec=7.7364400005545715, CurrSamplesPerSec=6.838650165583531, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 129/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004638671875, loss: 0.004627843387424946
+Beginning of Epoch 130/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:09:08,757] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=6, lr=[5.877541139738319e-06, 0.0003045358103491357, 5.877541139738319e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:09:08,758] [INFO] [timer.py:260:stop] epoch=129/micro_step=6/global_step=780, RunningAvgSamplesPerSec=7.738882343029476, CurrSamplesPerSec=10.201838064551927, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.33, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 130/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004635214805603, loss: 0.004624475724995136
+Beginning of Epoch 131/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 131/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046390295028687, loss: 0.004628320224583149
+Beginning of Epoch 132/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:09:31,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=6, lr=[5.795200969529281e-06, 0.0003002694802864912, 5.795200969529281e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:09:31,209] [INFO] [timer.py:260:stop] epoch=131/micro_step=4/global_step=790, RunningAvgSamplesPerSec=7.736329011283486, CurrSamplesPerSec=7.777886414279194, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 132/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046342611312866, loss: 0.004623537417501211
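The throughput fields in each line are internally consistent: Samples/sec = Batch Size / Latency and Time/seq = Latency / Batch Size, and the timer's RunningAvgSamplesPerSec of ~7.74 is consistent with the per-device rate (~1.93) times the 4 ranks. The TFLOPs column is a per-GPU estimate; a rough cross-check under the common ~6*N FLOPs-per-token rule (an approximation, not the script's exact formula) lands within a couple of percent:

    batch, seq_len, gpus = 4, 512, 4
    n_params = 6.927e9
    latency = 2.30                           # a typical first micro-batch above

    print(batch / latency)                   # 1.74  -> logged "Samples/sec"
    print(latency / batch)                   # 0.575 -> logged "Time/seq 0.58s"
    flops = 6 * n_params * batch * seq_len   # ~6*N FLOPs per token, fwd+bwd
    print(flops / latency / gpus / 1e12)     # ~9.25 TFLOPs/GPU vs logged 9.09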
+Beginning of Epoch 133/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.12, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 133/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046360492706299, loss: 0.004625337664037943
+Beginning of Epoch 134/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:09:53,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=6, lr=[5.712565266706376e-06, 0.00029598783765318005, 5.712565266706376e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:09:53,185] [INFO] [timer.py:260:stop] epoch=133/micro_step=2/global_step=800, RunningAvgSamplesPerSec=7.73606976967669, CurrSamplesPerSec=6.838632046631068, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 134/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046379566192627, loss: 0.004627235233783722
+Beginning of Epoch 135/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:10:13,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=6, lr=[5.629659202905194e-06, 0.00029169218667902556, 5.629659202905194e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:10:13,975] [INFO] [timer.py:260:stop] epoch=134/micro_step=6/global_step=810, RunningAvgSamplesPerSec=7.738510664548841, CurrSamplesPerSec=10.344137444484675, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 135/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046409368515015, loss: 0.004630213137716055
+Beginning of Epoch 136/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:10:23,344] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.02s, TFLOPs: 10.38, Samples/sec: 1.98, Time/seq 0.50s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:10:26,917] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.89, Samples/sec: 2.65, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 136/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046753883361816, loss: 0.0046645160764455795
+Beginning of Epoch 137/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:10:36,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=8, lr=[5.5631566600704844e-06, 0.00028824645907100954, 5.5631566600704844e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:10:36,287] [INFO] [timer.py:260:stop] epoch=136/micro_step=4/global_step=820, RunningAvgSamplesPerSec=7.736718822711145, CurrSamplesPerSec=7.77243734256668, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 137/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0045894384384155, loss: 0.00457890797406435
+Beginning of Epoch 138/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.64, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 138/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046035051345825, loss: 0.0045928955078125
+Beginning of Epoch 139/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:10:58,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=8, lr=[5.479827636995519e-06, 0.0002839288931085761, 5.479827636995519e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:10:58,135] [INFO] [timer.py:260:stop] epoch=138/micro_step=2/global_step=830, RunningAvgSamplesPerSec=7.736983545255089, CurrSamplesPerSec=6.800734301220576, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.63s, TFLOPs: 12.86, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 139/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046499967575073, loss: 0.004639158025383949
+Beginning of Epoch 140/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.46s, TFLOPs: 8.52, Samples/sec: 1.63, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:11:19,067] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=8, lr=[5.396299147078869e-06, 0.0002796009920766253, 5.396299147078869e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:11:19,068] [INFO] [timer.py:260:stop] epoch=139/micro_step=6/global_step=840, RunningAvgSamplesPerSec=7.73868756830536, CurrSamplesPerSec=10.42695361084633, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.63, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 140/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046547651290894, loss: 0.004644001368433237
+Beginning of Epoch 141/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.04, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.25, Samples/sec: 2.53, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 141/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046483278274536, loss: 0.004637538455426693
+Beginning of Epoch 142/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:11:41,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=8, lr=[5.312596633907717e-06, 0.00027526407429573655, 5.312596633907717e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:11:41,442] [INFO] [timer.py:260:stop] epoch=141/micro_step=4/global_step=850, RunningAvgSamplesPerSec=7.736567479038148, CurrSamplesPerSec=7.748031728790515, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.62, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 142/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046459436416626, loss: 0.004635121673345566
+Beginning of Epoch 143/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.17, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.60, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 143/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046383142471313, loss: 0.004627531860023737
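The three lr values in each log_dist line track the optimizer's three parameter groups: the two small ones follow the run's 9.65e-6 base learning rate, and the large middle one is consistent with a 5e-4 peak for the LoRA parameters (5e-4 is inferred from the numbers, not shown in this log). Both follow the cosine schedule over the 1800 total updates (300 epochs x 6 steps per epoch), with skipped overflow steps apparently not advancing the scheduler. A check against the step=860 entry a few lines below:

    import math

    base_lr, lora_lr = 9.65e-6, 5e-4     # lora_lr is an inference, not logged
    total_steps = 300 * 6                # 300 epochs x 6 micro-batches, grad accum 1
    step, skipped = 860, 8               # from the step=860 log line below
    decay = 0.5 * (1 + math.cos(math.pi * (step - skipped) / total_steps))
    print(base_lr * decay)   # ~5.2287e-06 vs logged 5.228745594078424e-06
    print(lora_lr * decay)   # ~2.7092e-04 vs logged 0.00027091946083307894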
Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.63, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 144/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046348571777344, loss: 0.004624112509191036 +Beginning of Epoch 145/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:12:24,029] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=8, lr=[5.144771569430002e-06, 0.0002665684751000001, 5.144771569430002e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:12:24,030] [INFO] [timer.py:260:stop] epoch=144/micro_step=6/global_step=870, RunningAvgSamplesPerSec=7.739240862239181, CurrSamplesPerSec=10.257945456643741, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 145/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046361684799194, loss: 0.004625376313924789 +Beginning of Epoch 146/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.23, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 146/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004636526107788, loss: 0.004625778645277023 +Beginning of Epoch 147/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:12:46,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=8, lr=[5.060700139263835e-06, 0.00026221244244890336, 5.060700139263835e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:12:46,466] [INFO] 
[timer.py:260:stop] epoch=146/micro_step=4/global_step=880, RunningAvgSamplesPerSec=7.737012608129387, CurrSamplesPerSec=7.727130828952902, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 147/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046453475952148, loss: 0.004634547047317028
+Beginning of Epoch 148/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.61s, TFLOPs: 12.96, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 148/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046417713165283, loss: 0.004630994983017445
+Beginning of Epoch 149/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:13:08,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=8, lr=[4.976556912551969e-06, 0.00025785268976953206, 4.976556912551969e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:13:08,442] [INFO] [timer.py:260:stop] epoch=148/micro_step=2/global_step=890, RunningAvgSamplesPerSec=7.736704229709124, CurrSamplesPerSec=6.857735198445431, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 149/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046380758285522, loss: 0.004627351649105549
+Beginning of Epoch 150/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:13:29,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=8, lr=[4.892367520136377e-06, 0.0002534905450847864, 4.892367520136377e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:13:29,281] [INFO] [timer.py:260:stop] epoch=149/micro_step=6/global_step=900, RunningAvgSamplesPerSec=7.738682861441555, CurrSamplesPerSec=10.188046187805714, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 150/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046391487121582, loss: 0.0046283476985991
+Beginning of Epoch 151/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.34, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 151/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046383142471313, loss: 0.00462756771594286
+Beginning of Epoch 152/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:13:51,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=8, lr=[4.808157606921547e-06, 0.00024912733714619413, 4.808157606921547e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:13:51,777] [INFO] [timer.py:260:stop] epoch=151/micro_step=4/global_step=910, RunningAvgSamplesPerSec=7.736286590621073, CurrSamplesPerSec=7.7652200194486705, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.65s, TFLOPs: 12.67, Samples/sec: 2.42, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 152/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046435594558716, loss: 0.00463278591632843
+Beginning of Epoch 153/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.79, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:14:08,563] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.84, Samples/sec: 2.65, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 153/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046592950820923, loss: 0.004648453556001186
+Beginning of Epoch 154/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:14:13,802] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+[2023-12-06 09:14:13,802] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=10, lr=[4.740792138940107e-06, 0.0002456368983906791, 4.740792138940107e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:14:13,803] [INFO] [timer.py:260:stop] epoch=153/micro_step=2/global_step=920, RunningAvgSamplesPerSec=7.735854508794184, CurrSamplesPerSec=6.89205580328419, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.47, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 154/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004593849182129, loss: 0.004583239555358887
+Beginning of Epoch 155/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:14:34,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=10, lr=[4.656609928410434e-06, 0.00024127512582437484, 4.656609928410434e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
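Note: the "ppl" printed after each evaluation is simply the exponential of the printed loss (perplexity = exp(mean cross-entropy)); the pairs agree to float32 rounding. A minimal check in Python, using two pairs from the evaluations above:

    import math

    # (loss, ppl) pairs copied from the Epoch 147 and 148 evaluations;
    # ppl is exp(loss) up to float32 rounding in the logged values.
    pairs = [
        (0.004634547047317028, 1.0046453475952148),
        (0.004630994983017445, 1.0046417713165283),
    ]
    for loss, ppl in pairs:
        assert abs(math.exp(loss) - ppl) < 1e-6

A perplexity this close to 1.0 suggests the model is fitting the tiny 6-micro-batch evaluation set almost perfectly, which is plausible this deep into a 300-epoch run over the same local JSON file.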
+[2023-12-06 09:14:34,601] [INFO] [timer.py:260:stop] epoch=154/micro_step=6/global_step=930, RunningAvgSamplesPerSec=7.737918815820894, CurrSamplesPerSec=10.10814192908968, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.21, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 155/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046188831329346, loss: 0.004608172923326492
+Beginning of Epoch 156/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.21, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 156/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046591758728027, loss: 0.004648404195904732
+Beginning of Epoch 157/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.43s, TFLOPs: 8.61, Samples/sec: 1.64, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:14:57,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=10, lr=[4.572479011127796e-06, 0.00023691601093926405, 4.572479011127796e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:14:57,199] [INFO] [timer.py:260:stop] epoch=156/micro_step=4/global_step=940, RunningAvgSamplesPerSec=7.735165839532568, CurrSamplesPerSec=7.712306226606126, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.62s, TFLOPs: 12.94, Samples/sec: 2.47, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 157/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004669189453125, loss: 0.004658283665776253
+Beginning of Epoch 158/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.59, Samples/sec: 2.60, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 158/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046496391296387, loss: 0.004638831131160259
+Beginning of Epoch 159/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:15:19,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=10, lr=[4.488425014184596e-06, 0.00023256088156396867, 4.488425014184596e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:15:19,253] [INFO] [timer.py:260:stop] epoch=158/micro_step=2/global_step=950, RunningAvgSamplesPerSec=7.734691974196186, CurrSamplesPerSec=6.889995979484282, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.68, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 159/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046356916427612, loss: 0.004624947905540466
+Beginning of Epoch 160/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:15:40,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=10, lr=[4.404473541242549e-06, 0.00022821106431308543, 4.404473541242549e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:15:40,167] [INFO] [timer.py:260:stop] epoch=159/micro_step=6/global_step=960, RunningAvgSamplesPerSec=7.736289406073198, CurrSamplesPerSec=9.515027930220246, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.68s, TFLOPs: 12.43, Samples/sec: 2.38, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 160/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004631757736206, loss: 0.004621043801307678
+Beginning of Epoch 161/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.63s, TFLOPs: 12.85, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 161/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046418905258179, loss: 0.004631182178854942
+Beginning of Epoch 162/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.87, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:16:02,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=10, lr=[4.320650164733573e-06, 0.00022386788418308668, 4.320650164733573e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:16:02,764] [INFO] [timer.py:260:stop] epoch=161/micro_step=4/global_step=970, RunningAvgSamplesPerSec=7.733771534057517, CurrSamplesPerSec=7.73586356799606, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.00, Samples/sec: 1.91, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.12, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 162/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046498775482178, loss: 0.004639056045562029
+Beginning of Epoch 163/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.33, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 163/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046464204788208, loss: 0.0046356054954230785
+Beginning of Epoch 164/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:16:24,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=10, lr=[4.236980418070164e-06, 0.00021953266414871317, 4.236980418070164e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:16:24,891] [INFO] [timer.py:260:stop] epoch=163/micro_step=2/global_step=980, RunningAvgSamplesPerSec=7.733002585888538, CurrSamplesPerSec=6.654518634454986, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.41s, TFLOPs: 8.70, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.62s, TFLOPs: 12.91, Samples/sec: 2.47, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 164/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046411752700806, loss: 0.004630442708730698
+Beginning of Epoch 165/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:16:45,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=10, lr=[4.153489787867686e-06, 0.0002152067247599837, 4.153489787867686e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:16:45,820] [INFO] [timer.py:260:stop] epoch=164/micro_step=6/global_step=990, RunningAvgSamplesPerSec=7.734525144260239, CurrSamplesPerSec=10.046497311981284, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.13, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 165/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046401023864746, loss: 0.004629343748092651
+Beginning of Epoch 166/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.52s, TFLOPs: 8.31, Samples/sec: 1.59, Time/seq 0.63s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 166/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004645824432373, loss: 0.004635074641555548
+Beginning of Epoch 167/300, Total Micro Batches 6
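Note: the per-step throughput lines are internally consistent: Samples/sec = Batch Size / Latency and Time/seq = Latency / Batch Size. The TFLOPs column additionally folds in an analytic estimate of FLOPs per sample for the 6.927 B-parameter model; that exact formula is not recoverable from the log alone, so only the two latency-derived fields are checked in this Python sketch:

    # Derived fields of one throughput line, values taken from a step above.
    batch_size, latency = 4, 2.07          # "Batch Size: 4", "Latency: 2.07s"
    print(round(batch_size / latency, 2))  # 1.93 -> "Samples/sec: 1.93"
    print(round(latency / batch_size, 2))  # 0.52 -> "Time/seq 0.52s"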
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.77, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:17:08,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=10, lr=[4.070203706180886e-06, 0.00021089138373994224, 4.070203706180886e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:17:08,571] [INFO] [timer.py:260:stop] epoch=166/micro_step=4/global_step=1000, RunningAvgSamplesPerSec=7.731476311949026, CurrSamplesPerSec=7.710856486018995, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.64s, TFLOPs: 12.77, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 167/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046395063400269, loss: 0.004628775641322136
+Beginning of Epoch 168/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 168/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046426057815552, loss: 0.004631810821592808
+Beginning of Epoch 169/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:17:30,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=10, lr=[3.987147542757061e-06, 0.00020658795558326743, 3.987147542757061e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:17:30,682] [INFO] [timer.py:260:stop] epoch=168/micro_step=2/global_step=1010, RunningAvgSamplesPerSec=7.730845849020505, CurrSamplesPerSec=6.710910559278013, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.77, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 169/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046461820602417, loss: 0.004635367542505264
+Beginning of Epoch 170/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.77, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:17:51,511] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=10, lr=[3.904346597308171e-06, 0.0002022977511558638, 3.904346597308171e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:17:51,512] [INFO] [timer.py:260:stop] epoch=169/micro_step=6/global_step=1020, RunningAvgSamplesPerSec=7.732679122005515, CurrSamplesPerSec=10.5056468612068, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.73, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 170/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046418905258179, loss: 0.004631101153790951
+Beginning of Epoch 171/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:17:56,686] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.27s, TFLOPs: 9.22, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:18:00,767] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
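Note: the two OVERFLOW lines above are DeepSpeed's dynamic fp16 loss scaling at work. An overflowing step is skipped (the skipped= counter in the surrounding log_dist lines rises by 2 after each such pair, 8 -> 10 -> 12); the first overflow only spends the hysteresis budget, and the next one halves the scale, 2097152 (2^21) -> 1048576 (2^20). A simplified sketch of that policy; the class and method names are illustrative, not DeepSpeed's actual loss_scaler.py:

    # Illustrative dynamic loss-scaling policy matching the messages above.
    class DynamicLossScale:
        def __init__(self, scale=2**21, hysteresis=2):
            self.scale = scale
            self.hysteresis = hysteresis

        def on_overflow(self):
            # The optimizer step is skipped in both branches.
            if self.hysteresis > 1:
                self.hysteresis -= 1   # "Reducing hysteresis to 1"
            else:
                self.scale //= 2       # "reducing to 1048576"

    s = DynamicLossScale()
    s.on_overflow()
    s.on_overflow()
    print(s.scale)  # 1048576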
+Model Parameters: 6.927 B, Latency: 2.02s, TFLOPs: 10.37, Samples/sec: 1.98, Time/seq 0.50s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.78, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 171/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004660964012146, loss: 0.004650114569813013
+Beginning of Epoch 172/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.86, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:18:13,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=12, lr=[3.838306549863362e-06, 0.00019887598703955244, 3.838306549863362e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:18:13,758] [INFO] [timer.py:260:stop] epoch=171/micro_step=4/global_step=1030, RunningAvgSamplesPerSec=7.731444189053446, CurrSamplesPerSec=7.786268136649869, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 172/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046581029891968, loss: 0.004647254478186369
+Beginning of Epoch 173/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.68, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 173/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046336650848389, loss: 0.004622924607247114
+Beginning of Epoch 174/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:18:35,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=12, lr=[3.756028498986071e-06, 0.0001946128755951332, 3.756028498986071e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:18:35,567] [INFO] [timer.py:260:stop] epoch=173/micro_step=2/global_step=1040, RunningAvgSamplesPerSec=7.7318139547344185, CurrSamplesPerSec=6.832597509210825, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.67, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 174/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004659652709961, loss: 0.004648763686418533
+Beginning of Epoch 175/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:18:56,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=12, lr=[3.6740760671833485e-06, 0.00019036663560535483, 3.6740760671833485e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:18:56,221] [INFO] [timer.py:260:stop] epoch=174/micro_step=6/global_step=1050, RunningAvgSamplesPerSec=7.73420334996445, CurrSamplesPerSec=10.497525429672976, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.72, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 175/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046722888946533, loss: 0.004661307670176029
+Beginning of Epoch 176/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 176/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004657506942749, loss: 0.0046467287465929985
+Beginning of Epoch 177/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:19:18,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=12, lr=[3.592474217959812e-06, 0.0001861385605160524, 3.592474217959812e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:19:18,564] [INFO] [timer.py:260:stop] epoch=176/micro_step=4/global_step=1060, RunningAvgSamplesPerSec=7.732704010348308, CurrSamplesPerSec=7.775002832693149, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.38, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 177/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046356916427612, loss: 0.004625000525265932
+Beginning of Epoch 178/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.69s, TFLOPs: 12.35, Samples/sec: 2.36, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 178/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046359300613403, loss: 0.004625236615538597
+Beginning of Epoch 179/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:19:40,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=12, lr=[3.511247808029229e-06, 0.00018192993823985644, 3.511247808029229e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:19:40,682] [INFO] [timer.py:260:stop] epoch=178/micro_step=2/global_step=1070, RunningAvgSamplesPerSec=7.732013426075244, CurrSamplesPerSec=6.765015317056363, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.44, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 179/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046430826187134, loss: 0.004632371012121439
+Beginning of Epoch 180/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:20:01,507] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=12, lr=[3.430421579742924e-06, 0.00017774205076388205, 3.430421579742924e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:20:01,507] [INFO] [timer.py:260:stop] epoch=179/micro_step=6/global_step=1080, RunningAvgSamplesPerSec=7.73373236492358, CurrSamplesPerSec=10.279293956328422, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 180/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046477317810059, loss: 0.00463686790317297
+Beginning of Epoch 181/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.87, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 181/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046464204788208, loss: 0.004635610617697239
+Beginning of Epoch 182/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:20:23,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=12, lr=[3.350020153553016e-06, 0.0001735761737592236, 3.350020153553016e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:20:23,992] [INFO] [timer.py:260:stop] epoch=181/micro_step=4/global_step=1090, RunningAvgSamplesPerSec=7.731784657043682, CurrSamplesPerSec=7.752829466749723, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 182/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046486854553223, loss: 0.004637884441763163
+Beginning of Epoch 183/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 183/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046454668045044, loss: 0.0046346415765583515
+Beginning of Epoch 184/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:20:45,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=12, lr=[3.2700680205127847e-06, 0.00016943357619237227, 3.2700680205127847e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:20:45,834] [INFO] [timer.py:260:stop] epoch=183/micro_step=2/global_step=1100, RunningAvgSamplesPerSec=7.732027508117204, CurrSamplesPerSec=6.8784622486874225, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.20, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 184/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046473741531372, loss: 0.004636617377400398
+Beginning of Epoch 185/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:21:06,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=12, lr=[3.1905895348164694e-06, 0.00016531551993867715, 3.1905895348164694e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:21:06,669] [INFO] [timer.py:260:stop] epoch=184/micro_step=6/global_step=1110, RunningAvgSamplesPerSec=7.733684431684904, CurrSamplesPerSec=10.003096525776781, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.60s, TFLOPs: 13.08, Samples/sec: 2.50, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 185/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046473741531372, loss: 0.004636526107788086
+Beginning of Epoch 186/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 186/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046472549438477, loss: 0.00463649770244956
+Beginning of Epoch 187/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:21:29,052] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=12, lr=[3.11160890638074e-06, 0.00016122325939796578, 3.11160890638074e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:21:29,052] [INFO] [timer.py:260:stop] epoch=186/micro_step=4/global_step=1120, RunningAvgSamplesPerSec=7.732096833794225, CurrSamplesPerSec=7.729467958042941, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 187/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004649043083191, loss: 0.004638249985873699
+Beginning of Epoch 188/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:21:41,975] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.01s, TFLOPs: 10.39, Samples/sec: 1.99, Time/seq 0.50s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:21:45,553] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+Model Parameters: 6.927 B, Latency: 1.50s, TFLOPs: 13.94, Samples/sec: 2.66, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 188/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046619176864624, loss: 0.004651087801903486
+Beginning of Epoch 189/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:21:50,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=14, lr=[3.0487990332964295e-06, 0.00015796886182883053, 3.0487990332964295e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:21:50,809] [INFO] [timer.py:260:stop] epoch=188/micro_step=2/global_step=1130, RunningAvgSamplesPerSec=7.732634565291675, CurrSamplesPerSec=6.868639121568987, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.56, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 189/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004642128944397, loss: 0.004631316289305687
+Beginning of Epoch 190/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.76, Samples/sec: 1.67, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:22:11,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=14, lr=[2.970775068166448e-06, 0.0001539261693350491, 2.970775068166448e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:22:11,679] [INFO] [timer.py:260:stop] epoch=189/micro_step=6/global_step=1140, RunningAvgSamplesPerSec=7.7342730861359055, CurrSamplesPerSec=10.34853199787567, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
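Note: the lr=[...] triplets for the three optimizer parameter groups decay smoothly across these steps, consistent with a cosine schedule that only advances on non-skipped optimizer steps. A hedged reconstruction in Python; the peak LR of 9.65e-6, the 1800 total scheduler steps (300 epochs x 6 steps), and the "effective step = step - skipped" convention are assumptions about this run's configuration, inferred rather than read from DeepSpeed:

    import math

    # Assumed cosine schedule: lr(t) = peak * 0.5 * (1 + cos(pi * t / T)),
    # with t counting only non-skipped optimizer steps.
    def cosine_lr(step, skipped, peak=9.65e-6, total=1800):
        t = step - skipped
        return 0.5 * peak * (1 + math.cos(math.pi * t / total))

    print(cosine_lr(890, 8))    # ~4.977e-06 vs. logged 4.976556912551969e-06
    print(cosine_lr(1140, 14))  # ~2.971e-06 vs. logged 2.970775068166448e-06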
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 190/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004610538482666, loss: 0.004599877633154392
+Beginning of Epoch 191/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.17, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 191/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.00462007522583, loss: 0.004609429277479649
+Beginning of Epoch 192/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:22:34,127] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=14, lr=[2.893315917912983e-06, 0.0001499127418607763, 2.893315917912983e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:22:34,128] [INFO] [timer.py:260:stop] epoch=191/micro_step=4/global_step=1150, RunningAvgSamplesPerSec=7.7325987033920915, CurrSamplesPerSec=7.757266390609191, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 192/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004644751548767, loss: 0.004633954726159573
+Beginning of Epoch 193/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.25, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 193/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046558380126953, loss: 0.0046450113877654076
+Beginning of Epoch 194/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:22:55,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=14, lr=[2.816445177343565e-06, 0.00014592980193489974, 2.816445177343565e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:22:55,953] [INFO] [timer.py:260:stop] epoch=193/micro_step=2/global_step=1160, RunningAvgSamplesPerSec=7.7328896666266775, CurrSamplesPerSec=6.905295322388394, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.44, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 194/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046508312225342, loss: 0.004640028811991215
+Beginning of Epoch 195/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:23:16,705] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=14, lr=[2.7401862620304347e-06, 0.0001419785627995044, 2.7401862620304347e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:23:16,705] [INFO] [timer.py:260:stop] epoch=194/micro_step=6/global_step=1170, RunningAvgSamplesPerSec=7.734728230355722, CurrSamplesPerSec=10.338677769956751, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 195/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046545267105103, loss: 0.004643694963306189
+Beginning of Epoch 196/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
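Note: the timer lines encode simple step bookkeeping: each epoch is 6 micro-batches ("Total Micro Batches 6"), the timer's epoch= counter is 0-based while the "Beginning of Epoch N/300" banners are 1-based, and global_step = epoch * 6 + micro_step. Checked in Python against three timer lines from the surrounding log:

    # (epoch, micro_step, global_step) taken from [timer.py] lines nearby.
    for epoch, micro_step, global_step in [
        (148, 2, 890),   # epoch=148/micro_step=2/global_step=890
        (193, 2, 1160),  # epoch=193/micro_step=2/global_step=1160
        (201, 4, 1210),  # epoch=201/micro_step=4/global_step=1210
    ]:
        assert epoch * 6 + micro_step == global_step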
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 196/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046523809432983, loss: 0.004641575738787651
+Beginning of Epoch 197/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:23:39,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=14, lr=[2.664562401177937e-06, 0.0001380602280403076, 2.664562401177937e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:23:39,084] [INFO] [timer.py:260:stop] epoch=196/micro_step=4/global_step=1180, RunningAvgSamplesPerSec=7.733264899694819, CurrSamplesPerSec=7.751893620641479, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.29, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 197/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046483278274536, loss: 0.004637547302991152
+Beginning of Epoch 198/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 198/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046480894088745, loss: 0.004637275356799364
+Beginning of Epoch 199/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:24:01,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=14, lr=[2.5895966305466683e-06, 0.00013417599122003462, 2.5895966305466683e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:24:01,013] [INFO] [timer.py:260:stop] epoch=198/micro_step=2/global_step=1190, RunningAvgSamplesPerSec=7.733227141483117, CurrSamplesPerSec=6.822323598116352, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 199/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.00464928150177, loss: 0.00463847303763032
+Beginning of Epoch 200/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:24:21,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=14, lr=[2.515311785436573e-06, 0.00013032703551484832, 2.515311785436573e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:24:21,883] [INFO] [timer.py:260:stop] epoch=199/micro_step=6/global_step=1200, RunningAvgSamplesPerSec=7.734667503814574, CurrSamplesPerSec=10.272271698302665, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 200/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004647970199585, loss: 0.004637092351913452
+Beginning of Epoch 201/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.46, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 201/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004649043083191, loss: 0.004638225771486759
+Beginning of Epoch 202/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:24:44,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=14, lr=[2.4417304937310867e-06, 0.0001265145333539423, 2.4417304937310867e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:24:44,295] [INFO] [timer.py:260:stop] epoch=201/micro_step=4/global_step=1210, RunningAvgSamplesPerSec=7.73314292324986, CurrSamplesPerSec=7.753996680433681, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 202/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046443939208984, loss: 0.004633601289242506
+Beginning of Epoch 203/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 203/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046466588974, loss: 0.004635862074792385
+Beginning of Epoch 204/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:25:06,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=14, lr=[2.3688751690044587e-06, 0.00012273964606240718, 2.3688751690044587e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:25:06,190] [INFO] [timer.py:260:stop] epoch=203/micro_step=2/global_step=1220, RunningAvgSamplesPerSec=7.733207631943375, CurrSamplesPerSec=6.802583870051521, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 204/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004645824432373, loss: 0.004635047633200884
+Beginning of Epoch 205/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:25:26,868] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+[2023-12-06 09:25:26,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=15, lr=[2.3039444252455474e-06, 0.00011937535882101281, 2.3039444252455474e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:25:26,868] [INFO] [timer.py:260:stop] epoch=204/micro_step=6/global_step=1230, RunningAvgSamplesPerSec=7.735172984401048, CurrSamplesPerSec=10.620574922927917, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.88, Samples/sec: 2.65, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 205/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046555995941162, loss: 0.0046447403728961945
+Beginning of Epoch 206/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:25:32,059] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
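Note: the two OVERFLOW records above show fp16 dynamic loss scaling reacting to consecutive overflows. The first overflow spends the remaining hysteresis (2 -> 1) while keeping the scale at 2097152 (2^21); the next one halves the scale to 1048576; in both cases the optimizer step is skipped, which is why skipped= in the log_dist lines climbs from 14 to 15 and then 16. A rough sketch of that policy (assumed semantics for illustration, not DeepSpeed's actual loss_scaler code):

class DynamicLossScaler:
    def __init__(self, scale=2 ** 21, hysteresis=2, growth_interval=1000):
        self.scale = scale                  # 2**21 == 2097152, the value in the log
        self.hysteresis = hysteresis        # overflows tolerated before shrinking
        self.growth_interval = growth_interval
        self.good_steps = 0

    def update(self, overflow):
        # the caller skips the optimizer step whenever overflow is True
        if overflow:
            self.good_steps = 0
            if self.hysteresis > 1:
                self.hysteresis -= 1        # "Reducing hysteresis to 1"
            else:
                self.scale /= 2             # "reducing to 1048576"
        else:
            self.good_steps += 1
            if self.good_steps % self.growth_interval == 0:
                self.scale *= 2             # grow back after a run of clean steps

scaler = DynamicLossScaler()
scaler.update(overflow=True)   # hysteresis 2 -> 1, scale unchanged
scaler.update(overflow=True)   # scale 2097152 -> 1048576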
+Model Parameters: 6.927 B, Latency: 2.27s, TFLOPs: 9.22, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 206/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004635214805603, loss: 0.004624475724995136
+Beginning of Epoch 207/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:25:49,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=16, lr=[2.23963571422634e-06, 0.00011604330125525078, 2.23963571422634e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:25:49,160] [INFO] [timer.py:260:stop] epoch=206/micro_step=4/global_step=1240, RunningAvgSamplesPerSec=7.734089692458622, CurrSamplesPerSec=7.771037797920879, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.95, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 207/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046274662017822, loss: 0.004616747610270977
+Beginning of Epoch 208/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.10s, TFLOPs: 9.96, Samples/sec: 1.90, Time/seq 0.53s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.31, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 208/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046337842941284, loss: 0.004623069427907467
+Beginning of Epoch 209/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:26:11,040] [INFO] [logging.py:96:log_dist] [Rank 0]
step=1250, skipped=16, lr=[2.1689304290898963e-06, 0.0001123798149787511, 2.1689304290898963e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:26:11,040] [INFO] [timer.py:260:stop] epoch=208/micro_step=2/global_step=1250, RunningAvgSamplesPerSec=7.734210209495319, CurrSamplesPerSec=6.8842094529766475, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 209/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046555995941162, loss: 0.004644826054573059 +Beginning of Epoch 210/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:26:31,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=16, lr=[2.099034208474675e-06, 0.00010875824914376553, 2.099034208474675e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:26:31,762] [INFO] [timer.py:260:stop] epoch=209/micro_step=6/global_step=1260, RunningAvgSamplesPerSec=7.735972508409919, CurrSamplesPerSec=10.3212902400252, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.49, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 210/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046632289886475, loss: 0.0046523986384272575 +Beginning of Epoch 211/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 211/300 ***** +Invalidate trace cache @ step 0: expected module 0, but 
got module 6 +ppl: 1.0046579837799072, loss: 0.00464710034430027 +Beginning of Epoch 212/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.13, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:26:54,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=16, lr=[2.0299683434465758e-06, 0.00010517970691433035, 2.0299683434465758e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:26:54,052] [INFO] [timer.py:260:stop] epoch=211/micro_step=4/global_step=1270, RunningAvgSamplesPerSec=7.734831681028381, CurrSamplesPerSec=7.761785599452236, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.50, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 212/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046528577804565, loss: 0.004642024636268616 +Beginning of Epoch 213/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.39, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 213/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046488046646118, loss: 0.004638058599084616 +Beginning of Epoch 214/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:27:15,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=16, lr=[1.9617538721371414e-06, 0.00010164527834907467, 1.9617538721371414e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:27:15,909] [INFO] [timer.py:260:stop] epoch=213/micro_step=2/global_step=1280, RunningAvgSamplesPerSec=7.734975755419193, CurrSamplesPerSec=6.875127700666768, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 
10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.33, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 214/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004650354385376, loss: 0.0046395957469940186 +Beginning of Epoch 215/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:27:36,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=16, lr=[1.894411573335143e-06, 9.815604006917839e-05, 1.894411573335143e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:27:36,720] [INFO] [timer.py:260:stop] epoch=214/micro_step=6/global_step=1290, RunningAvgSamplesPerSec=7.736461183080137, CurrSamplesPerSec=10.084315985992616, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.18, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 215/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046472549438477, loss: 0.004636465571820736 +Beginning of Epoch 216/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 216/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046528577804565, loss: 0.004642083775252104 +Beginning of Epoch 217/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:27:59,090] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=16, lr=[1.827961960157153e-06, 9.471305493042242e-05, 1.827961960157153e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:27:59,091] [INFO] [timer.py:260:stop] 
epoch=216/micro_step=4/global_step=1300, RunningAvgSamplesPerSec=7.735130315532385, CurrSamplesPerSec=7.767199064262941, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.62s, TFLOPs: 12.94, Samples/sec: 2.47, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 217/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004652500152588, loss: 0.004641670733690262 +Beginning of Epoch 218/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.01, Samples/sec: 1.91, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 218/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004651427268982, loss: 0.004640614613890648 +Beginning of Epoch 219/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:28:21,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=16, lr=[1.7624252737990595e-06, 9.131737169943313e-05, 1.7624252737990595e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:28:21,143] [INFO] [timer.py:260:stop] epoch=218/micro_step=2/global_step=1310, RunningAvgSamplesPerSec=7.73491693732998, CurrSamplesPerSec=6.884117648084637, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.67, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 219/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046517848968506, loss: 0.004640921019017696 +Beginning of Epoch 220/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 
0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:28:41,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=16, lr=[1.6978214773703937e-06, 8.797002473421728e-05, 1.6978214773703937e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:28:41,834] [INFO] [timer.py:260:stop] epoch=219/micro_step=6/global_step=1320, RunningAvgSamplesPerSec=7.736690383575355, CurrSamplesPerSec=10.522976809387576, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.75, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 220/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046552419662476, loss: 0.004644409753382206 +Beginning of Epoch 221/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.18, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.73, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 221/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046474933624268, loss: 0.004636731930077076 +Beginning of Epoch 222/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.15, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:29:04,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=16, lr=[1.6341702498133807e-06, 8.467203366908707e-05, 1.6341702498133807e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:29:04,105] [INFO] [timer.py:260:stop] epoch=221/micro_step=4/global_step=1330, RunningAvgSamplesPerSec=7.73565156355225, CurrSamplesPerSec=7.777327553297789, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.76, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 222/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046520233154297, loss: 0.004641297273337841 +Beginning of Epoch 223/300, Total Micro Batches 6 +Model Parameters: 6.927 B, 
Latency: 2.38s, TFLOPs: 8.78, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:29:12,952] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.19, Samples/sec: 1.76, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:29:17,039] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+Model Parameters: 6.927 B, Latency: 2.02s, TFLOPs: 10.37, Samples/sec: 1.98, Time/seq 0.50s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 223/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046554803848267, loss: 0.004644624423235655
+Beginning of Epoch 224/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:29:25,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=18, lr=[1.5839481565165732e-06, 8.206985266925248e-05, 1.5839481565165732e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:29:25,881] [INFO] [timer.py:260:stop] epoch=223/micro_step=2/global_step=1340, RunningAvgSamplesPerSec=7.736022285449305, CurrSamplesPerSec=6.883352937870098, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.46, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 224/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046558380126953, loss: 0.0046450369991362095
+Beginning of Epoch 225/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:29:46,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=18, lr=[1.522060213894077e-06, 7.886322351782782e-05,
1.522060213894077e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:29:46,662] [INFO] [timer.py:260:stop] epoch=224/micro_step=6/global_step=1350, RunningAvgSamplesPerSec=7.737519781770807, CurrSamplesPerSec=9.96453243487197, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.61s, TFLOPs: 13.02, Samples/sec: 2.49, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 225/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046494007110596, loss: 0.004638657905161381 +Beginning of Epoch 226/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.23, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 226/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046380758285522, loss: 0.004627339541912079 +Beginning of Epoch 227/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.42s, TFLOPs: 8.65, Samples/sec: 1.65, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:30:09,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=18, lr=[1.4611783787267265e-06, 7.570872428635889e-05, 1.4611783787267265e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:30:09,226] [INFO] [timer.py:260:stop] epoch=226/micro_step=4/global_step=1360, RunningAvgSamplesPerSec=7.73570380359671, CurrSamplesPerSec=7.766857467709302, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.45, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 227/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046457052230835, loss: 0.0046349926851689816 +Beginning of Epoch 228/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 
0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 228/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046595335006714, loss: 0.00464867427945137 +Beginning of Epoch 229/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:30:31,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=18, lr=[1.4013211962112878e-06, 7.260731586586983e-05, 1.4013211962112878e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:30:31,213] [INFO] [timer.py:260:stop] epoch=228/micro_step=2/global_step=1370, RunningAvgSamplesPerSec=7.735496275050307, CurrSamplesPerSec=6.838314980504205, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 229/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046660900115967, loss: 0.004655242897570133 +Beginning of Epoch 230/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:30:52,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=18, lr=[1.3425068994254028e-06, 6.955994297540946e-05, 1.3425068994254028e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:30:52,032] [INFO] [timer.py:260:stop] epoch=229/micro_step=6/global_step=1380, RunningAvgSamplesPerSec=7.736845433722544, CurrSamplesPerSec=10.418937169686854, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.62, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 230/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046621561050415, loss: 0.004651273135095835 +Beginning of Epoch 231/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, 
Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.59, Samples/sec: 2.60, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 231/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046569108963013, loss: 0.004646029323339462 +Beginning of Epoch 232/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:31:14,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=18, lr=[1.284753403773621e-06, 6.656753387428089e-05, 1.284753403773621e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:31:14,410] [INFO] [timer.py:260:stop] epoch=231/micro_step=4/global_step=1390, RunningAvgSamplesPerSec=7.735559463575248, CurrSamplesPerSec=7.758215194174269, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 232/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046565532684326, loss: 0.0046456847339868546 +Beginning of Epoch 233/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 233/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046541690826416, loss: 0.0046433634124696255 +Beginning of Epoch 234/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:31:36,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=18, lr=[1.2280783015301902e-06, 6.363100007928447e-05, 
1.2280783015301902e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:31:36,351] [INFO] [timer.py:260:stop] epoch=233/micro_step=2/global_step=1400, RunningAvgSamplesPerSec=7.7354725863989655, CurrSamplesPerSec=6.871400951957538, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.31, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 234/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046497583389282, loss: 0.004638966638594866 +Beginning of Epoch 235/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:31:57,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=18, lr=[1.172498856480276e-06, 6.075123608706093e-05, 1.172498856480276e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:31:57,149] [INFO] [timer.py:260:stop] epoch=234/micro_step=6/global_step=1410, RunningAvgSamplesPerSec=7.736842171293425, CurrSamplesPerSec=10.348723497042599, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 235/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046496391296387, loss: 0.0046388693153858185 +Beginning of Epoch 236/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.43s, TFLOPs: 8.62, Samples/sec: 1.65, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.67, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 236/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046507120132446, loss: 0.004639935679733753 
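Note on the lr=[...] triples in the log_dist records above: the three entries correspond to three optimizer parameter groups (the roughly 50x larger middle value presumably being the LoRA group), and they decay smoothly toward zero. Assuming a plain cosine decay over the full run (300 epochs x 6 micro-batches = 1800 scheduler steps, no warmup) that advances only on non-skipped steps, the logged values can be reproduced; the base learning rates below (9.65e-6 and 5e-4) are assumptions inverted from the log, not taken from it:

import math

def cosine_lr(base_lr, step, total_steps=300 * 6):
    # cosine decay to zero with no warmup (assumed schedule)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * step / total_steps))

# global_step=1160 with skipped=14 -> 1146 effective scheduler steps
print(cosine_lr(9.65e-6, 1146))  # ~2.82e-06, vs. logged 2.816445e-06
print(cosine_lr(5e-4, 1146))     # ~1.46e-04, vs. logged 1.459298e-04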
+Beginning of Epoch 237/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:32:19,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=18, lr=[1.1180319986612508e-06, 5.792911910161921e-05, 1.1180319986612508e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:32:19,651] [INFO] [timer.py:260:stop] epoch=236/micro_step=4/global_step=1420, RunningAvgSamplesPerSec=7.73528561783852, CurrSamplesPerSec=7.750718983461734, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.55, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 237/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046501159667969, loss: 0.004639298655092716 +Beginning of Epoch 238/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.70, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 238/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046547651290894, loss: 0.004643899388611317 +Beginning of Epoch 239/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:32:41,450] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=18, lr=[1.0646943192056365e-06, 5.5165508767131415e-05, 1.0646943192056365e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:32:41,451] [INFO] [timer.py:260:stop] epoch=238/micro_step=2/global_step=1430, RunningAvgSamplesPerSec=7.7355812642510795, CurrSamplesPerSec=6.891278006110349, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence 
Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.59, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 239/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046558380126953, loss: 0.004644996486604214
+Beginning of Epoch 240/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:32:58,485] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+Model Parameters: 6.927 B, Latency: 2.01s, TFLOPs: 10.40, Samples/sec: 1.99, Time/seq 0.50s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:33:02,069] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+[2023-12-06 09:33:02,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=20, lr=[1.022848113847567e-06, 5.299731159831953e-05, 1.022848113847567e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:33:02,070] [INFO] [timer.py:260:stop] epoch=239/micro_step=6/global_step=1440, RunningAvgSamplesPerSec=7.737392884374001, CurrSamplesPerSec=10.523447094908642, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.75, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 240/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.00465989112854, loss: 0.004649053327739239
+Beginning of Epoch 241/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.61, Samples/sec: 2.60, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 241/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046606063842773, loss: 0.004649748094379902
+Beginning of Epoch 242/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06
09:33:24,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=20, lr=[9.715836640218132e-07, 5.0341122488176846e-05, 9.715836640218132e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:33:24,411] [INFO] [timer.py:260:stop] epoch=241/micro_step=4/global_step=1450, RunningAvgSamplesPerSec=7.736278842551517, CurrSamplesPerSec=7.759370572698879, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.57, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 242/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046534538269043, loss: 0.004642621614038944 +Beginning of Epoch 243/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 243/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046404600143433, loss: 0.0046296752989292145 +Beginning of Epoch 244/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.43s, TFLOPs: 8.61, Samples/sec: 1.65, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:33:46,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=20, lr=[9.214930021408792e-07, 4.7745751406263163e-05, 9.214930021408792e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:33:46,388] [INFO] [timer.py:260:stop] epoch=243/micro_step=2/global_step=1460, RunningAvgSamplesPerSec=7.736093836759811, CurrSamplesPerSec=6.836755861132553, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 244/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004638433456421, loss: 0.004627726972103119 +Beginning of Epoch 245/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 
0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:34:07,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=20, lr=[8.725913863056142e-07, 4.521198892775202e-05, 8.725913863056142e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:34:07,218] [INFO] [timer.py:260:stop] epoch=244/micro_step=6/global_step=1470, RunningAvgSamplesPerSec=7.737340718459995, CurrSamplesPerSec=10.229306491276287, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 245/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046378374099731, loss: 0.004627154674381018 +Beginning of Epoch 246/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.21, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 246/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046424865722656, loss: 0.0046316939406096935 +Beginning of Epoch 247/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:34:29,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=20, lr=[8.248937124219243e-07, 4.274060686123959e-05, 8.248937124219243e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:34:29,674] [INFO] [timer.py:260:stop] epoch=246/micro_step=4/global_step=1480, RunningAvgSamplesPerSec=7.73595134252695, CurrSamplesPerSec=7.730840099996901, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.39, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 
+***** Evaluating perplexity, Epoch 247/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046477317810059, loss: 0.004636987578123808 +Beginning of Epoch 248/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.51, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 248/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046508312225342, loss: 0.004640006925910711 +Beginning of Epoch 249/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:34:51,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=20, lr=[7.784145096633295e-07, 4.033235801364402e-05, 7.784145096633295e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:34:51,518] [INFO] [timer.py:260:stop] epoch=248/micro_step=2/global_step=1490, RunningAvgSamplesPerSec=7.736109261854205, CurrSamplesPerSec=6.896918358261047, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 249/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046509504318237, loss: 0.004640177357941866 +Beginning of Epoch 250/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:35:12,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=20, lr=[7.331679360452449e-07, 3.798797596089351e-05, 7.331679360452449e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:35:12,362] [INFO] 
[timer.py:260:stop] epoch=249/micro_step=6/global_step=1500, RunningAvgSamplesPerSec=7.7373045197307135, CurrSamplesPerSec=10.185140795146504, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.31, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 250/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046501159667969, loss: 0.0046392641961574554 +Beginning of Epoch 251/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.71, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 251/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046558380126953, loss: 0.004644969943910837 +Beginning of Epoch 252/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:35:34,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=20, lr=[6.891677741123086e-07, 3.570817482447195e-05, 6.891677741123086e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:35:34,659] [INFO] [timer.py:260:stop] epoch=251/micro_step=4/global_step=1510, RunningAvgSamplesPerSec=7.736326504579678, CurrSamplesPerSec=7.7582412043454285, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.75, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 252/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046576261520386, loss: 0.004646811634302139 +Beginning of Epoch 253/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, 
Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.82, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 253/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046552419662476, loss: 0.004644398577511311 +Beginning of Epoch 254/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:35:56,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=20, lr=[6.464274267400833e-07, 3.3493649053890325e-05, 6.464274267400833e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:35:56,400] [INFO] [timer.py:260:stop] epoch=253/micro_step=2/global_step=1520, RunningAvgSamplesPerSec=7.736741243212721, CurrSamplesPerSec=6.937586611461385, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 254/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046519041061401, loss: 0.004641111008822918 +Beginning of Epoch 255/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.10, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:36:17,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=20, lr=[6.049599130524156e-07, 3.134507321515106e-05, 6.049599130524156e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:36:17,199] [INFO] [timer.py:260:stop] epoch=254/micro_step=6/global_step=1530, RunningAvgSamplesPerSec=7.738008962967379, CurrSamplesPerSec=10.243365168030818, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.39, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 255/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046547651290894, loss: 0.004643917083740234 +Beginning of Epoch 256/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model 
Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.51s, TFLOPs: 13.83, Samples/sec: 2.64, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 256/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004651665687561, loss: 0.004640886560082436 +Beginning of Epoch 257/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.45s, TFLOPs: 8.55, Samples/sec: 1.63, Time/seq 0.61s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:36:39,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=20, lr=[5.647778644556773e-07, 2.9263101785268254e-05, 5.647778644556773e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:36:39,658] [INFO] [timer.py:260:stop] epoch=256/micro_step=4/global_step=1540, RunningAvgSamplesPerSec=7.736641197261128, CurrSamplesPerSec=7.734774908016168, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:36:43,215] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1 +Model Parameters: 6.927 B, Latency: 1.49s, TFLOPs: 14.08, Samples/sec: 2.69, Time/seq 0.37s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 257/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046591758728027, loss: 0.004648256581276655 +Beginning of Epoch 258/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:36:48,404] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. 
Attempted loss scale: 2097152, reducing to 1048576 +Model Parameters: 6.927 B, Latency: 2.28s, TFLOPs: 9.16, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.72, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 258/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046488046646118, loss: 0.004638071171939373 +Beginning of Epoch 259/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:37:01,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=22, lr=[5.335659988694895e-07, 2.764590667717562e-05, 5.335659988694895e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:37:01,372] [INFO] [timer.py:260:stop] epoch=258/micro_step=2/global_step=1550, RunningAvgSamplesPerSec=7.737119029600906, CurrSamplesPerSec=6.871267275217394, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.65, Samples/sec: 2.61, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 259/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004650354385376, loss: 0.004639556165784597 +Beginning of Epoch 260/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:37:22,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=22, lr=[4.957283663213656e-07, 2.568540758141791e-05, 4.957283663213656e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:37:22,085] [INFO] [timer.py:260:stop] epoch=259/micro_step=6/global_step=1560, RunningAvgSamplesPerSec=7.7385716759067416, CurrSamplesPerSec=10.340084367641753, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence 
Length: 512 +***** Evaluating perplexity, Epoch 260/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046446323394775, loss: 0.004633807577192783 +Beginning of Epoch 261/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.08, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.52s, TFLOPs: 13.74, Samples/sec: 2.63, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 261/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046457052230835, loss: 0.004634868819266558 +Beginning of Epoch 262/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:37:44,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=22, lr=[4.5920947185145666e-07, 2.379323688349516e-05, 4.5920947185145666e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:37:44,463] [INFO] [timer.py:260:stop] epoch=261/micro_step=4/global_step=1570, RunningAvgSamplesPerSec=7.737430969915318, CurrSamplesPerSec=7.729750182362303, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.53s, TFLOPs: 13.69, Samples/sec: 2.62, Time/seq 0.38s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 262/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046498775482178, loss: 0.004639122635126114 +Beginning of Epoch 263/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.12, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 263/300 ***** +Invalidate trace cache @ step 0: expected module 0, but 
got module 6 +ppl: 1.0046501159667969, loss: 0.004639311693608761 +Beginning of Epoch 264/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:38:06,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=22, lr=[4.2402043946878304e-07, 2.196997095693176e-05, 4.2402043946878304e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:38:06,356] [INFO] [timer.py:260:stop] epoch=263/micro_step=2/global_step=1580, RunningAvgSamplesPerSec=7.737465234885264, CurrSamplesPerSec=6.868348790705967, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 264/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046526193618774, loss: 0.004641720559448004 +Beginning of Epoch 265/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:38:27,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=22, lr=[3.9017198809349414e-07, 2.0216165186191403e-05, 3.9017198809349414e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:38:27,174] [INFO] [timer.py:260:stop] epoch=264/micro_step=6/global_step=1590, RunningAvgSamplesPerSec=7.738636380295201, CurrSamplesPerSec=10.313273719932171, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.48, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 265/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046472549438477, loss: 0.004636448808014393 +Beginning of Epoch 266/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 
10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 266/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046545267105103, loss: 0.004643658176064491 +Beginning of Epoch 267/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:38:49,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=22, lr=[3.5767442829177545e-07, 1.8532353797501317e-05, 3.5767442829177545e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:38:49,598] [INFO] [timer.py:260:stop] epoch=266/micro_step=4/global_step=1600, RunningAvgSamplesPerSec=7.737415844398861, CurrSamplesPerSec=7.779600454936386, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.17, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.61s, TFLOPs: 13.03, Samples/sec: 2.49, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 267/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046542882919312, loss: 0.00464339554309845 +Beginning of Epoch 268/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 268/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046545267105103, loss: 0.004643773660063744 +Beginning of Epoch 269/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.11, Samples/sec: 1.74, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:39:11,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=22, lr=[3.265376591351538e-07, 1.691904969612196e-05, 3.265376591351538e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:39:11,552] [INFO] [timer.py:260:stop] epoch=268/micro_step=2/global_step=1610, RunningAvgSamplesPerSec=7.737278337549834, CurrSamplesPerSec=6.816597512565108, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.91, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.46, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 269/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046520233154297, loss: 0.004641151987016201 +Beginning of Epoch 270/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:39:32,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=22, lr=[2.967711651851427e-07, 1.537674431011102e-05, 2.967711651851427e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:39:32,306] [INFO] [timer.py:260:stop] epoch=269/micro_step=6/global_step=1620, RunningAvgSamplesPerSec=7.7385859635122065, CurrSamplesPerSec=10.276366193352697, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 270/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046519041061401, loss: 0.004641090519726276 +Beginning of Epoch 271/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 271/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004652500152588, loss: 0.004641645587980747 +Beginning of Epoch 272/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.29s, TFLOPs: 9.14, Samples/sec: 1.75, Time/seq 0.57s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence 
Length: 512 +[2023-12-06 09:39:54,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=22, lr=[2.683840136041542e-07, 1.390590744062975e-05, 2.683840136041542e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:39:54,700] [INFO] [timer.py:260:stop] epoch=271/micro_step=4/global_step=1630, RunningAvgSamplesPerSec=7.737450495822309, CurrSamplesPerSec=7.752344947877276, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.53, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 272/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004655361175537, loss: 0.0046444544568657875 +Beginning of Epoch 273/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.84, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.16, Samples/sec: 1.94, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.29, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 273/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046513080596924, loss: 0.004640522878617048 +Beginning of Epoch 274/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:40:16,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=22, lr=[2.413848513935524e-07, 1.250698711883691e-05, 2.413848513935524e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:40:16,640] [INFO] [timer.py:260:stop] epoch=273/micro_step=2/global_step=1640, RunningAvgSamplesPerSec=7.737367549356774, CurrSamplesPerSec=6.8553750840336525, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.27, Samples/sec: 2.53, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 274/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046541690826416, loss: 0.004643340595066547 +Beginning of Epoch 275/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, 
Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:40:29,659] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:40:33,753] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576 +Model Parameters: 6.927 B, Latency: 2.03s, TFLOPs: 10.33, Samples/sec: 1.97, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:40:37,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=24, lr=[2.2079041490896434e-07, 1.1439917870930795e-05, 2.2079041490896434e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:40:37,388] [INFO] [timer.py:260:stop] epoch=274/micro_step=6/global_step=1650, RunningAvgSamplesPerSec=7.738642004882467, CurrSamplesPerSec=10.267973082185263, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.42, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 275/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046536922454834, loss: 0.004642879124730825 +Beginning of Epoch 276/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.97, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 276/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004653811454773, loss: 0.004643029533326626 +Beginning of Epoch 277/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:40:59,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=24, lr=[1.9631007284422253e-07, 1.0171506364985622e-05, 1.9631007284422253e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:40:59,854] [INFO] [timer.py:260:stop] epoch=276/micro_step=4/global_step=1660, RunningAvgSamplesPerSec=7.73736972691174, CurrSamplesPerSec=7.668539782923286, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model 
Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.03, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.29, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 277/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046555995941162, loss: 0.004644750617444515 +Beginning of Epoch 278/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.37, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 278/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046541690826416, loss: 0.004643330816179514 +Beginning of Epoch 279/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.03, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:41:21,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=24, lr=[1.7323967457041927e-07, 8.976148941472501e-06, 1.7323967457041927e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:41:21,829] [INFO] [timer.py:260:stop] epoch=278/micro_step=2/global_step=1670, RunningAvgSamplesPerSec=7.737207689992428, CurrSamplesPerSec=6.778255481227925, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.86, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.41, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 279/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046566724777222, loss: 0.004645807668566704 +Beginning of Epoch 280/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.94, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 
512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:41:42,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=24, lr=[1.5158624755435507e-07, 7.854209717842232e-06, 1.5158624755435507e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:41:42,669] [INFO] [timer.py:260:stop] epoch=279/micro_step=6/global_step=1680, RunningAvgSamplesPerSec=7.738262513821373, CurrSamplesPerSec=10.25011310292412, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 280/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046517848968506, loss: 0.004640951752662659 +Beginning of Epoch 281/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.20, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 281/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046566724777222, loss: 0.004645771812647581 +Beginning of Epoch 282/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.87, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:42:05,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=24, lr=[1.3135638763966197e-07, 6.806030447650879e-06, 1.3135638763966197e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:42:05,146] [INFO] [timer.py:260:stop] epoch=281/micro_step=4/global_step=1690, RunningAvgSamplesPerSec=7.736984181753003, CurrSamplesPerSec=7.74137933620038, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.74s, TFLOPs: 12.03, Samples/sec: 2.30, Time/seq 0.43s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 282/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004652738571167, loss: 0.004641938954591751 +Beginning of Epoch 283/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.07, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 
1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 283/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.004650354385376, loss: 0.004639545921236277 +Beginning of Epoch 284/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.06, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:42:27,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=24, lr=[1.125562570376398e-07, 5.83193041645802e-06, 1.125562570376398e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:42:27,212] [INFO] [timer.py:260:stop] epoch=283/micro_step=2/global_step=1700, RunningAvgSamplesPerSec=7.736621624021055, CurrSamplesPerSec=6.852547749107755, MemAllocated=6.45GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.96, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.12, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 284/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 6 +ppl: 1.0046550035476685, loss: 0.004644195549190044 +Beginning of Epoch 285/300, Total Micro Batches 6 +Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.99, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.82, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512 +[2023-12-06 09:42:48,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=24, lr=[9.519158245019255e-08, 4.932206344569562e-06, 9.519158245019255e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-06 09:42:48,072] [INFO] [timer.py:260:stop] epoch=284/micro_step=6/global_step=1710, RunningAvgSamplesPerSec=7.737618878085892, CurrSamplesPerSec=10.278835793010058, MemAllocated=6.27GB, MaxMemAllocated=8.42GB +Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.44, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512 +***** Evaluating perplexity, Epoch 285/300 ***** +Invalidate trace cache @ step 0: expected module 0, but got module 
6
+ppl: 1.0046550035476685, loss: 0.004644147120416164
+Beginning of Epoch 286/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.04, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.23, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 286/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046542882919312, loss: 0.004643462132662535
+Beginning of Epoch 287/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.02, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:43:10,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=24, lr=[7.926765332541349e-08, 4.107132296653549e-06, 7.926765332541349e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:43:10,518] [INFO] [timer.py:260:stop] epoch=286/micro_step=4/global_step=1720, RunningAvgSamplesPerSec=7.736439837758823, CurrSamplesPerSec=7.716933799063327, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.32, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 287/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004651665687561, loss: 0.004640870727598667
+Beginning of Epoch 288/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.42s, TFLOPs: 8.66, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.41s, TFLOPs: 8.69, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.59s, TFLOPs: 13.15, Samples/sec: 2.51, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 288/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046526193618774, loss: 0.004641825798898935
+Beginning of Epoch 289/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.92, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:43:32,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=24, lr=[6.478932024637282e-08, 3.3569595982576585e-06, 6.478932024637282e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:43:32,721] [INFO] [timer.py:260:stop] epoch=288/micro_step=2/global_step=1730, RunningAvgSamplesPerSec=7.735815221393471, CurrSamplesPerSec=6.832883433983719, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.13, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.43, Samples/sec: 2.57, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 289/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046522617340088, loss: 0.004641498439013958
+Beginning of Epoch 290/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.82, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.85, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:43:53,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=24, lr=[5.17609934535813e-08, 2.681916759252917e-06, 5.17609934535813e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:43:53,654] [INFO] [timer.py:260:stop] epoch=289/micro_step=6/global_step=1740, RunningAvgSamplesPerSec=7.736645541974179, CurrSamplesPerSec=10.10578105336518, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.21, Samples/sec: 2.52, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 290/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046566724777222, loss: 0.004645905923098326
+Beginning of Epoch 291/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.74, Samples/sec: 1.67, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.88, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.05, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.14, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.67s, TFLOPs: 12.51, Samples/sec: 2.39, Time/seq 0.42s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 291/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046545267105103, loss: 0.004643725231289864
+Beginning of Epoch 292/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.79, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:44:16,435] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
+[2023-12-06 09:44:16,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=25, lr=[4.127854387136491e-08, 2.1387846565474044e-06, 4.127854387136491e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:44:16,436] [INFO] [timer.py:260:stop] epoch=291/micro_step=4/global_step=1750, RunningAvgSamplesPerSec=7.7348891537810225, CurrSamplesPerSec=7.891136919128529, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.03s, TFLOPs: 10.32, Samples/sec: 1.97, Time/seq 0.51s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:44:20,057] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
+Model Parameters: 6.927 B, Latency: 1.54s, TFLOPs: 13.58, Samples/sec: 2.60, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 292/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046554803848267, loss: 0.004644655156880617
+Beginning of Epoch 293/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 9.00, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.36, Samples/sec: 2.55, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 293/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004651427268982, loss: 0.004640596453100443
+Beginning of Epoch 294/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:44:38,474] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=26, lr=[3.197640597808715e-08, 1.6568085999008887e-06, 3.197640597808715e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:44:38,474] [INFO] [timer.py:260:stop] epoch=293/micro_step=2/global_step=1760, RunningAvgSamplesPerSec=7.734645481650479, CurrSamplesPerSec=6.798422893172489, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.36s, TFLOPs: 8.89, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.09, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.57s, TFLOPs: 13.30, Samples/sec: 2.54, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 294/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.004655122756958, loss: 0.004644330590963364
+Beginning of Epoch 295/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.95, Samples/sec: 1.71, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.34s, TFLOPs: 8.93, Samples/sec: 1.71, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.11s, TFLOPs: 9.91, Samples/sec: 1.89, Time/seq 0.53s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:44:59,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=26, lr=[2.3027795087664535e-08, 1.193150004542204e-06, 2.3027795087664535e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:44:59,429] [INFO] [timer.py:260:stop] epoch=294/micro_step=6/global_step=1770, RunningAvgSamplesPerSec=7.735428006813978, CurrSamplesPerSec=10.133029251746697, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.24, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 295/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046569108963013, loss: 0.0046460917219519615
+Beginning of Epoch 296/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.39s, TFLOPs: 8.74, Samples/sec: 1.67, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.06s, TFLOPs: 10.15, Samples/sec: 1.94, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.52, Samples/sec: 2.58, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 296/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046532154083252, loss: 0.004642450250685215
+Beginning of Epoch 297/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.31s, TFLOPs: 9.05, Samples/sec: 1.73, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.35s, TFLOPs: 8.90, Samples/sec: 1.70, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:45:22,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=26, lr=[1.5541911448607597e-08, 8.052803859382174e-07, 1.5541911448607597e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:45:22,010] [INFO] [timer.py:260:stop] epoch=296/micro_step=4/global_step=1780, RunningAvgSamplesPerSec=7.734062355911316, CurrSamplesPerSec=7.707549644534795, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.08, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.02, Samples/sec: 1.91, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.63s, TFLOPs: 12.86, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 297/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046550035476685, loss: 0.004644164815545082
+Beginning of Epoch 298/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.38s, TFLOPs: 8.80, Samples/sec: 1.68, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.37s, TFLOPs: 8.83, Samples/sec: 1.69, Time/seq 0.59s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.56s, TFLOPs: 13.40, Samples/sec: 2.56, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 298/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046534538269043, loss: 0.00464257039129734
+Beginning of Epoch 299/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.33s, TFLOPs: 8.98, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:45:44,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=26, lr=[9.52103533358973e-09, 4.933178929321103e-07, 9.52103533358973e-09], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:45:44,221] [INFO] [timer.py:260:stop] epoch=298/micro_step=2/global_step=1790, RunningAvgSamplesPerSec=7.733454376671476, CurrSamplesPerSec=6.640435365038164, MemAllocated=6.45GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 2.41s, TFLOPs: 8.68, Samples/sec: 1.66, Time/seq 0.60s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.11, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.07s, TFLOPs: 10.10, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 1.55s, TFLOPs: 13.54, Samples/sec: 2.59, Time/seq 0.39s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 299/300 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.0046534538269043, loss: 0.004642569925636053
+Beginning of Epoch 300/300, Total Micro Batches 6
+Model Parameters: 6.927 B, Latency: 2.30s, TFLOPs: 9.09, Samples/sec: 1.74, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.32s, TFLOPs: 9.01, Samples/sec: 1.72, Time/seq 0.58s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.09s, TFLOPs: 10.03, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.06, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 2.08s, TFLOPs: 10.07, Samples/sec: 1.93, Time/seq 0.52s, Batch Size: 4, Sequence Length: 512
+[2023-12-06 09:46:05,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=26, lr=[4.967000759801885e-09, 2.5735755232134115e-07, 4.967000759801885e-09], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-06 09:46:05,070] [INFO] [timer.py:260:stop] epoch=299/micro_step=6/global_step=1800, RunningAvgSamplesPerSec=7.734447039790468, CurrSamplesPerSec=10.114468909049174, MemAllocated=6.27GB, MaxMemAllocated=8.42GB
+Model Parameters: 6.927 B, Latency: 1.58s, TFLOPs: 13.22, Samples/sec: 2.53, Time/seq 0.40s, Batch Size: 4, Sequence Length: 512
+***** Evaluating perplexity, Epoch 300/300 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.005433440208435, loss: 0.005418714135885239
+ppl: 1.0046544075012207, loss: 0.004643562249839306
 saving the final model ...
-[2023-12-06 04:42:59,262] [INFO] [launch.py:347:main] Process 2519341 exits successfully.
-[2023-12-06 04:42:59,263] [INFO] [launch.py:347:main] Process 2519342 exits successfully.
-[2023-12-06 04:42:59,263] [INFO] [launch.py:347:main] Process 2519343 exits successfully.
-[2023-12-06 04:44:22,272] [INFO] [launch.py:347:main] Process 2519340 exits successfully.
+[2023-12-06 09:46:18,848] [INFO] [launch.py:347:main] Process 2987770 exits successfully.
+[2023-12-06 09:46:18,848] [INFO] [launch.py:347:main] Process 2987768 exits successfully.
+[2023-12-06 09:46:18,849] [INFO] [launch.py:347:main] Process 2987769 exits successfully.
+[2023-12-06 09:47:41,858] [INFO] [launch.py:347:main] Process 2987767 exits successfully.