Ligeng-Zhu's picture
Upload files with `vila-upload`.
342f304 verified
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 7
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 6
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 1
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 4
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038294
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
NODE_RANK = 3
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01504
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
[2025-07-01 09:15:45,395] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:45,549] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,175] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,197] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,284] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,324] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,329] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:46,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:48,763] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:48,763] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:48,885] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:48,885] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:49,514] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:49,514] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:49,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:49,515] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:49,535] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:49,535] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:50,031] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:50,031] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:50,031] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-07-01 09:15:50,095] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:50,095] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:50,099] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:15:50,099] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,861] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:50,862] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,023] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,024] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,071] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:16:01,457] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,457] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,466] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,466] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,467] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,467] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,468] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,471] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,471] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,475] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,475] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,476] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,476] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,482] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,482] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,486] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,486] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,490] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,490] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,501] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,501] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,502] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,505] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,505] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,510] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,510] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,512] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,512] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,516] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,516] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,523] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,523] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,568] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,569] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,570] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,570] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,577] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,577] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,578] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,579] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,580] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,581] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,591] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,591] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,739] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,739] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,779] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,779] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,782] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,787] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,791] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,792] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,792] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,793] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,794] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,796] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,796] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,890] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,890] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,891] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,896] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,896] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,897] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,897] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,898] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,898] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,901] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,901] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,906] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,906] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,907] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,907] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,929] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,929] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,930] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,930] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,932] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,936] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,936] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,937] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,937] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,939] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,939] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,946] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:01,947] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,276] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,282] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,284] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,284] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,324] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,324] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,378] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,378] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,386] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,392] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:04,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:16:04,395] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:16:20,231] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
[2025-07-01 09:16:29,303] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
[2025-07-01 09:16:29,934] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
[dist-0-of-64] LlavaLlamaModel(
(llm): Qwen2ForCausalLM(
(model): Qwen2Model(
(embed_tokens): Embedding(151648, 3584)
(layers): ModuleList(
(0-27): 28 x Qwen2DecoderLayer(
(self_attn): Qwen2FlashAttention2(
(q_proj): Linear(in_features=3584, out_features=3584, bias=True)
(k_proj): Linear(in_features=3584, out_features=512, bias=True)
(v_proj): Linear(in_features=3584, out_features=512, bias=True)
(o_proj): Linear(in_features=3584, out_features=3584, bias=False)
(rotary_emb): Qwen2RotaryEmbedding()
)
(mlp): Qwen2MLP(
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
(up_proj): Linear(in_features=3584, out_features=18944, bias=False)
(down_proj): Linear(in_features=18944, out_features=3584, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
)
)
(norm): Qwen2RMSNorm((0,), eps=1e-06)
(rotary_emb): Qwen2RotaryEmbedding()
)
(lm_head): Linear(in_features=3584, out_features=151648, bias=False)
)
(vision_tower): SiglipVisionTower(
(vision_tower): SiglipVisionModel(
(vision_model): SiglipVisionTransformer(
(embeddings): SiglipVisionEmbeddings(
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
(position_embedding): Embedding(1024, 1152)
)
(encoder): SiglipEncoder(
(layers): ModuleList(
(0-26): 27 x SiglipEncoderLayer(
(self_attn): SiglipFlashAttention2(
(k_proj): Linear(in_features=1152, out_features=1152, bias=True)
(v_proj): Linear(in_features=1152, out_features=1152, bias=True)
(q_proj): Linear(in_features=1152, out_features=1152, bias=True)
(out_proj): Linear(in_features=1152, out_features=1152, bias=True)
)
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear(in_features=1152, out_features=4304, bias=True)
(fc2): Linear(in_features=4304, out_features=1152, bias=True)
)
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(mm_projector): MultimodalProjector(
(layers): Sequential(
(0): DownSample3x3BlockFix()
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
(2): Linear(in_features=10368, out_features=3456, bias=True)
(3): GELU(approximate='none')
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
(5): Linear(in_features=3456, out_features=3584, bias=True)
(6): GELU(approximate='none')
(7): Linear(in_features=3584, out_features=3584, bias=True)
)
)
)
[dist-0-of-64] Tunable parameters:
language model True
[dist-0-of-64] vision tower True
[dist-0-of-64] mm projector True
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
[2025-07-01 09:19:03] Rank 31: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.53949546813965s
[2025-07-01 09:19:03] Rank 41: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.08894276618958s
[2025-07-01 09:19:03] Rank 18: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.17969298362732s
[2025-07-01 09:19:03] Rank 58: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.5725381374359s
[2025-07-01 09:19:04] Rank 34: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.6094913482666s
[2025-07-01 09:19:04] Rank 55: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 179.82354164123535s
[2025-07-01 09:19:04] Rank 36: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.67147135734558s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 20: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.38758826255798s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 47: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.3753411769867s
[2025-07-01 09:19:04] Rank 62: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.7982451915741s
[2025-07-01 09:19:04] Rank 11: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.58949255943298s
[2025-07-01 09:19:04] Rank 51: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.0623378753662s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 13: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.6344199180603s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 8: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.6651632785797s
[2025-07-01 09:19:04] Rank 27: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.92465996742249s
[2025-07-01 09:19:04] Rank 9: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.67580819129944s
[2025-07-01 09:19:04] Rank 37: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.93791437149048s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 15: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.69433569908142s
[2025-07-01 09:19:04] Rank 23: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.5678791999817s
[2025-07-01 09:19:04] Rank 48: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.17630982398987s
[2025-07-01 09:19:04] Rank 54: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.1825351715088s
[2025-07-01 09:19:04] Rank 53: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.1913890838623s
[2025-07-01 09:19:04] Rank 21: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.60063314437866s
[2025-07-01 09:19:04] Rank 57: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.97364211082458s
[2025-07-01 09:19:04] Rank 59: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.97400450706482s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 40: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.56714510917664s
[2025-07-01 09:19:04] Rank 52: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.22575664520264s
[2025-07-01 09:19:04] Rank 29: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.02315592765808s
[2025-07-01 09:19:04] Rank 24: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.02467370033264s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 12: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.78038883209229s
[2025-07-01 09:19:04] Rank 10: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.78413200378418s
[2025-07-01 09:19:04] Rank 32: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.04120206832886s
[2025-07-01 09:19:04] Rank 25: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.0450234413147s
[2025-07-01 09:19:04] Rank 43: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.59447646141052s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 44: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.60314464569092s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 56: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.03522372245789s
[2025-07-01 09:19:04] Rank 63: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.0367488861084s
[2025-07-01 09:19:04] Rank 35: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.07129096984863s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
length of dataloader: 28 14336
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795[GPU memory] before trainer
2.292407512664795
[GPU memory] before trainer length of dataloader:2.292407512664795
28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 49: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.28140139579773s
[2025-07-01 09:19:04] Rank 39: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.07610249519348s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 28: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.08507561683655s
[2025-07-01 09:19:04] Rank 6: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 195.64789366722107s
length of dataloader: 28 14336
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 1: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 194.98654437065125s
[2025-07-01 09:19:04] Rank 46: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.64663672447205s
[2025-07-01 09:19:04] Rank 5: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 195.77389311790466s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 2: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 194.8103952407837s
length of dataloader: 28 14336
[2025-07-01 09:19:04] Rank 0: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 194.84082126617432s
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 33: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.11176013946533s
[2025-07-01 09:19:04] Rank 4: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 194.82239317893982s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 3: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 195.04775762557983s
[2025-07-01 09:19:04] Rank 7: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 195.0560109615326s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 38: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.1278281211853s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 30: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.13366270065308s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 26: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.14656853675842s
length of dataloader: 28 14336
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 22: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.76397037506104s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 19: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.77731561660767s
[2025-07-01 09:19:04] Rank 45: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.71926474571228s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 17: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.7852520942688s
[2025-07-01 09:19:04] Rank 61: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.14857816696167s
[2025-07-01 09:19:04] Rank 14: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.92455291748047s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 16: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.81509160995483s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 50: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 180.4306833744049s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 42: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 182.78269171714783s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:19:04] Rank 60: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 183.22880291938782s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
Parameter Offload: Total persistent parameters: 771184 in 421 params