SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 6
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 4
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 3
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 1
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 7
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
SLURM_JOB_ID = 1038286
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-02124
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
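
For reference, the launch parameters dumped above are mutually consistent: the global batch size equals the per-device batch times gradient accumulation times the total number of data-parallel ranks. A minimal Python check using the values from the dump (the variable names mirror the log; this snippet is not part of the training script):

# Sanity check on the reported batch configuration.
NNODES = 8
GPUS_PER_NODE = 8
PER_DEVICE_TRAIN_BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 4

world_size = NNODES * GPUS_PER_NODE                      # 64 ranks in total
global_batch = PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * world_size
assert global_batch == 2048                              # matches GLOBAL_TRAIN_BATCH_SIZE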
[2025-07-01 09:10:30,122] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,706] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,713] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,744] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,752] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,755] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,762] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,784] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,850] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,857] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,884] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,886] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:33,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:33,394] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:33,920] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:33,920] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:33,962] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:33,962] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,066] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,067] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,083] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,083] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,164] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,164] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,165] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,165] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,175] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,175] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,205] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,205] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,206] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,214] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,214] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,217] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:34,296] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:34,296] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-01 09:10:46,125] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,125] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,149] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,150] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,151] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,152] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,160] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,160] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,370] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,370] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,376] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,376] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,382] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,386] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,386] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,387] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,387] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,396] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,397] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,409] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,416] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,416] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,422] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,422] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,428] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,493] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,493] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,502] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,515] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,533] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,533] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,534] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,534] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,551] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,551] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,554] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,554] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,555] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,555] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,787] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,819] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,820] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,821] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,828] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,854] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,862] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,866] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,866] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:46,868] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:46,869] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,303] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,325] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,325] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,331] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,468] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,478] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,480] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,480] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,480] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
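
The WARNING/INFO pairs above are printed when DeepSpeed initializes its communication backend and falls back to torch.distributed's NCCL implementation (hence the later "Initializing TorchBackend in DeepSpeed with backend nccl" line). A minimal sketch of the call that produces these messages, assuming the standard DeepSpeed API; in this run the call is made inside the training framework, not by hand:

# Illustrative only: distributed init as typically triggered for a DeepSpeed run.
import deepspeed

deepspeed.init_distributed(dist_backend="nccl")  # falls back to torch.distributed's NCCL backend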
[2025-07-01 09:11:04,481] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
[2025-07-01 09:11:20,374] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
[2025-07-01 09:11:21,706] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
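
The "finished initializing model with N parameters" lines come from a ZeRO-3 parameter-partitioning context exiting; the growing totals presumably reflect the LLM, vision tower, and multimodal projector being constructed in turn. A hedged sketch of the pattern (build_model is a hypothetical stand-in for the NVILA constructor; the HF Trainer normally enters this context automatically when ZeRO-3 is configured):

# Sketch, assuming ZeRO stage 3: parameters are partitioned across ranks as each
# submodule is built, so the full model never materializes on a single GPU.
import deepspeed

with deepspeed.zero.Init():
    model = build_model()  # hypothetical constructor; LLM, vision tower and projector are built here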
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
[dist-0-of-64] LlavaLlamaModel(
(llm): Qwen2ForCausalLM(
(model): Qwen2Model(
(embed_tokens): Embedding(151648, 3584)
(layers): ModuleList(
(0-27): 28 x Qwen2DecoderLayer(
(self_attn): Qwen2FlashAttention2(
(q_proj): Linear(in_features=3584, out_features=3584, bias=True)
(k_proj): Linear(in_features=3584, out_features=512, bias=True)
(v_proj): Linear(in_features=3584, out_features=512, bias=True)
(o_proj): Linear(in_features=3584, out_features=3584, bias=False)
(rotary_emb): Qwen2RotaryEmbedding()
)
(mlp): Qwen2MLP(
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
(up_proj): Linear(in_features=3584, out_features=18944, bias=False)
(down_proj): Linear(in_features=18944, out_features=3584, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
)
)
(norm): Qwen2RMSNorm((0,), eps=1e-06)
(rotary_emb): Qwen2RotaryEmbedding()
)
(lm_head): Linear(in_features=3584, out_features=151648, bias=False)
)
(vision_tower): SiglipVisionTower(
(vision_tower): SiglipVisionModel(
(vision_model): SiglipVisionTransformer(
(embeddings): SiglipVisionEmbeddings(
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
(position_embedding): Embedding(1024, 1152)
)
(encoder): SiglipEncoder(
(layers): ModuleList(
(0-26): 27 x SiglipEncoderLayer(
(self_attn): SiglipFlashAttention2(
(k_proj): Linear(in_features=1152, out_features=1152, bias=True)
(v_proj): Linear(in_features=1152, out_features=1152, bias=True)
(q_proj): Linear(in_features=1152, out_features=1152, bias=True)
(out_proj): Linear(in_features=1152, out_features=1152, bias=True)
)
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear(in_features=1152, out_features=4304, bias=True)
(fc2): Linear(in_features=4304, out_features=1152, bias=True)
)
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(mm_projector): MultimodalProjector(
(layers): Sequential(
(0): DownSample3x3BlockFix()
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
(2): Linear(in_features=10368, out_features=3456, bias=True)
(3): GELU(approximate='none')
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
(5): Linear(in_features=3456, out_features=3584, bias=True)
(6): GELU(approximate='none')
(7): Linear(in_features=3584, out_features=3584, bias=True)
)
)
)
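
A few shape relations implicit in the printout above, spelled out as a hedged reading (head_dim = 128 is the usual Qwen2 convention and is not stated in the log):

# Attention shapes: q_proj 3584 -> 3584, k_proj/v_proj 3584 -> 512.
hidden_size = 3584
head_dim = 128
num_q_heads = 3584 // head_dim        # 28 query heads
num_kv_heads = 512 // head_dim        # 4 key/value heads, i.e. grouped-query attention
assert num_q_heads // num_kv_heads == 7   # 7 query heads share each KV head

# Multimodal projector: the 3x3 spatial downsample presumably concatenates 9
# neighbouring SigLIP tokens (1152-dim each) before projecting to the LLM width.
assert 9 * 1152 == 10368              # input width of the first projector Linear
assert hidden_size == 3584            # output width of the last projector Linear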
[dist-0-of-64] Tunable parameters:
[dist-0-of-64] language model True
[dist-0-of-64] vision tower True
[dist-0-of-64] mm projector True
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
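
The repeated "trainable params" lines are the standard parameter-count summary printed once per rank. A generic sketch of how such a line is usually produced (the exact helper in the training code may differ):

# Count and report trainable vs. total parameters of a torch.nn.Module.
def count_parameters(model):
    # Note: under ZeRO-3 partitioning the local p.numel() can be 0; the true
    # size then lives in p.ds_numel, which a real helper would fall back to.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable:,} || all params: {total:,} "
          f"|| trainable%: {100 * trainable / total:.4f}")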
[2025-07-01 09:13:55] Rank 15: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.35445713996887s
[2025-07-01 09:13:55] Rank 55: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 201.59668064117432s
[2025-07-01 09:13:55] Rank 43: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.26995539665222s
[2025-07-01 09:13:55] Rank 19: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.44619512557983s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:55] Rank 27: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.14272332191467s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:55] Rank 57: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 201.8264696598053s
[2025-07-01 09:13:55] Rank 7: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 187.70187664031982s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:55] Rank 8: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.69908165931702s
[2025-07-01 09:13:55] Rank 21: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.7061002254486s
[2025-07-01 09:13:56] Rank 46: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.5839924812317s
[2025-07-01 09:13:56] Rank 24: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.28924942016602s
[2025-07-01 09:13:56] Rank 35: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.95243191719055s
[2025-07-01 09:13:56] Rank 1: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 187.78522086143494s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 50: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.2034502029419s
[2025-07-01 09:13:56] Rank 31: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.37554931640625s
[2025-07-01 09:13:56] Rank 56: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.09286189079285s
[2025-07-01 09:13:56] Rank 3: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 187.87985610961914s
[2025-07-01 09:13:56] Rank 60: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.09541821479797s
[2025-07-01 09:13:56] Rank 9: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.82429432868958s
[2025-07-01 09:13:56] Rank 38: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.08812403678894s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 20: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.87151789665222s
[2025-07-01 09:13:56] Rank 18: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.87728786468506s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 17: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.8816032409668s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 5: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 187.94594931602478s
[2025-07-01 09:13:56] Rank 4: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 187.95590615272522s
[2025-07-01 09:13:56] Rank 41: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.7688856124878s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 36: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.14091515541077s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 62: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.1776213645935s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 22: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.92305088043213s
[2025-07-01 09:13:56] Rank 12: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.92479276657104s
[2025-07-01 09:13:56] Rank 26: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.50398349761963s
[2025-07-01 09:13:56] Rank 30: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.50814604759216s
[2025-07-01 09:13:56] Rank 33: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.1808216571808s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 59: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.2262580394745s
[2025-07-01 09:13:56] Rank 16: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.95579552650452s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 53: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.1831030845642s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 13: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.99509859085083s
[2025-07-01 09:13:56] Rank 23: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.99686932563782s
[2025-07-01 09:13:56] Rank 45: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.87132096290588s
[2025-07-01 09:13:56] Rank 6: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 188.06090354919434s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 61: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.2874138355255s
[2025-07-01 09:13:56] Rank 54: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.3776957988739s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 14: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.0164692401886s
[2025-07-01 09:13:56] Rank 42: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.89093255996704s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 47: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.8917055130005s
[2025-07-01 09:13:56] Rank 25: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.60104870796204s
[2025-07-01 09:13:56] Rank 58: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.30275464057922s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 28: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.60884761810303s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 0: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 188.10366201400757s
[2025-07-01 09:13:56] Rank 29: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.62677335739136s
[2025-07-01 09:13:56] Rank 49: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.26963424682617s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 10: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.0586109161377s
[2025-07-01 09:13:56] Rank 37: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.2976393699646s
[2025-07-01 09:13:56] Rank 11: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.06773209571838s
[2025-07-01 09:13:56] Rank 39: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.31072449684143s
[2025-07-01 09:13:56] Rank 34: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.3135223388672s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 40: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.95435571670532s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 44: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.97284388542175s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 51: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.32772946357727s
[2025-07-01 09:13:56] Rank 2: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 188.1806402206421s
[2025-07-01 09:13:56] Rank 48: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.36938166618347s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 63: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 202.3610863685608s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 32: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 190.39307260513306s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[2025-07-01 09:13:56] Rank 52: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 203.1189968585968s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
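
The "Timer for terminate callback" lines indicate that every rank arms a watchdog which requests a clean stop 10 minutes before the 240-minute SLURM wall-clock limit; the "[GPU memory] before trainer" lines are presumably torch.cuda.memory_allocated() converted to GiB. A hedged illustration of the timer pattern using the Hugging Face TrainerCallback API (the class name and wiring are assumptions, not the actual VILA implementation):

# Stop training before the scheduler kills the job, so a checkpoint can be saved.
import time
from transformers import TrainerCallback

class TerminateTimerCallback(TrainerCallback):  # hypothetical name
    def __init__(self, total_limit_min=240, pre_terminate_min=10):
        # "Timer ... has been set": the deadline is fixed when the callback is created.
        self.deadline = time.time() + (total_limit_min - pre_terminate_min) * 60

    def on_step_end(self, args, state, control, **kwargs):
        if time.time() >= self.deadline:
            control.should_training_stop = True  # ask the Trainer to exit cleanly
        return control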
Parameter Offload: Total persistent parameters: 771184 in 421 params
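
"Persistent parameters" are the small tensors that ZeRO-3 keeps resident on every GPU instead of partitioning and re-gathering them; the cutoff is governed by stage3_param_persistence_threshold in the DeepSpeed config. A hedged sketch of the relevant setting (the threshold value is illustrative, not necessarily the one used for this run):

# Fragment of a ZeRO-3 DeepSpeed config controlling which parameters stay resident.
ds_config_zero3 = {
    "zero_optimization": {
        "stage": 3,
        "stage3_param_persistence_threshold": 1e4,  # params smaller than this stay on every GPU ("persistent")
    }
}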