SLURM_JOB_ID = 1038255
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
NNODES = 8
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = pool0-01868
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE = 2048
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 8
DEFAULT_LEARNING_RATE: 2e-5
[This banner is printed once by each of the 8 nodes; the eight copies are identical except for NODE_RANK (0-7).]
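The batch-size settings above are mutually consistent under the usual decomposition effective batch = per-device batch × gradient-accumulation steps × world size; a quick editorial sanity check (values taken from the banner, not part of the original log):

```python
# Editorial sanity check of the printed batch-size settings (not part of the log).
nnodes, gpus_per_node = 8, 8
per_device_bs, grad_accum = 8, 4

world_size = nnodes * gpus_per_node                  # 64 ranks in total
global_bs = per_device_bs * grad_accum * world_size  # 8 * 4 * 64
assert global_bs == 2048                             # matches GLOBAL_TRAIN_BATCH_SIZE
print(world_size, global_bs)                         # -> 64 2048
```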
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[This line is emitted by each of the 64 ranks between 08:58:36 and 08:58:38; the copies differ only in timestamp.]
[2025-07-01 08:58:49,040] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2025-07-01 08:58:49,040] [INFO] [comm.py:594:init_distributed] cdb=None
[2025-07-01 08:58:49,227] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[Each of the 64 ranks prints the WARNING/INFO pair above between 08:58:49 and 08:58:51; the TorchBackend initialization line appears once.]
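These comm.py messages come from DeepSpeed's distributed initialization. A minimal sketch of the call that produces them, assuming a standard torchrun/srun launch that sets RANK, LOCAL_RANK and WORLD_SIZE (illustrative; the actual VILA entry point is not shown in this log):

```python
# Illustrative sketch only: the distributed init call behind the comm.py lines above.
# Assumes the process was launched with torchrun/srun, which sets RANK, LOCAL_RANK
# and WORLD_SIZE in the environment for every worker.
import os
import deepspeed

os.environ.setdefault("MASTER_ADDR", "pool0-01868")  # values from the banner above
os.environ.setdefault("MASTER_PORT", "25001")

# Wraps torch.distributed and logs "Initializing TorchBackend in DeepSpeed with backend nccl"
deepspeed.init_distributed(dist_backend="nccl")
```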
[2025-07-01 08:59:04,797] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
[2025-07-01 08:59:22,183] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
[2025-07-01 08:59:23,443] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
[This summary is printed by every rank; in the raw capture some copies are interleaved mid-line.]
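The summary line has the same format as the common print_trainable_parameters helper; a minimal re-implementation sketch (the helper name and example module below are illustrative, not taken from the VILA code base):

```python
# Minimal re-implementation of the "trainable params || all params" summary line.
import torch.nn as nn

def print_trainable_parameters(model: nn.Module) -> None:
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable:,} || all params: {total:,} "
          f"|| trainable%: {100 * trainable / total:.4f}")

print_trainable_parameters(nn.Linear(3584, 3584))  # tiny stand-in module
```

Note that under ZeRO-3 the partitioned parameters report a (0,) shape, as in the module dump below, so a naive p.numel() returns 0 for them; counters typically fall back to DeepSpeed's ds_numel attribute when it is present.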
[dist-0-of-64] LlavaLlamaModel(
(llm): Qwen2ForCausalLM(
(model): Qwen2Model(
(embed_tokens): Embedding(151648, 3584)
(layers): ModuleList(
(0-27): 28 x Qwen2DecoderLayer(
(self_attn): Qwen2FlashAttention2(
(q_proj): Linear(in_features=3584, out_features=3584, bias=True)
(k_proj): Linear(in_features=3584, out_features=512, bias=True)
(v_proj): Linear(in_features=3584, out_features=512, bias=True)
(o_proj): Linear(in_features=3584, out_features=3584, bias=False)
(rotary_emb): Qwen2RotaryEmbedding()
)
(mlp): Qwen2MLP(
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
(up_proj): Linear(in_features=3584, out_features=18944, bias=False)
(down_proj): Linear(in_features=18944, out_features=3584, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
)
)
(norm): Qwen2RMSNorm((0,), eps=1e-06)
(rotary_emb): Qwen2RotaryEmbedding()
)
(lm_head): Linear(in_features=3584, out_features=151648, bias=False)
)
(vision_tower): SiglipVisionTower(
(vision_tower): SiglipVisionModel(
(vision_model): SiglipVisionTransformer(
(embeddings): SiglipVisionEmbeddings(
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
(position_embedding): Embedding(1024, 1152)
)
(encoder): SiglipEncoder(
(layers): ModuleList(
(0-26): 27 x SiglipEncoderLayer(
(self_attn): SiglipFlashAttention2(
(k_proj): Linear(in_features=1152, out_features=1152, bias=True)
(v_proj): Linear(in_features=1152, out_features=1152, bias=True)
(q_proj): Linear(in_features=1152, out_features=1152, bias=True)
(out_proj): Linear(in_features=1152, out_features=1152, bias=True)
)
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear(in_features=1152, out_features=4304, bias=True)
(fc2): Linear(in_features=4304, out_features=1152, bias=True)
)
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(mm_projector): MultimodalProjector(
(layers): Sequential(
(0): DownSample3x3BlockFix()
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
(2): Linear(in_features=10368, out_features=3456, bias=True)
(3): GELU(approximate='none')
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
(5): Linear(in_features=3456, out_features=3584, bias=True)
(6): GELU(approximate='none')
(7): Linear(in_features=3584, out_features=3584, bias=True)
)
)
)
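The projector input width of 10368 equals 9 × 1152, i.e. the SigLIP hidden size times a 3×3 patch neighborhood, which suggests DownSample3x3BlockFix merges each 3×3 block of vision tokens before projecting into the 3584-dim LLM embedding space. A hypothetical sketch of such a merge (an assumption for illustration; the actual VILA implementation may differ):

```python
# Hypothetical sketch of a 3x3 spatial token merge (NOT the actual VILA code).
# It concatenates each 3x3 neighborhood of patch features, turning
# (B, H*W, 1152) vision tokens into (B, H/3*W/3, 10368) tokens, matching the
# LayerNorm((10368,)) input width printed in the module dump above.
import torch

def downsample_3x3(x: torch.Tensor, h: int, w: int) -> torch.Tensor:
    b, n, c = x.shape
    assert n == h * w and h % 3 == 0 and w % 3 == 0
    x = x.view(b, h // 3, 3, w // 3, 3, c)        # split the grid into 3x3 blocks
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()  # (b, h/3, w/3, 3, 3, c)
    return x.view(b, (h // 3) * (w // 3), 9 * c)  # concat the 9 patches per block

tokens = torch.randn(1, 27 * 27, 1152)            # e.g. a 27x27 patch grid
print(downsample_3x3(tokens, 27, 27).shape)       # torch.Size([1, 81, 10368])
```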
[dist-0-of-64] Tunable parameters:
language model True
[dist-0-of-64] vision tower True
[dist-0-of-64] mm projector True
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
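The three flags above indicate that the language model, vision tower and mm projector are all unfrozen for this stage (hence trainable% = 100.0000). A generic sketch of how such per-component freezing is typically toggled, assuming the attribute names shown in the module dump (illustrative, not the actual VILA code):

```python
# Illustrative only: per-component freezing that would produce the flags above,
# assuming the attribute names (llm, vision_tower, mm_projector) from the dump.
import torch.nn as nn

def set_tunable(model: nn.Module, llm: bool = True,
                vision_tower: bool = True, mm_projector: bool = True) -> None:
    model.llm.requires_grad_(llm)                    # language model   True
    model.vision_tower.requires_grad_(vision_tower)  # vision tower     True
    model.mm_projector.requires_grad_(mm_projector)  # mm projector     True
```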
[2025-07-01 09:01:58] Rank 0: Timer for terminate callback has been set.
Total limit: 240min
Pre terminate time: 10min elapsed_time: 189.0127944946289s
length of dataloader: 28 14336
[GPU memory] before trainer 2.292407512664795
[Each of the 64 ranks prints these three messages between 09:01:57 and 09:01:58; elapsed_time ranges from about 186.7 s to 189.2 s across ranks, and the dataloader and GPU-memory lines are identical on every rank. Rank 0's copy is shown above.]
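The "Timer for terminate callback" messages describe a wall-clock guard: training is set to stop roughly 10 minutes before the 240-minute job limit so the final checkpoint can still be written. A minimal sketch of such a callback using the transformers TrainerCallback API (assumed behavior; the actual VILA callback is not shown in this log):

```python
# Sketch of a wall-clock guard consistent with the timer messages above
# (assumed behavior: stop ~10 min before the 240 min limit). Not the VILA code.
import time
from transformers import TrainerCallback

class TimeLimitCallback(TrainerCallback):
    def __init__(self, total_limit_min: int = 240, pre_terminate_min: int = 10):
        self.deadline = time.time() + (total_limit_min - pre_terminate_min) * 60

    def on_step_end(self, args, state, control, **kwargs):
        if time.time() >= self.deadline:
            control.should_training_stop = True  # request a graceful stop + save
        return control
```

For reference, the repeated "length of dataloader: 28 14336" line is consistent with 14,336 samples consumed in 28 iterations of 512 samples each (8 per device × 64 ranks), i.e. 7 optimizer steps per pass at gradient accumulation 4.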
Parameter Offload: Total persistent parameters: 771184 in 421 params
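The "Parameter Offload: Total persistent parameters" line is emitted by DeepSpeed ZeRO stage 3: parameters with fewer elements than stage3_param_persistence_threshold are kept whole on every rank instead of being partitioned, and here 421 such small parameters (771,184 elements in total) stay persistent. A representative ZeRO-3 configuration fragment (illustrative values; the exact config for this run is not shown in the log):

```python
# Representative DeepSpeed ZeRO-3 config fragment (illustrative values only).
ds_config = {
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 4,
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,
        "stage3_param_persistence_threshold": 1e4,  # small params stay resident per GPU
        "stage3_prefetch_bucket_size": 5e8,
        "stage3_max_live_parameters": 1e9,
    },
}
```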