|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 5 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 6 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 4 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 3 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 2 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 1 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 7 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038286 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015 |
|
NODE_RANK = 0 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-02124 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
[2025-07-01 09:10:30,122] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,706] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,713] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,744] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,752] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,755] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,762] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,784] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,850] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,857] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,884] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,886] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:33,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:33,394] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:33,920] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:33,920] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:33,962] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:33,962] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,066] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,067] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,083] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,083] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,164] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,164] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,165] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,165] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,175] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,175] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,205] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,205] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,206] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,214] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,214] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,217] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:34,296] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:34,296] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 09:10:46,125] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,125] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,149] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,150] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,151] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,152] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,160] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,160] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,370] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,370] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,376] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,376] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,382] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,386] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,386] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,387] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,387] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,396] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,397] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,409] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,416] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,416] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,422] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,422] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,428] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,493] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,493] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,502] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,515] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,533] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,533] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,534] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,534] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,551] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,551] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,554] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,554] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,555] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,555] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,787] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,819] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,820] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,821] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,828] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,854] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,862] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,866] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,866] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:46,868] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:46,869] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,303] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,325] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,325] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,331] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,468] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,478] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,480] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,480] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,480] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 09:11:04,481] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters |
|
[2025-07-01 09:11:20,374] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters |
|
[2025-07-01 09:11:21,706] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[dist-0-of-64] LlavaLlamaModel( |
|
(llm): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151648, 3584) |
|
(layers): ModuleList( |
|
(0-27): 28 x Qwen2DecoderLayer( |
|
(self_attn): Qwen2FlashAttention2( |
|
(q_proj): Linear(in_features=3584, out_features=3584, bias=True) |
|
(k_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(v_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(o_proj): Linear(in_features=3584, out_features=3584, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(up_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(down_proj): Linear(in_features=18944, out_features=3584, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
) |
|
) |
|
(norm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(lm_head): Linear(in_features=3584, out_features=151648, bias=False) |
|
) |
|
(vision_tower): SiglipVisionTower( |
|
(vision_tower): SiglipVisionModel( |
|
(vision_model): SiglipVisionTransformer( |
|
(embeddings): SiglipVisionEmbeddings( |
|
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) |
|
(position_embedding): Embedding(1024, 1152) |
|
) |
|
(encoder): SiglipEncoder( |
|
(layers): ModuleList( |
|
(0-26): 27 x SiglipEncoderLayer( |
|
(self_attn): SiglipFlashAttention2( |
|
(k_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(v_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(q_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(out_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
) |
|
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
(mlp): SiglipMLP( |
|
(activation_fn): PytorchGELUTanh() |
|
(fc1): Linear(in_features=1152, out_features=4304, bias=True) |
|
(fc2): Linear(in_features=4304, out_features=1152, bias=True) |
|
) |
|
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(mm_projector): MultimodalProjector( |
|
(layers): Sequential( |
|
(0): DownSample3x3BlockFix() |
|
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True) |
|
(2): Linear(in_features=10368, out_features=3456, bias=True) |
|
(3): GELU(approximate='none') |
|
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True) |
|
(5): Linear(in_features=3456, out_features=3584, bias=True) |
|
(6): GELU(approximate='none') |
|
(7): Linear(in_features=3584, out_features=3584, bias=True) |
|
) |
|
) |
|
) |
|
[dist-0-of-64] Tunable parameters: |
|
language model True |
|
[dist-0-of-64] vision tower True |
|
[dist-0-of-64] mm projector True |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[2025-07-01 09:13:55] Rank 15: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.35445713996887s |
|
[2025-07-01 09:13:55] Rank 55: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 201.59668064117432s |
|
[2025-07-01 09:13:55] Rank 43: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.26995539665222s |
|
[2025-07-01 09:13:55] Rank 19: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.44619512557983s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:55] Rank 27: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.14272332191467s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:55] Rank 57: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 201.8264696598053s |
|
[2025-07-01 09:13:55] Rank 7: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.70187664031982s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:55] Rank 8: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.69908165931702s |
|
[2025-07-01 09:13:55] Rank 21: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.7061002254486s |
|
[2025-07-01 09:13:56] Rank 46: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.5839924812317s |
|
[2025-07-01 09:13:56] Rank 24: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.28924942016602s |
|
[2025-07-01 09:13:56] Rank 35: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.95243191719055s |
|
[2025-07-01 09:13:56] Rank 1: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.78522086143494s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 50: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.2034502029419s |
|
[2025-07-01 09:13:56] Rank 31: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.37554931640625s |
|
[2025-07-01 09:13:56] Rank 56: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.09286189079285s |
|
[2025-07-01 09:13:56] Rank 3: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.87985610961914s |
|
[2025-07-01 09:13:56] Rank 60: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.09541821479797s |
|
[2025-07-01 09:13:56] Rank 9: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.82429432868958s |
|
[2025-07-01 09:13:56] Rank 38: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.08812403678894s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 20: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.87151789665222s |
|
[2025-07-01 09:13:56] Rank 18: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.87728786468506s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 17: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.8816032409668s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 5: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.94594931602478s |
|
[2025-07-01 09:13:56] Rank 4: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.95590615272522s |
|
[2025-07-01 09:13:56] Rank 41: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.7688856124878s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 36: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.14091515541077s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 62: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.1776213645935s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 22: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.92305088043213s |
|
[2025-07-01 09:13:56] Rank 12: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.92479276657104s |
|
[2025-07-01 09:13:56] Rank 26: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.50398349761963s |
|
[2025-07-01 09:13:56] Rank 30: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.50814604759216s |
|
[2025-07-01 09:13:56] Rank 33: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.1808216571808s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 59: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.2262580394745s |
|
[2025-07-01 09:13:56] Rank 16: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.95579552650452s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 53: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.1831030845642s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 13: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.99509859085083s |
|
[2025-07-01 09:13:56] Rank 23: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.99686932563782s |
|
[2025-07-01 09:13:56] Rank 45: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.87132096290588s |
|
[2025-07-01 09:13:56] Rank 6: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.06090354919434s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[2025-07-01 09:13:56] Rank 61: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.2874138355255s |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 54: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.3776957988739s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 14: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.0164692401886s |
|
[2025-07-01 09:13:56] Rank 42: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.89093255996704s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 47: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.8917055130005s |
|
[2025-07-01 09:13:56] Rank 25: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.60104870796204s |
|
[2025-07-01 09:13:56] Rank 58: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.30275464057922s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 28: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.60884761810303s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 0: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.10366201400757s |
|
[2025-07-01 09:13:56] Rank 29: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.62677335739136s |
|
[2025-07-01 09:13:56] Rank 49: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.26963424682617s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 10: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.0586109161377s |
|
[2025-07-01 09:13:56] Rank 37: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.2976393699646s |
|
[2025-07-01 09:13:56] Rank 11: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.06773209571838s |
|
[2025-07-01 09:13:56] Rank 39: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.31072449684143s |
|
[2025-07-01 09:13:56] Rank 34: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.3135223388672s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 40: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.95435571670532s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 44: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.97284388542175s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 51: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.32772946357727s |
|
[2025-07-01 09:13:56] Rank 2: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.1806402206421s |
|
[2025-07-01 09:13:56] Rank 48: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.36938166618347s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 63: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 202.3610863685608s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 32: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 190.39307260513306s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:13:56] Rank 52: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 203.1189968585968s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
Parameter Offload: Total persistent parameters: 771184 in 421 params |
|
|