|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 4 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 3 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 0 |
|
GPUS_PER_NODE = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 6 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 5 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 1 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 2 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038254 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 7 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
[2025-07-01 08:54:00,867] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:00,974] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,467] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,572] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,576] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,769] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,775] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,779] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,783] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,880] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,960] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:01,962] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,013] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,035] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,048] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,099] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,101] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,102] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,114] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,145] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,187] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,188] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,189] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,193] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,194] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,265] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,656] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,664] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,731] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,776] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,782] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,785] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,788] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,791] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,835] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,839] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,841] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:02,851] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,245] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:03,254] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:54:04,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,150] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,274] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,274] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,839] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,839] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,847] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,847] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,895] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,895] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,913] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,913] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:04,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:04,932] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,229] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,229] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2025-07-01 08:54:05,261] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,261] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,276] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,281] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,286] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,287] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,287] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,292] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,293] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,295] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,295] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,297] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,297] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,340] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,340] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,392] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,393] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,395] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,429] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,462] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,462] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,478] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,539] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,575] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,575] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,685] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,698] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,801] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,801] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,816] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,817] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,843] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,843] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,862] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,965] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,965] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,994] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,995] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:05,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:05,997] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,071] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,071] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,124] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,124] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,126] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,126] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,131] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,131] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,149] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,173] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,186] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,186] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,217] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,236] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,236] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,269] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,269] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,278] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,278] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,285] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,285] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,289] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,289] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,291] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,291] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,299] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,299] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,302] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,302] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,313] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,313] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,539] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,666] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,666] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,687] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,687] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,691] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,711] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,711] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,729] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,730] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,735] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,735] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:06,776] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:54:06,776] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:54:23,138] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters |
|
[2025-07-01 08:54:31,001] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters |
|
[2025-07-01 08:54:31,607] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[dist-0-of-64] LlavaLlamaModel( |
|
(llm): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151648, 3584) |
|
(layers): ModuleList( |
|
(0-27): 28 x Qwen2DecoderLayer( |
|
(self_attn): Qwen2FlashAttention2( |
|
(q_proj): Linear(in_features=3584, out_features=3584, bias=True) |
|
(k_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(v_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(o_proj): Linear(in_features=3584, out_features=3584, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(up_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(down_proj): Linear(in_features=18944, out_features=3584, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
) |
|
) |
|
(norm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(lm_head): Linear(in_features=3584, out_features=151648, bias=False) |
|
) |
|
(vision_tower): SiglipVisionTower( |
|
(vision_tower): SiglipVisionModel( |
|
(vision_model): SiglipVisionTransformer( |
|
(embeddings): SiglipVisionEmbeddings( |
|
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) |
|
(position_embedding): Embedding(1024, 1152) |
|
) |
|
(encoder): SiglipEncoder( |
|
(layers): ModuleList( |
|
(0-26): 27 x SiglipEncoderLayer( |
|
(self_attn): SiglipFlashAttention2( |
|
(k_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(v_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(q_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(out_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
) |
|
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
(mlp): SiglipMLP( |
|
(activation_fn): PytorchGELUTanh() |
|
(fc1): Linear(in_features=1152, out_features=4304, bias=True) |
|
(fc2): Linear(in_features=4304, out_features=1152, bias=True) |
|
) |
|
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(mm_projector): MultimodalProjector( |
|
(layers): Sequential( |
|
(0): DownSample3x3BlockFix() |
|
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True) |
|
(2): Linear(in_features=10368, out_features=3456, bias=True) |
|
(3): GELU(approximate='none') |
|
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True) |
|
(5): Linear(in_features=3456, out_features=3584, bias=True) |
|
(6): GELU(approximate='none') |
|
(7): Linear(in_features=3584, out_features=3584, bias=True) |
|
) |
|
) |
|
) |
|
[dist-0-of-64] Tunable parameters: |
|
language model True |
|
[dist-0-of-64] vision tower True |
|
[dist-0-of-64] mm projector True |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[2025-07-01 08:57:05] Rank 62: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.40018558502197s |
|
[2025-07-01 08:57:05] Rank 18: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.6438548564911s |
|
[2025-07-01 08:57:05] Rank 1: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.9796986579895s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 40: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.92801594734192s |
|
[2025-07-01 08:57:05] Rank 13: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.29648852348328s |
|
[2025-07-01 08:57:05] Rank 50: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.9351155757904s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 8: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.53496980667114s |
|
[2025-07-01 08:57:05] Rank 34: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.31973552703857s |
|
[2025-07-01 08:57:05] Rank 6: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.11812615394592s |
|
[2025-07-01 08:57:05] Rank 37: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.35892605781555s |
|
[2025-07-01 08:57:05] Rank 30: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.2960913181305s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 21: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.88449788093567s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 14: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.5244917869568s |
|
[2025-07-01 08:57:05] Rank 51: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.05884408950806s |
|
[2025-07-01 08:57:05] Rank 24: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.45844531059265s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 59: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.76254534721375s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 46: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.31962299346924s |
|
[2025-07-01 08:57:05] Rank 27: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.5118260383606s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 31: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.44188499450684s |
|
[2025-07-01 08:57:05] Rank 42: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.3304316997528s |
|
[2025-07-01 08:57:05] Rank 12: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.74464964866638s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 44: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.2750883102417s |
|
[2025-07-01 08:57:05] Rank 22: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.0400447845459s |
|
[2025-07-01 08:57:05] Rank 41: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.36503434181213s |
|
[2025-07-01 08:57:05] Rank 29: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.6715350151062s |
|
[2025-07-01 08:57:05] Rank 9: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.794335603714s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 47: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.25037503242493s |
|
[2025-07-01 08:57:05] Rank 53: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.32664608955383s |
|
[2025-07-01 08:57:05] Rank 26: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.59264469146729s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 0: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.4588851928711s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 25: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.56515669822693s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 10: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.8198959827423s |
|
[2025-07-01 08:57:05] Rank 33: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.61562252044678s |
|
[2025-07-01 08:57:05] Rank 55: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.15942478179932s |
|
length of dataloader: 28 14336 |
|
[2025-07-01 08:57:05] Rank 39: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
[GPU memory] before trainer 2.292407512664795 |
|
Pre terminate time: 10min elapsed_time: 179.74457502365112s |
|
[2025-07-01 08:57:05] Rank 15: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.5756447315216s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 49: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.26583528518677s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 11: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.88644003868103s |
|
[2025-07-01 08:57:05] Rank 60: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.95006847381592s |
|
[2025-07-01 08:57:05] Rank 61: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.19447827339172s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 45: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3910529613495s |
|
[2025-07-01 08:57:05] Rank 58: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.2317771911621s |
|
[2025-07-01 08:57:05] Rank 32: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3425772190094s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 36: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.71157217025757s |
|
[2025-07-01 08:57:05] Rank 28: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.64928483963013s |
|
[2025-07-01 08:57:05] Rank 20: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.196674823761s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 57: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.0103840827942s |
|
[2025-07-01 08:57:05] Rank 2: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.53355860710144s |
|
[2025-07-01 08:57:05] Rank 19: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.20799660682678s |
|
length of dataloader: 28 14336 |
|
length of dataloader: [GPU memory] before trainer28 14336 |
|
2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 43: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.38389587402344s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 3: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.54649567604065s |
|
[2025-07-01 08:57:05] Rank 52: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3105661869049s |
|
[2025-07-01 08:57:05] Rank 5: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.55701327323914s |
|
[2025-07-01 08:57:05] Rank 54: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3739037513733s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 17: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.306396484375s |
|
[2025-07-01 08:57:05] Rank 16: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.22698402404785s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 4: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.58894228935242s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 38: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.79484272003174s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 23: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.23841524124146s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 35: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.92344903945923s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 56: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.1525583267212s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 48: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.58931589126587s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:57:05] Rank 7: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.75982356071472s |
|
[2025-07-01 08:57:06] Rank 63: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.29284620285034s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
Parameter Offload: Total persistent parameters: 771184 in 421 params |
|
|