|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 3 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 6 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 5 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 4 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 2 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 7 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 0 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038247 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 1 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
[2025-07-01 08:49:27,028] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,386] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,390] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,399] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,449] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,527] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,584] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,643] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,658] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,659] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,694] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,696] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,697] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,736] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,738] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,739] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:27,745] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,093] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,214] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,215] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,219] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,222] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,314] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,326] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,385] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,419] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,464] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,602] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,603] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,604] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:28,608] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,434] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,469] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,481] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,510] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,523] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,599] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:29,600] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,357] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,393] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,462] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,468] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,472] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,479] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:49:30,627] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,627] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,702] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,728] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,728] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,741] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,803] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,804] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,820] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,820] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,824] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,824] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,828] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,840] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,840] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,857] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,858] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,985] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,985] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,991] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,991] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,993] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,993] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:30,996] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:30,996] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,151] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,173] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,178] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,178] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,179] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,179] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,300] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,300] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2025-07-01 08:49:31,328] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,328] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,375] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,376] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,389] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,390] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,458] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,458] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,632] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,632] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,639] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,639] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,655] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,662] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,662] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,665] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,665] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,702] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,720] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,720] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,727] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,756] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,807] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,807] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,853] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,853] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,864] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:31,887] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:31,887] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,074] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,074] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,105] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,105] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,108] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,109] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,109] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,831] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,871] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,871] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,881] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,881] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,886] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,886] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,889] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,890] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:32,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:32,891] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:33,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:33,791] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:33,924] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:33,924] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:34,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:34,044] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:34,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:34,108] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:34,138] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:49:34,138] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:49:48,774] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters |
|
[2025-07-01 08:49:57,388] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters |
|
[2025-07-01 08:49:58,024] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[dist-0-of-64] LlavaLlamaModel( |
|
(llm): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151648, 3584) |
|
(layers): ModuleList( |
|
(0-27): 28 x Qwen2DecoderLayer( |
|
(self_attn): Qwen2FlashAttention2( |
|
(q_proj): Linear(in_features=3584, out_features=3584, bias=True) |
|
(k_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(v_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(o_proj): Linear(in_features=3584, out_features=3584, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(up_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(down_proj): Linear(in_features=18944, out_features=3584, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
) |
|
) |
|
(norm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(lm_head): Linear(in_features=3584, out_features=151648, bias=False) |
|
) |
|
(vision_tower): SiglipVisionTower( |
|
(vision_tower): SiglipVisionModel( |
|
(vision_model): SiglipVisionTransformer( |
|
(embeddings): SiglipVisionEmbeddings( |
|
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) |
|
(position_embedding): Embedding(1024, 1152) |
|
) |
|
(encoder): SiglipEncoder( |
|
(layers): ModuleList( |
|
(0-26): 27 x SiglipEncoderLayer( |
|
(self_attn): SiglipFlashAttention2( |
|
(k_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(v_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(q_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(out_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
) |
|
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
(mlp): SiglipMLP( |
|
(activation_fn): PytorchGELUTanh() |
|
(fc1): Linear(in_features=1152, out_features=4304, bias=True) |
|
(fc2): Linear(in_features=4304, out_features=1152, bias=True) |
|
) |
|
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(mm_projector): MultimodalProjector( |
|
(layers): Sequential( |
|
(0): DownSample3x3BlockFix() |
|
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True) |
|
(2): Linear(in_features=10368, out_features=3456, bias=True) |
|
(3): GELU(approximate='none') |
|
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True) |
|
(5): Linear(in_features=3456, out_features=3584, bias=True) |
|
(6): GELU(approximate='none') |
|
(7): Linear(in_features=3584, out_features=3584, bias=True) |
|
) |
|
) |
|
) |
|
[dist-0-of-64] Tunable parameters: |
|
language model True |
|
[dist-0-of-64] vision tower True |
|
[dist-0-of-64] mm projector True |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[2025-07-01 08:52:31] Rank 41: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.37434482574463s |
|
[2025-07-01 08:52:31] Rank 6: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.40121841430664s |
|
[2025-07-01 08:52:31] Rank 21: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3818507194519s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 25: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.95334601402283s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 58: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 177.84617686271667s |
|
[2025-07-01 08:52:31] Rank 36: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.8410665988922s |
|
[2025-07-01 08:52:31] Rank 38: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.93657898902893s |
|
[2025-07-01 08:52:31] Rank 45: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.68102931976318s |
|
[2025-07-01 08:52:31] Rank 55: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.8664586544037s |
|
[2025-07-01 08:52:31] Rank 14: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.74036169052124s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 56: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 177.8835060596466s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 16: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.6106789112091s |
|
[2025-07-01 08:52:31] Rank 28: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.10080122947693s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 51: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.0412666797638s |
|
[2025-07-01 08:52:31] Rank 5: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.63762664794922s |
|
[2025-07-01 08:52:31] Rank 11: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.8860149383545s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 12: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.91869187355042s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 49: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.07620072364807s |
|
[2025-07-01 08:52:31] Rank 54: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.11947393417358s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 15: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.96962904930115s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 53: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.16217613220215s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 48: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.2186541557312s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 52: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.21066880226135s |
|
[2025-07-01 08:52:31] Rank 31: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3190746307373s |
|
[2025-07-01 08:52:31] Rank 8: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.07767581939697s |
|
[2025-07-01 08:52:31] Rank 24: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3272933959961s |
|
[2025-07-01 08:52:31] Rank 1: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.84223079681396s |
|
[2025-07-01 08:52:31] Rank 3: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.00526809692383s |
|
[2025-07-01 08:52:31] Rank 9: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.134206533432s |
|
[2025-07-01 08:52:31] Rank 33: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.2782702445984s |
|
[2025-07-01 08:52:31] Rank 32: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3015902042389s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 50: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.3520963191986s |
|
[2025-07-01 08:52:31] Rank 39: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.27661395072937s |
|
[2025-07-01 08:52:31] Rank 40: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.0057246685028s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 20: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.9391541481018s |
|
[2025-07-01 08:52:31] Rank 37: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.3413987159729s |
|
[2025-07-01 08:52:31] Rank 34: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.33079957962036s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 22: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.61974906921387s |
|
[2025-07-01 08:52:31] Rank 13: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.15128827095032s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 43: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.01913499832153s |
|
[2025-07-01 08:52:31] Rank 10: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 179.16416096687317s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 23: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.1800184249878s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:31] Rank 46: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.30903506278992s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 29: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.40558052062988s |
|
[2025-07-01 08:52:32] Rank 47: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.4238715171814s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 27: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.43949127197266s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 17: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.98406744003296s |
|
[2025-07-01 08:52:32] Rank 61: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.336608171463s |
|
[2025-07-01 08:52:32] Rank 57: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.334801197052s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 35: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.47004318237305s |
|
[2025-07-01 08:52:32] Rank 42: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.17485332489014s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 63: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.42643857002258s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 60: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.29783725738525s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 19: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.14956998825073s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 59: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.32719588279724s |
|
[2025-07-01 08:52:32] Rank 18: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.05258059501648s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 26: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.52194571495056s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 30: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.52348446846008s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 62: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 178.48913526535034s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 2: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.0904836654663s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 7: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.38074707984924s |
|
[2025-07-01 08:52:32] Rank 0: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.12232398986816s |
|
[2025-07-01 08:52:32] Rank 4: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 181.12325024604797s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:52:32] Rank 44: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 180.46628999710083s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
Parameter Offload: Total persistent parameters: 771184 in 421 params |
|
|