|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 2 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 3 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 5 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 1 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 0 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 7 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 4 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038241 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374 |
|
NODE_RANK = 6 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01504 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:47,860] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:44:58,594] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,594] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,612] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,612] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,616] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,617] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,617] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,624] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,623] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,623] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,625] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,638] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,638] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,643] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,643] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,652] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,652] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,653] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,653] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,655] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,656] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,656] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,659] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,659] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,661] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,661] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,664] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,664] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,671] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,671] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,672] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,672] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,675] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,675] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,681] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,681] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,682] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,682] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,686] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,686] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,697] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,697] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,698] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,755] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,755] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,756] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,781] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,781] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,783] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,783] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,784] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,784] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,343] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,343] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,344] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,344] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,347] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,347] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,348] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,348] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,381] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,381] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:44:59,383] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,103] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,103] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,104] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,104] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,112] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,112] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,133] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,133] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,136] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,137] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,143] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,143] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,146] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,147] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,489] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,489] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,520] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,520] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,548] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,548] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,548] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2025-07-01 08:45:00,550] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,550] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,582] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,582] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,588] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,588] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,590] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,590] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:00,600] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:45:00,600] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:45:15,555] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters |
|
[2025-07-01 08:45:31,263] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters |
|
[2025-07-01 08:45:32,600] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
|
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[dist-0-of-64] trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
LlavaLlamaModel( |
|
(llm): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151648, 3584) |
|
(layers): ModuleList( |
|
(0-27): 28 x Qwen2DecoderLayer( |
|
(self_attn): Qwen2FlashAttention2( |
|
(q_proj): Linear(in_features=3584, out_features=3584, bias=True) |
|
(k_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(v_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(o_proj): Linear(in_features=3584, out_features=3584, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(up_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(down_proj): Linear(in_features=18944, out_features=3584, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
) |
|
) |
|
(norm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(lm_head): Linear(in_features=3584, out_features=151648, bias=False) |
|
) |
|
(vision_tower): SiglipVisionTower( |
|
(vision_tower): SiglipVisionModel( |
|
(vision_model): SiglipVisionTransformer( |
|
(embeddings): SiglipVisionEmbeddings( |
|
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) |
|
(position_embedding): Embedding(1024, 1152) |
|
) |
|
(encoder): SiglipEncoder( |
|
(layers): ModuleList( |
|
(0-26): 27 x SiglipEncoderLayer( |
|
(self_attn): SiglipFlashAttention2( |
|
(k_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(v_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(q_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
(out_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
) |
|
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
(mlp): SiglipMLP( |
|
(activation_fn): PytorchGELUTanh() |
|
(fc1): Linear(in_features=1152, out_features=4304, bias=True) |
|
(fc2): Linear(in_features=4304, out_features=1152, bias=True) |
|
) |
|
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(mm_projector): MultimodalProjector( |
|
(layers): Sequential( |
|
(0): DownSample3x3BlockFix() |
|
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True) |
|
(2): Linear(in_features=10368, out_features=3456, bias=True) |
|
(3): GELU(approximate='none') |
|
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True) |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
(5): Linear(in_features=3456, out_features=3584, bias=True) |
|
(6): GELU(approximate='none') |
|
(7): Linear(in_features=3584, out_features=3584, bias=True) |
|
) |
|
) |
|
) |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
[dist-0-of-64] Tunable parameters: |
|
language model True |
|
[dist-0-of-64] vision tower True |
|
[dist-0-of-64] mm projector True |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[2025-07-01 08:48:06] Rank 32: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.84421372413635s |
|
[2025-07-01 08:48:06] Rank 16: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.99013113975525s |
|
[2025-07-01 08:48:06] Rank 60: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.01411652565002s |
|
[2025-07-01 08:48:06] Rank 49: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.33525800704956s |
|
[2025-07-01 08:48:06] Rank 3: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.23286867141724s |
|
[2025-07-01 08:48:06] Rank 37: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.08282613754272s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 40: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.24676775932312s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 62: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.2327139377594s |
|
[2025-07-01 08:48:06] Rank 50: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.53005170822144s |
|
[2025-07-01 08:48:06] Rank 25: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.26658296585083s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 46: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.28350949287415s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 31: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.32588863372803s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 0: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.52057933807373s |
|
[2025-07-01 08:48:06] Rank 20: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.36052680015564s |
|
[2025-07-01 08:48:06] Rank 21: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.3777801990509s |
|
[2025-07-01 08:48:06] Rank 14: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.87918162345886s |
|
[2025-07-01 08:48:06] Rank 13: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.88157558441162s |
|
[2025-07-01 08:48:06] Rank 29: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.40087914466858s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:06] Rank 15: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.94210743904114s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
length of dataloader:[GPU memory] before trainer 2.29240751266479528 |
|
14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 33: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.36294674873352s |
|
[2025-07-01 08:48:07] Rank 9: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.00075697898865s |
|
[2025-07-01 08:48:07] Rank 53: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.78530764579773s |
|
[2025-07-01 08:48:07] Rank 2: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.68620371818542s |
|
[2025-07-01 08:48:07] Rank 26: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.52323293685913s |
|
[2025-07-01 08:48:07] Rank 12: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.02464079856873s |
|
[2025-07-01 08:48:07] Rank 58: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.52194261550903s |
|
[2025-07-01 08:48:07] Rank 28: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.54804372787476s |
|
[2025-07-01 08:48:07] Rank 30: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.5494029521942s |
|
[2025-07-01 08:48:07] Rank 43: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.55047464370728s |
|
[2025-07-01 08:48:07] Rank 22: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.5522198677063s |
|
[2025-07-01 08:48:07] Rank 11: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.06330227851868s |
|
[2025-07-01 08:48:07] Rank 18: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.56281304359436s |
|
[2025-07-01 08:48:07] Rank 48: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.8359296321869s |
|
[2025-07-01 08:48:07] Rank 8: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.07292366027832s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 36: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.44479870796204s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[2025-07-01 08:48:07] Rank 44: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.58538126945496s |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 38: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.47206234931946s |
|
[2025-07-01 08:48:07] Rank 19: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.60940408706665s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[2025-07-01 08:48:07] Rank 45: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.61079692840576s |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 17: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.61975002288818s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 6: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.79642844200134s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 35: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.512140750885s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 27: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.64775276184082s |
|
[2025-07-01 08:48:07] Rank 24: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.64974856376648s |
|
[2025-07-01 08:48:07] Rank 1: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
length of dataloader: 28 14336 |
|
Pre terminate time: 10min elapsed_time: 186.81638717651367s |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 51: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.9340763092041s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 7: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.82988810539246s |
|
[2025-07-01 08:48:07] Rank 55: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.9399230480194s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 54: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.96026849746704s |
|
[2025-07-01 08:48:07] Rank 23: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.69248342514038s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 57: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.67919492721558s |
|
[2025-07-01 08:48:07] Rank 52: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.97831630706787s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 61: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.687602519989s |
|
[2025-07-01 08:48:07] Rank 4: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.87889289855957s |
|
[2025-07-01 08:48:07] Rank 42: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.7198281288147s |
|
[2025-07-01 08:48:07] Rank 5: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.88499283790588s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 10: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
length of dataloader: 28 14336 |
|
Pre terminate time: 10min elapsed_time: 187.23191928863525s |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 47: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.74053859710693s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 63: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.7193853855133s |
|
[2025-07-01 08:48:07] Rank 34: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.61402535438538s |
|
length of dataloader: 28 14336 |
|
[2025-07-01 08:48:07] Rank 59: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.72772979736328s |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 41: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.77644872665405s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 56: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.7942099571228s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 08:48:07] Rank 39: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.70067310333252s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
Parameter Offload: Total persistent parameters: 771184 in 421 params |
|
|