|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 1 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 5 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 6 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 4 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 7 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 2 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 0 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
SLURM_JOB_ID = 1038255 |
|
SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train |
|
RUN_NAME = NVILA-Lite-8B-quantumn-qa-train |
|
OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train |
|
NNODES = 8 |
|
NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015 |
|
NODE_RANK = 3 |
|
GPUS_PER_NODE = 8 |
|
MASTER_ADDR = pool0-01868 |
|
MASTER_PORT = 25001 |
|
GLOBAL_TRAIN_BATCH_SIZE = 2048 |
|
GRADIENT_ACCUMULATION_STEPS = 4 |
|
PER_DEVICE_TRAIN_BATCH_SIZE = 8 |
|
DEFAULT_LEARNING_RATE: 2e-5 |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,808] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,969] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:36,970] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,483] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:37,998] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2025-07-01 08:58:49,040] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,040] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,044] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,050] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,050] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,055] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,055] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,088] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,088] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,198] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,198] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,204] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,204] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,206] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,208] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,208] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,211] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,211] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,219] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,219] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,222] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,222] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,226] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,226] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,227] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,227] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,227] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2025-07-01 08:58:49,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,228] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,239] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,239] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,249] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,249] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,252] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,253] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,257] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,257] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,260] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,260] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,272] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,272] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,283] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,283] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,286] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,288] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,288] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,300] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,304] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,304] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,316] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,316] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,339] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,349] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,595] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,595] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,640] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,640] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,684] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,925] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,925] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,984] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,984] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:49,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:49,997] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,001] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,001] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,007] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,007] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,010] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,010] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,691] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,692] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,692] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,723] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,723] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,741] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,745] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,745] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,754] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,754] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,758] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,758] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,782] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,821] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,831] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,834] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,834] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,855] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,855] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,860] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,860] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:58:50,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented |
|
[2025-07-01 08:58:50,864] [INFO] [comm.py:594:init_distributed] cdb=None |
|
[2025-07-01 08:59:04,797] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters |
|
[2025-07-01 08:59:22,183] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters |
|
[2025-07-01 08:59:23,443] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
|
|
[dist-0-of-64] LlavaLlamaModel( |
|
(llm): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151648, 3584) |
|
(layers): ModuleList( |
|
(0-27): 28 x Qwen2DecoderLayer( |
|
(self_attn): Qwen2FlashAttention2( |
|
(q_proj): Linear(in_features=3584, out_features=3584, bias=True) |
|
(k_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(v_proj): Linear(in_features=3584, out_features=512, bias=True) |
|
(o_proj): Linear(in_features=3584, out_features=3584, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(up_proj): Linear(in_features=3584, out_features=18944, bias=False) |
|
(down_proj): Linear(in_features=18944, out_features=3584, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06) |
|
) |
|
) |
|
(norm): Qwen2RMSNorm((0,), eps=1e-06) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(lm_head): Linear(in_features=3584, out_features=151648, bias=False) |
|
) |
|
(vision_tower): SiglipVisionTower( |
|
(vision_tower): SiglipVisionModel( |
|
(vision_model): SiglipVisionTransformer( |
|
(embeddings): SiglipVisionEmbeddings( |
|
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) |
|
(position_embedding): Embedding(1024, 1152) |
|
) |
|
(encoder): SiglipEncoder( |
|
(layers): ModuleList( |
|
(0-26): 27 x SiglipEncoderLayer( |
|
(self_attn): SiglipFlashAttention2( |
|
(k_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(v_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(q_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
(out_proj): Linear(in_features=1152, out_features=1152, bias=True) |
|
) |
|
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
(mlp): SiglipMLP( |
|
(activation_fn): PytorchGELUTanh() |
|
(fc1): Linear(in_features=1152, out_features=4304, bias=True) |
|
(fc2): Linear(in_features=4304, out_features=1152, bias=True) |
|
) |
|
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
(mm_projector): MultimodalProjector( |
|
(layers): Sequential( |
|
(0): DownSample3x3BlockFix() |
|
(1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True) |
|
(2): Linear(in_features=10368, out_features=3456, bias=True) |
|
(3): GELU(approximate='none') |
|
(4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True) |
|
(5): Linear(in_features=3456, out_features=3584, bias=True) |
|
(6): GELU(approximate='none') |
|
(7): Linear(in_features=3584, out_features=3584, bias=True) |
|
) |
|
) |
|
) |
|
[dist-0-of-64] Tunable parameters: |
|
language model True |
|
[dist-0-of-64] vision tower True |
|
[dist-0-of-64] mm projector True |
|
trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000 |
|
[2025-07-01 09:01:57] Rank 31: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.7240424156189s |
|
[2025-07-01 09:01:57] Rank 3: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.2856583595276s |
|
[2025-07-01 09:01:57] Rank 32: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.3059437274933s |
|
[2025-07-01 09:01:57] Rank 63: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.31311774253845s |
|
[2025-07-01 09:01:57] Rank 18: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 186.8426342010498s |
|
[2025-07-01 09:01:57] Rank 9: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.27460193634033s |
|
[2025-07-01 09:01:57] Rank 52: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.58034896850586s |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 33: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.53948974609375s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 47: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.8767204284668s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 1: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.6350953578949s |
|
[2025-07-01 09:01:57] Rank 19: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.17372345924377s |
|
[2025-07-01 09:01:57] Rank 46: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.946551322937s |
|
[2025-07-01 09:01:57] Rank 44: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.947163105011s |
|
[2025-07-01 09:01:57] Rank 30: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.1679859161377s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 61: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.73403882980347s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 51: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.93320965766907s |
|
[2025-07-01 09:01:57] Rank 58: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.77459192276s |
|
[2025-07-01 09:01:57] Rank 12: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.71449184417725s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 48: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
[2025-07-01 09:01:57] Rank 36: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.8108024597168s |
|
Pre terminate time: 10min elapsed_time: 188.9705455303192s |
|
[2025-07-01 09:01:57] Rank 39: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.81623101234436s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 14: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.76175379753113s |
|
[2025-07-01 09:01:57] Rank 60: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.82697677612305s |
|
length of dataloader:length of dataloader: 28 2814336 |
|
14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer [GPU memory] before trainer 2.292407512664795 |
|
2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 50: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.00980305671692s |
|
[2025-07-01 09:01:57] Rank 29: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.3063566684723s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:57] Rank 37: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.86100125312805s |
|
length of dataloader: 28 14336 |
|
[2025-07-01 09:01:57] Rank 59: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.8586766719818s |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 62: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.87530517578125s |
|
[2025-07-01 09:01:58] Rank 23: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.39407753944397s |
|
[2025-07-01 09:01:58] Rank 54: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.04404830932617s |
|
[2025-07-01 09:01:58] Rank 57: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.88598775863647s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 28: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.35117411613464s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 25: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.3726806640625s |
|
[2025-07-01 09:01:58] Rank 49: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.0832874774933s |
|
[2025-07-01 09:01:58] Rank 7: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.92747592926025s |
|
[2025-07-01 09:01:58] Rank 55: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.09379124641418s |
|
[2025-07-01 09:01:58] Rank 43: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.19229888916016s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 41: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.19884490966797s |
|
[2025-07-01 09:01:58] Rank 40: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.20034885406494s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 45: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.20093441009521s |
|
[2025-07-01 09:01:58] Rank 24: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.39897632598877s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 42: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.20916652679443s |
|
[2025-07-01 09:01:58] Rank 17: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.46580815315247s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[2025-07-01 09:01:58] Rank 22: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.48894619941711s |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 34: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.97889137268066s |
|
[2025-07-01 09:01:58] Rank 35: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.97930574417114s |
|
[2025-07-01 09:01:58] Rank 6: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.98443937301636s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 16: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.5020468235016s |
|
[2025-07-01 09:01:58] Rank 8: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.9302875995636s |
|
[2025-07-01 09:01:58] Rank 10: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.93019914627075s |
|
[2025-07-01 09:01:58] Rank 11: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.9377384185791s |
|
[2025-07-01 09:01:58] Rank 15: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 188.93913388252258s |
|
[2025-07-01 09:01:58] Rank 5: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.00355291366577s |
|
[2025-07-01 09:01:58] Rank 56: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.0038776397705s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 21: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.52710509300232s |
|
[2025-07-01 09:01:58] Rank 0: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.0127944946289s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 38: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.02126288414001s |
|
[2025-07-01 09:01:58] Rank 20: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.5401885509491s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 53: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.20625829696655s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 4: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.05517554283142s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795length of dataloader: |
|
28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 2: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.06267762184143s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 13: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 189.01914143562317s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 26: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.55401301383972s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
[2025-07-01 09:01:58] Rank 27: Timer for terminate callback has been set. |
|
Total limit: 240min |
|
Pre terminate time: 10min elapsed_time: 187.57627320289612s |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
length of dataloader: 28 14336 |
|
[GPU memory] before trainer 2.292407512664795 |
|
Parameter Offload: Total persistent parameters: 771184 in 421 params |
|
|