Ligeng-Zhu committed 23 days ago

Commit

342f304

verified ·

1 Parent(s): 4fffd94

Upload files with `vila-upload`.

Upload model/config.json
Upload model/trainer_state.json
Upload model/llm/merges.txt
Upload model/llm/generation_config.json
Upload model/llm/model.safetensors.index.json
Upload model/llm/config.json
Upload model/llm/tokenizer_config.json
Upload model/llm/tokenizer.json
Upload model/llm/model-00002-of-00004.safetensors
Upload model/llm/model-00003-of-00004.safetensors
Upload model/llm/vocab.json
Upload model/llm/special_tokens_map.json
Upload model/llm/model-00001-of-00004.safetensors
Upload model/llm/added_tokens.json
Upload model/llm/model-00004-of-00004.safetensors
Upload model/mm_projector/config.json
Upload model/mm_projector/model.safetensors
Upload model/vision_tower/config.json
Upload model/vision_tower/model.safetensors
Upload model/vision_tower/preprocessor_config.json
Upload slurm/1038294.0.err
Upload slurm/1038301.0.err
Upload slurm/1038294.0.out
Upload slurm/1038241.0.err
Upload slurm/1038255.0.out
Upload slurm/1038254.0.out
Upload slurm/1038241.0.out
Upload slurm/1038247.0.out
Upload slurm/1038255.0.err
Upload slurm/1038286.0.err
Upload slurm/1038254.0.err
Upload slurm/1038301.0.out
Upload slurm/1038303.0.err
Upload slurm/1038303.0.out
Upload slurm/1038247.0.err
Upload slurm/1038286.0.out

Files changed (37) hide show

.gitattributes +1 -0
model/config.json +344 -0
model/llm/added_tokens.json +18 -0
model/llm/config.json +32 -0
model/llm/generation_config.json +14 -0
model/llm/merges.txt +0 -0
model/llm/model-00001-of-00004.safetensors +3 -0
model/llm/model-00002-of-00004.safetensors +3 -0
model/llm/model-00003-of-00004.safetensors +3 -0
model/llm/model-00004-of-00004.safetensors +3 -0
model/llm/model.safetensors.index.json +346 -0
model/llm/special_tokens_map.json +39 -0
model/llm/tokenizer.json +3 -0
model/llm/tokenizer_config.json +149 -0
model/llm/vocab.json +0 -0
model/mm_projector/config.json +10 -0
model/mm_projector/model.safetensors +3 -0
model/trainer_state.json +133 -0
model/vision_tower/config.json +23 -0
model/vision_tower/model.safetensors +3 -0
model/vision_tower/preprocessor_config.json +24 -0
slurm/1038241.0.err +0 -0
slurm/1038241.0.out +768 -0
slurm/1038247.0.err +0 -0
slurm/1038247.0.out +768 -0
slurm/1038254.0.err +0 -0
slurm/1038254.0.out +768 -0
slurm/1038255.0.err +0 -0
slurm/1038255.0.out +768 -0
slurm/1038286.0.err +0 -0
slurm/1038286.0.out +768 -0
slurm/1038294.0.err +0 -0
slurm/1038294.0.out +768 -0
slurm/1038301.0.err +0 -0
slurm/1038301.0.out +112 -0
slurm/1038303.0.err +0 -0
slurm/1038303.0.out +792 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text

model/config.json ADDED Viewed

	@@ -0,0 +1,344 @@

+{
+  "Ubit": 100,
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model",
+  "architectures": [
+    "LlavaLlamaModel"
+  ],
+  "babit": "E5M2",
+  "bobit": "E5M2",
+  "bwbit": "E5M2",
+  "chat_template": null,
+  "col_blocksize": -1,
+  "col_blocksize_optimizer": 128,
+  "draw_distribution_backward": false,
+  "draw_distribution_forward": false,
+  "drop_path_rate": 0.0,
+  "dynamic_s2": false,
+  "epsilon": 1e-10,
+  "epsilon_optimizer": 1e-15,
+  "fabit": "E4M3",
+  "first_order_bit": null,
+  "first_order_quant_type": null,
+  "fobit": "E4M3",
+  "fps": 0.0,
+  "fwbit": "E4M3",
+  "group_size": -1,
+  "hidden_size": 3584,
+  "high_res_pos_embed": false,
+  "image_aspect_ratio": "dynamic",
+  "image_encoder": {
+    "_target_": "llava.model.encoders.BasicImageEncoder"
+  },
+  "interpolate_mode": "linear",
+  "llm_cfg": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm",
+    "add_cross_attention": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "min_length": 0,
+    "model_max_length": 4096,
+    "model_type": "qwen2",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 28,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "tokenizer_model_max_length": 4096,
+    "tokenizer_padding_side": "right",
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151659
+  },
+  "look_close_mode": "after_image",
+  "max_tiles": 12,
+  "min_blockunit_col": 4,
+  "min_blockunit_row": 4,
+  "min_tiles": 1,
+  "mlp_path": null,
+  "mm_hidden_size": 1152,
+  "mm_low_res_token_num": null,
+  "mm_projector": "mlp_downsample_3x3_fix",
+  "mm_projector_cfg": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector",
+    "add_cross_attention": false,
+    "architectures": [
+      "MultimodalProjector"
+    ],
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "mm_projector_type": "mlp_downsample_3x3_fix",
+    "model_type": "v2l_projector",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "mm_projector_lr": null,
+  "mm_scale_num": null,
+  "mm_use_bos_eos_tokens": true,
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "cls_patch",
+  "mm_vision_select_layer": -2,
+  "model_dtype": "torch.bfloat16",
+  "model_name_or_path": "Efficient-Large-Model/NVILA-Lite-8B",
+  "model_type": "llava_llama",
+  "num_look_close": 1,
+  "num_time_tokens": 0,
+  "num_token_look_close": null,
+  "num_video_frames": 8,
+  "pad_block": false,
+  "pad_to_multiple_of": 0,
+  "ps3": false,
+  "ps3_dynamic_aspect_ratio": false,
+  "ps3_grad_checkpointing": false,
+  "qchoice": "none",
+  "quantize_model": false,
+  "refine_attn_blocksize": false,
+  "refine_col_blocksize": 4,
+  "refine_ln_blocksize": false,
+  "refine_ln_blocksize_but_only_backward": false,
+  "refine_ln_blocksize_but_only_forward": false,
+  "refine_ln_pertoken": false,
+  "refine_mlp_blocksize": false,
+  "refine_residual_fp": false,
+  "refine_row_blocksize": 4,
+  "resume_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model",
+  "row_blocksize": -1,
+  "row_blocksize_optimizer": 1,
+  "s2": false,
+  "s2_max_split_size": 336,
+  "s2_resize_output_to_scale_idx": 0,
+  "s2_scales": "336,672,1008",
+  "second_order_bit": null,
+  "second_order_quant_type": null,
+  "soft_ce_std": 1.0,
+  "sound_mm_projector": "mlp",
+  "sound_mm_projector_cfg": null,
+  "sound_tower": "",
+  "sound_tower_cfg": null,
+  "speech_mm_projector": "mlp",
+  "speech_mm_projector_cfg": null,
+  "speech_tower": "",
+  "speech_tower_cfg": null,
+  "symm": true,
+  "time_token_format": "<t{t}>",
+  "time_token_ids": [],
+  "top_down_prompt_head_type": "mlp",
+  "transformers_version": "4.46.0",
+  "tune_language_model": true,
+  "tune_mm_projector": true,
+  "tune_vision_tower": true,
+  "unified_audio_encoder": true,
+  "use_quantize_optimizer": false,
+  "version": "auto",
+  "video_encoder": {
+    "_target_": "llava.model.encoders.BasicVideoEncoder"
+  },
+  "video_max_tiles": 1,
+  "vision_resolution": -1,
+  "vision_tower": "Efficient-Large-Model/paligemma-siglip-so400m-patch14-448",
+  "vision_tower_cfg": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower",
+    "add_cross_attention": false,
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 448,
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "siglip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_image_tokens": 256,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 2048,
+    "projector_hidden_act": "gelu_fast",
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vision_use_head": false
+  },
+  "vision_tower_lr": null,
+  "weight_memory_efficient": true,
+  "xvila_mode": true
+}

model/llm/added_tokens.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "<image>": 151649,
+  "<vila/sentinel>": 151648,
+  "<vila/video>": 151650,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_bos|>": 151651,
+  "<|image_eos|>": 151652,
+  "<|sound_bos|>": 151657,
+  "<|sound_eos|>": 151658,
+  "<|speech_bos|>": 151655,
+  "<|speech_eos|>": 151656,
+  "<|video_bos|>": 151653,
+  "<|video_eos|>": 151654,
+  "[BOS]": 151646,
+  "[PAD]": 151647
+}

model/llm/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_max_length": 4096,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 4096,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151659
+}

model/llm/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.46.0"
+}

model/llm/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model/llm/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d9e7cde1d0d04f346241f1a173c7c0a5259b77f263a2b6dc517fca2e39f4b08
+size 4874757736

model/llm/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:512c3fa5bf6e95a27d74e9dc6dc290d84b6657163fbc5ab814fd95bc87aa08ea
+size 4932751008

model/llm/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb3cb599b80fd7c4e189e24f795b0a30aa1eb5557243b81bc4b01dae085f8a51
+size 4330865200

model/llm/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da688388a27fb804d273baf75b638c157292b03fadf5521baa58909d148b3aa6
+size 1087091840

model/llm/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,346 @@

+{
+  "metadata": {
+    "total_size": 15225426944
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors"
+  }
+}

model/llm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|sound_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|sound_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "[BOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

model/llm/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c0a6b3376d17dcabb3fc580d5bad9e566dd613e5b02faf633da5fc1d6416c35
+size 11420711

model/llm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,149 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<vila/sentinel>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<vila/video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|image_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|image_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|video_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|video_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|speech_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|speech_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<|sound_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151658": {
+      "content": "<|sound_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|sound_bos|>",
+    "<|sound_eos|>"
+  ],
+  "bos_token": "[BOS]",
+  "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

model/llm/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/mm_projector/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector",
+  "architectures": [
+    "MultimodalProjector"
+  ],
+  "mm_projector_type": "mlp_downsample_3x3_fix",
+  "model_type": "v2l_projector",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0"
+}

model/mm_projector/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e2f2a27121e53447b4e377deecb43822ee3ab885dde49b399304625ca15672cb
+size 122203760

model/trainer_state.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 13,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08,
+      "grad_norm": 130.75577214745246,
+      "learning_rate": 2e-05,
+      "loss": 1.2648,
+      "step": 1
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 126.67616596972118,
+      "learning_rate": 1.9659258262890683e-05,
+      "loss": 1.2283,
+      "step": 2
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 56.84219933724937,
+      "learning_rate": 1.866025403784439e-05,
+      "loss": 0.4188,
+      "step": 3
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 89.42666319016989,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 2.4789,
+      "step": 4
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 72.41844398977439,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 0.7853,
+      "step": 5
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 799.0148731599335,
+      "learning_rate": 1.2588190451025209e-05,
+      "loss": 8.2197,
+      "step": 6
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 34.525610403243014,
+      "learning_rate": 1e-05,
+      "loss": 0.3008,
+      "step": 7
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 64.98250527603693,
+      "learning_rate": 7.411809548974792e-06,
+      "loss": 0.3999,
+      "step": 8
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 11.46902235575636,
+      "learning_rate": 5.000000000000003e-06,
+      "loss": 0.2575,
+      "step": 9
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 42.63293180170212,
+      "learning_rate": 2.9289321881345257e-06,
+      "loss": 0.3174,
+      "step": 10
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 40.64988981794197,
+      "learning_rate": 1.339745962155613e-06,
+      "loss": 0.3054,
+      "step": 11
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 27.588182133457394,
+      "learning_rate": 3.4074173710931804e-07,
+      "loss": 0.2827,
+      "step": 12
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 17.48926557604337,
+      "learning_rate": 0.0,
+      "loss": 0.2751,
+      "step": 13
+    },
+    {
+      "epoch": 1.0,
+      "step": 13,
+      "total_flos": 0.0,
+      "train_loss": 1.2718768601234143,
+      "train_runtime": 323.0635,
+      "train_samples_per_second": 41.206,
+      "train_steps_per_second": 0.04
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 13,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

model/vision_tower/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower",
+  "architectures": [
+    "SiglipVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 448,
+  "intermediate_size": 4304,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 27,
+  "num_image_tokens": 256,
+  "patch_size": 14,
+  "projection_dim": 2048,
+  "projector_hidden_act": "gelu_fast",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0",
+  "vision_use_head": false
+}

model/vision_tower/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:079b035a80cb54de801d4748e8b56b65be4116925c8159e679ff45cdb46e26a5
+size 826707904

model/vision_tower/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}

slurm/1038241.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038241.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038241
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:47,860] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:44:58,594] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,594] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,612] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,612] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,616] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,617] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,617] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,624] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,623] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,623] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,625] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,638] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,638] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,643] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,643] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,652] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,652] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,653] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,653] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,655] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,656] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,656] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,659] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,659] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,661] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,661] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,664] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,664] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,671] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,671] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,672] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,672] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,675] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,675] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,681] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,681] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,682] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,682] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,686] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,686] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,697] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,697] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,698] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,755] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,755] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,756] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,781] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,781] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,783] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,783] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,784] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,784] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,343] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,343] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,344] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,344] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,347] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,347] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,348] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,348] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,381] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,381] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:44:59,383] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,103] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,103] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,104] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,104] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,112] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,112] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,133] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,133] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,136] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,137] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,143] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,143] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,146] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,147] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,489] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,489] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,520] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,520] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,548] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,548] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,548] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 08:45:00,550] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,550] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,582] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,582] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,588] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,588] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,590] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,590] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:00,600] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:45:00,600] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:45:15,555] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 08:45:31,263] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 08:45:32,600] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 08:48:06] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.84421372413635s
+[2025-07-01 08:48:06] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.99013113975525s
+[2025-07-01 08:48:06] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.01411652565002s
+[2025-07-01 08:48:06] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.33525800704956s
+[2025-07-01 08:48:06] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.23286867141724s
+[2025-07-01 08:48:06] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.08282613754272s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.24676775932312s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.2327139377594s
+[2025-07-01 08:48:06] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.53005170822144s
+[2025-07-01 08:48:06] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.26658296585083s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.28350949287415s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.32588863372803s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.52057933807373s
+[2025-07-01 08:48:06] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.36052680015564s
+[2025-07-01 08:48:06] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.3777801990509s
+[2025-07-01 08:48:06] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.87918162345886s
+[2025-07-01 08:48:06] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.88157558441162s
+[2025-07-01 08:48:06] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.40087914466858s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:06] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.94210743904114s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+length of dataloader:[GPU memory] before trainer  2.29240751266479528
+ 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.36294674873352s
+[2025-07-01 08:48:07] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.00075697898865s
+[2025-07-01 08:48:07] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.78530764579773s
+[2025-07-01 08:48:07] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.68620371818542s
+[2025-07-01 08:48:07] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.52323293685913s
+[2025-07-01 08:48:07] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.02464079856873s
+[2025-07-01 08:48:07] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.52194261550903s
+[2025-07-01 08:48:07] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.54804372787476s
+[2025-07-01 08:48:07] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.5494029521942s
+[2025-07-01 08:48:07] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.55047464370728s
+[2025-07-01 08:48:07] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.5522198677063s
+[2025-07-01 08:48:07] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.06330227851868s
+[2025-07-01 08:48:07] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.56281304359436s
+[2025-07-01 08:48:07] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.8359296321869s
+[2025-07-01 08:48:07] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.07292366027832s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.44479870796204s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[2025-07-01 08:48:07] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.58538126945496s
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.47206234931946s
+[2025-07-01 08:48:07] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.60940408706665s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[2025-07-01 08:48:07] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.61079692840576s
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.61975002288818s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.79642844200134s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.512140750885s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.64775276184082s
+[2025-07-01 08:48:07] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.64974856376648s
+[2025-07-01 08:48:07] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+length of dataloader: 28 14336
+Pre terminate time: 10min elapsed_time: 186.81638717651367s
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.9340763092041s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.82988810539246s
+[2025-07-01 08:48:07] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.9399230480194s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.96026849746704s
+[2025-07-01 08:48:07] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.69248342514038s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.67919492721558s
+[2025-07-01 08:48:07] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.97831630706787s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.687602519989s
+[2025-07-01 08:48:07] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.87889289855957s
+[2025-07-01 08:48:07] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.7198281288147s
+[2025-07-01 08:48:07] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.88499283790588s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+length of dataloader: 28 14336
+Pre terminate time: 10min elapsed_time: 187.23191928863525s
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.74053859710693s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.7193853855133s
+[2025-07-01 08:48:07] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.61402535438538s
+length of dataloader: 28 14336
+[2025-07-01 08:48:07] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.72772979736328s
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.77644872665405s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.7942099571228s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:48:07] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.70067310333252s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038247.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038247.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038247
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 08:49:27,028] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,386] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,390] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,399] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,449] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,527] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,584] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,643] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,658] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,659] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,694] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,696] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,697] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,736] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,738] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,739] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:27,745] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,093] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,214] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,215] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,219] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,222] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,314] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,326] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,385] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,419] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,464] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,602] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,603] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,604] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:28,608] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,434] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,469] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,481] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,510] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,523] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,599] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:29,600] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,357] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,393] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,462] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,468] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,472] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,479] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:49:30,627] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,627] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,702] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,728] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,728] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,741] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,803] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,804] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,820] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,820] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,824] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,824] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,828] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,840] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,840] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,857] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,858] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,985] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,985] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,991] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,991] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,993] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,993] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:30,996] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:30,996] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,151] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,173] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,178] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,178] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,179] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,179] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,300] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,300] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 08:49:31,328] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,328] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,375] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,376] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,389] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,390] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,458] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,458] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,632] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,632] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,639] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,639] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,655] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,662] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,662] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,665] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,665] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,702] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,720] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,720] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,727] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,756] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,807] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,807] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,853] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,853] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,864] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:31,887] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:31,887] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,074] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,074] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,105] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,105] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,108] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,109] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,109] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,831] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,871] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,871] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,881] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,881] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,886] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,886] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,889] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,890] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:32,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:32,891] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:33,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:33,791] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:33,924] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:33,924] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:34,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:34,044] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:34,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:34,108] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:34,138] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:49:34,138] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:49:48,774] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 08:49:57,388] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 08:49:58,024] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 08:52:31] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.37434482574463s
+[2025-07-01 08:52:31] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.40121841430664s
+[2025-07-01 08:52:31] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3818507194519s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.95334601402283s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.84617686271667s
+[2025-07-01 08:52:31] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.8410665988922s
+[2025-07-01 08:52:31] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.93657898902893s
+[2025-07-01 08:52:31] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.68102931976318s
+[2025-07-01 08:52:31] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.8664586544037s
+[2025-07-01 08:52:31] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.74036169052124s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.8835060596466s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.6106789112091s
+[2025-07-01 08:52:31] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.10080122947693s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.0412666797638s
+[2025-07-01 08:52:31] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.63762664794922s
+[2025-07-01 08:52:31] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.8860149383545s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.91869187355042s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.07620072364807s
+[2025-07-01 08:52:31] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.11947393417358s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.96962904930115s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.16217613220215s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.2186541557312s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.21066880226135s
+[2025-07-01 08:52:31] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3190746307373s
+[2025-07-01 08:52:31] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.07767581939697s
+[2025-07-01 08:52:31] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3272933959961s
+[2025-07-01 08:52:31] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.84223079681396s
+[2025-07-01 08:52:31] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.00526809692383s
+[2025-07-01 08:52:31] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.134206533432s
+[2025-07-01 08:52:31] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.2782702445984s
+[2025-07-01 08:52:31] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3015902042389s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.3520963191986s
+[2025-07-01 08:52:31] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.27661395072937s
+[2025-07-01 08:52:31] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.0057246685028s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.9391541481018s
+[2025-07-01 08:52:31] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3413987159729s
+[2025-07-01 08:52:31] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.33079957962036s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.61974906921387s
+[2025-07-01 08:52:31] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.15128827095032s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.01913499832153s
+[2025-07-01 08:52:31] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.16416096687317s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.1800184249878s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:31] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.30903506278992s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.40558052062988s
+[2025-07-01 08:52:32] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.4238715171814s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.43949127197266s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.98406744003296s
+[2025-07-01 08:52:32] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.336608171463s
+[2025-07-01 08:52:32] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.334801197052s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.47004318237305s
+[2025-07-01 08:52:32] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.17485332489014s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.42643857002258s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.29783725738525s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.14956998825073s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.32719588279724s
+[2025-07-01 08:52:32] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.05258059501648s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.52194571495056s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.52348446846008s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.48913526535034s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.0904836654663s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.38074707984924s
+[2025-07-01 08:52:32] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.12232398986816s
+[2025-07-01 08:52:32] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.12325024604797s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:52:32] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.46628999710083s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038254.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038254.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 5
+DEFAULT_LEARNING_RATE: 2e-5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038254
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 08:54:00,867] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:00,974] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,467] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,572] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,576] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,769] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,775] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,779] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,783] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,880] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,960] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:01,962] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,013] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,035] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,048] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,099] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,101] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,102] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,114] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,145] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,187] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,188] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,189] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,193] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,194] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,265] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,656] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,664] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,731] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,776] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,782] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,785] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,788] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,791] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,835] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,839] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,841] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:02,851] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,245] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:03,254] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:54:04,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,150] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,274] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,274] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,839] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,839] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,847] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,847] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,895] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,895] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,913] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,913] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:04,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:04,932] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,229] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,229] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 08:54:05,261] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,261] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,276] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,281] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,286] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,287] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,287] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,292] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,293] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,295] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,295] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,297] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,297] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,340] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,340] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,392] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,393] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,395] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,429] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,462] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,462] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,478] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,539] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,575] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,575] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,685] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,698] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,801] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,801] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,816] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,817] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,843] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,843] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,862] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,965] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,965] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,994] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,995] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:05,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:05,997] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,071] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,071] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,124] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,124] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,126] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,126] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,131] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,131] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,149] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,173] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,186] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,186] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,217] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,236] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,236] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,269] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,269] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,278] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,278] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,285] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,285] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,289] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,289] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,291] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,291] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,299] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,299] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,302] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,302] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,313] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,313] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,539] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,666] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,666] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,687] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,687] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,691] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,711] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,711] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,729] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,730] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,735] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,735] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:06,776] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:54:06,776] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:54:23,138] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 08:54:31,001] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 08:54:31,607] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 08:57:05] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.40018558502197s
+[2025-07-01 08:57:05] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.6438548564911s
+[2025-07-01 08:57:05] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.9796986579895s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.92801594734192s
+[2025-07-01 08:57:05] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.29648852348328s
+[2025-07-01 08:57:05] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.9351155757904s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.53496980667114s
+[2025-07-01 08:57:05] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.31973552703857s
+[2025-07-01 08:57:05] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.11812615394592s
+[2025-07-01 08:57:05] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.35892605781555s
+[2025-07-01 08:57:05] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.2960913181305s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.88449788093567s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.5244917869568s
+[2025-07-01 08:57:05] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.05884408950806s
+[2025-07-01 08:57:05] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.45844531059265s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.76254534721375s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.31962299346924s
+[2025-07-01 08:57:05] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.5118260383606s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.44188499450684s
+[2025-07-01 08:57:05] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.3304316997528s
+[2025-07-01 08:57:05] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.74464964866638s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.2750883102417s
+[2025-07-01 08:57:05] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.0400447845459s
+[2025-07-01 08:57:05] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.36503434181213s
+[2025-07-01 08:57:05] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.6715350151062s
+[2025-07-01 08:57:05] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.794335603714s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.25037503242493s
+[2025-07-01 08:57:05] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.32664608955383s
+[2025-07-01 08:57:05] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.59264469146729s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.4588851928711s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.56515669822693s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.8198959827423s
+[2025-07-01 08:57:05] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.61562252044678s
+[2025-07-01 08:57:05] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.15942478179932s
+length of dataloader: 28 14336
+[2025-07-01 08:57:05] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+[GPU memory] before trainer 2.292407512664795
+Pre terminate time: 10min elapsed_time: 179.74457502365112s
+[2025-07-01 08:57:05] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 181.5756447315216s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.26583528518677s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.88644003868103s
+[2025-07-01 08:57:05] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.95006847381592s
+[2025-07-01 08:57:05] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.19447827339172s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3910529613495s
+[2025-07-01 08:57:05] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.2317771911621s
+[2025-07-01 08:57:05] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3425772190094s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.71157217025757s
+[2025-07-01 08:57:05] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.64928483963013s
+[2025-07-01 08:57:05] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.196674823761s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.0103840827942s
+[2025-07-01 08:57:05] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.53355860710144s
+[2025-07-01 08:57:05] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.20799660682678s
+length of dataloader: 28 14336
+length of dataloader: [GPU memory] before trainer28  14336
+2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.38389587402344s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.54649567604065s
+[2025-07-01 08:57:05] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3105661869049s
+[2025-07-01 08:57:05] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.55701327323914s
+[2025-07-01 08:57:05] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.3739037513733s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.306396484375s
+[2025-07-01 08:57:05] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.22698402404785s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.58894228935242s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.79484272003174s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.23841524124146s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.92344903945923s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.1525583267212s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.58931589126587s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 08:57:05] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.75982356071472s
+[2025-07-01 08:57:06] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.29284620285034s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038255.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038255.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038255
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01868
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,808] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,969] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:36,970] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,483] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:37,998] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 08:58:49,040] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,040] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,044] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,050] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,050] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,055] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,055] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,088] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,088] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,198] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,198] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,204] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,204] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,206] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,208] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,208] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,211] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,211] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,219] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,219] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,222] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,222] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,226] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,226] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,227] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,227] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,227] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 08:58:49,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,228] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,239] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,239] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,249] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,249] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,252] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,253] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,257] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,257] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,260] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,260] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,272] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,272] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,283] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,283] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,286] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,288] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,288] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,300] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,304] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,304] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,316] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,316] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,339] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,349] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,595] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,595] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,640] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,640] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,684] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,925] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,925] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,984] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,984] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:49,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:49,997] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,001] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,001] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,007] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,007] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,010] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,010] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,691] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,692] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,692] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,723] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,723] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,741] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,745] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,745] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,754] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,754] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,758] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,758] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,782] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,821] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,831] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,834] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,834] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,855] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,855] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,860] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,860] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:58:50,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 08:58:50,864] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 08:59:04,797] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 08:59:22,183] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 08:59:23,443] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 09:01:57] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.7240424156189s
+[2025-07-01 09:01:57] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.2856583595276s
+[2025-07-01 09:01:57] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.3059437274933s
+[2025-07-01 09:01:57] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.31311774253845s
+[2025-07-01 09:01:57] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 186.8426342010498s
+[2025-07-01 09:01:57] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.27460193634033s
+[2025-07-01 09:01:57] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.58034896850586s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.53948974609375s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.8767204284668s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.6350953578949s
+[2025-07-01 09:01:57] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.17372345924377s
+[2025-07-01 09:01:57] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.946551322937s
+[2025-07-01 09:01:57] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.947163105011s
+[2025-07-01 09:01:57] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.1679859161377s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.73403882980347s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.93320965766907s
+[2025-07-01 09:01:57] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.77459192276s
+[2025-07-01 09:01:57] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.71449184417725s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+[2025-07-01 09:01:57] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.8108024597168s
+Pre terminate time: 10min elapsed_time: 188.9705455303192s
+[2025-07-01 09:01:57] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.81623101234436s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.76175379753113s
+[2025-07-01 09:01:57] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.82697677612305s
+length of dataloader:length of dataloader: 28  2814336
+14336
+length of dataloader: 28 14336
+[GPU memory] before trainer [GPU memory] before trainer 2.292407512664795
+2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.00980305671692s
+[2025-07-01 09:01:57] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.3063566684723s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:57] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.86100125312805s
+length of dataloader: 28 14336
+[2025-07-01 09:01:57] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.8586766719818s
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.87530517578125s
+[2025-07-01 09:01:58] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.39407753944397s
+[2025-07-01 09:01:58] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.04404830932617s
+[2025-07-01 09:01:58] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.88598775863647s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.35117411613464s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.3726806640625s
+[2025-07-01 09:01:58] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.0832874774933s
+[2025-07-01 09:01:58] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.92747592926025s
+[2025-07-01 09:01:58] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.09379124641418s
+[2025-07-01 09:01:58] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.19229888916016s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.19884490966797s
+[2025-07-01 09:01:58] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.20034885406494s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.20093441009521s
+[2025-07-01 09:01:58] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.39897632598877s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.20916652679443s
+[2025-07-01 09:01:58] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.46580815315247s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[2025-07-01 09:01:58] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.48894619941711s
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.97889137268066s
+[2025-07-01 09:01:58] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.97930574417114s
+[2025-07-01 09:01:58] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.98443937301636s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.5020468235016s
+[2025-07-01 09:01:58] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.9302875995636s
+[2025-07-01 09:01:58] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.93019914627075s
+[2025-07-01 09:01:58] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.9377384185791s
+[2025-07-01 09:01:58] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.93913388252258s
+[2025-07-01 09:01:58] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.00355291366577s
+[2025-07-01 09:01:58] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.0038776397705s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.52710509300232s
+[2025-07-01 09:01:58] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.0127944946289s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.02126288414001s
+[2025-07-01 09:01:58] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.5401885509491s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.20625829696655s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.05517554283142s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795length of dataloader:
+ 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.06267762184143s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.01914143562317s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.55401301383972s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:01:58] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.57627320289612s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038286.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038286.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038286
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02124
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 09:10:30,122] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,706] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,713] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,744] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,752] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,755] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,762] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,784] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,850] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,857] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,884] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,886] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:33,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:33,394] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:33,920] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:33,920] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:33,962] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:33,962] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,066] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,067] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,083] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,083] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,164] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,164] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,165] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,165] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,175] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,175] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,205] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,205] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,206] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,214] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,214] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,217] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:34,296] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:34,296] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:10:46,125] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,125] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,149] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,150] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,151] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,152] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,160] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,160] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,370] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,370] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,376] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,376] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,382] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,386] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,386] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,387] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,387] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,396] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,397] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,409] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,416] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,416] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,422] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,422] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,428] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,493] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,493] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,502] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,515] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,533] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,533] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,534] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,534] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,551] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,551] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,554] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,554] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,555] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,555] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,787] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,819] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,820] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,821] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,828] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,854] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,862] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,866] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,866] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:46,868] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:46,869] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,303] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,325] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,325] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,331] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,468] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,478] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,480] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,480] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,480] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:11:04,481] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 09:11:20,374] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 09:11:21,706] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 09:13:55] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.35445713996887s
+[2025-07-01 09:13:55] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 201.59668064117432s
+[2025-07-01 09:13:55] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.26995539665222s
+[2025-07-01 09:13:55] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.44619512557983s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:55] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.14272332191467s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:55] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 201.8264696598053s
+[2025-07-01 09:13:55] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.70187664031982s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:55] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.69908165931702s
+[2025-07-01 09:13:55] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.7061002254486s
+[2025-07-01 09:13:56] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.5839924812317s
+[2025-07-01 09:13:56] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.28924942016602s
+[2025-07-01 09:13:56] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.95243191719055s
+[2025-07-01 09:13:56] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.78522086143494s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.2034502029419s
+[2025-07-01 09:13:56] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.37554931640625s
+[2025-07-01 09:13:56] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.09286189079285s
+[2025-07-01 09:13:56] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.87985610961914s
+[2025-07-01 09:13:56] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.09541821479797s
+[2025-07-01 09:13:56] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.82429432868958s
+[2025-07-01 09:13:56] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.08812403678894s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.87151789665222s
+[2025-07-01 09:13:56] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.87728786468506s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.8816032409668s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.94594931602478s
+[2025-07-01 09:13:56] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 187.95590615272522s
+[2025-07-01 09:13:56] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.7688856124878s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.14091515541077s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.1776213645935s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.92305088043213s
+[2025-07-01 09:13:56] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.92479276657104s
+[2025-07-01 09:13:56] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.50398349761963s
+[2025-07-01 09:13:56] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.50814604759216s
+[2025-07-01 09:13:56] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.1808216571808s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.2262580394745s
+[2025-07-01 09:13:56] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.95579552650452s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.1831030845642s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.99509859085083s
+[2025-07-01 09:13:56] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.99686932563782s
+[2025-07-01 09:13:56] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.87132096290588s
+[2025-07-01 09:13:56] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.06090354919434s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[2025-07-01 09:13:56] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.2874138355255s
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.3776957988739s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.0164692401886s
+[2025-07-01 09:13:56] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.89093255996704s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.8917055130005s
+[2025-07-01 09:13:56] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.60104870796204s
+[2025-07-01 09:13:56] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.30275464057922s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.60884761810303s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.10366201400757s
+[2025-07-01 09:13:56] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.62677335739136s
+[2025-07-01 09:13:56] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.26963424682617s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.0586109161377s
+[2025-07-01 09:13:56] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.2976393699646s
+[2025-07-01 09:13:56] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.06773209571838s
+[2025-07-01 09:13:56] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.31072449684143s
+[2025-07-01 09:13:56] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.3135223388672s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.95435571670532s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 189.97284388542175s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.32772946357727s
+[2025-07-01 09:13:56] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 188.1806402206421s
+[2025-07-01 09:13:56] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.36938166618347s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 202.3610863685608s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 190.39307260513306s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:13:56] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 203.1189968585968s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038294.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038294.0.out ADDED Viewed

	@@ -0,0 +1,768 @@

+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038294
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 09:15:45,395] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:45,549] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,175] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,197] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,284] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,324] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,329] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:46,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:48,763] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:48,763] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:48,885] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:48,885] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:49,514] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:49,514] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:49,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:49,515] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:49,535] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:49,535] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:50,031] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:50,031] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:50,031] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 09:15:50,095] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:50,095] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:50,099] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:15:50,099] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,861] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:50,862] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,023] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,024] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,071] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:16:01,457] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,457] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,466] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,466] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,467] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,467] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,468] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,471] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,471] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,475] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,475] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,476] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,476] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,482] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,482] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,486] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,486] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,490] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,490] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,501] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,501] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,502] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,505] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,505] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,510] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,510] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,512] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,512] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,516] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,516] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,523] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,523] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,568] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,569] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,570] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,570] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,577] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,577] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,578] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,579] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,580] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,581] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,591] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,591] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,739] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,739] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,779] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,779] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,782] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,787] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,791] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,792] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,792] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,793] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,794] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,796] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,796] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,890] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,890] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,891] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,896] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,896] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,897] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,897] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,898] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,898] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,901] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,901] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,906] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,906] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,907] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,907] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,929] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,929] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,930] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,930] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,932] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,936] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,936] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,937] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,937] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,939] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,939] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,946] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:01,947] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,276] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,282] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,284] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,284] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,324] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,324] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,378] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,378] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,386] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,392] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:04,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:16:04,395] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:16:20,231] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 09:16:29,303] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 09:16:29,934] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 09:19:03] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.53949546813965s
+[2025-07-01 09:19:03] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.08894276618958s
+[2025-07-01 09:19:03] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.17969298362732s
+[2025-07-01 09:19:03] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.5725381374359s
+[2025-07-01 09:19:04] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.6094913482666s
+[2025-07-01 09:19:04] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.82354164123535s
+[2025-07-01 09:19:04] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.67147135734558s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.38758826255798s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.3753411769867s
+[2025-07-01 09:19:04] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.7982451915741s
+[2025-07-01 09:19:04] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.58949255943298s
+[2025-07-01 09:19:04] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.0623378753662s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.6344199180603s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.6651632785797s
+[2025-07-01 09:19:04] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.92465996742249s
+[2025-07-01 09:19:04] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.67580819129944s
+[2025-07-01 09:19:04] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.93791437149048s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.69433569908142s
+[2025-07-01 09:19:04] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.5678791999817s
+[2025-07-01 09:19:04] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.17630982398987s
+[2025-07-01 09:19:04] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.1825351715088s
+[2025-07-01 09:19:04] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.1913890838623s
+[2025-07-01 09:19:04] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.60063314437866s
+[2025-07-01 09:19:04] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.97364211082458s
+[2025-07-01 09:19:04] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.97400450706482s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.56714510917664s
+[2025-07-01 09:19:04] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.22575664520264s
+[2025-07-01 09:19:04] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.02315592765808s
+[2025-07-01 09:19:04] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.02467370033264s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.78038883209229s
+[2025-07-01 09:19:04] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.78413200378418s
+[2025-07-01 09:19:04] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.04120206832886s
+[2025-07-01 09:19:04] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.0450234413147s
+[2025-07-01 09:19:04] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.59447646141052s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.60314464569092s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.03522372245789s
+[2025-07-01 09:19:04] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.0367488861084s
+[2025-07-01 09:19:04] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.07129096984863s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795[GPU memory] before trainer
+ 2.292407512664795
+[GPU memory] before trainer length of dataloader:2.292407512664795
+ 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.28140139579773s
+[2025-07-01 09:19:04] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.07610249519348s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.08507561683655s
+[2025-07-01 09:19:04] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 195.64789366722107s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 194.98654437065125s
+[2025-07-01 09:19:04] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.64663672447205s
+[2025-07-01 09:19:04] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 195.77389311790466s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 194.8103952407837s
+length of dataloader: 28 14336
+[2025-07-01 09:19:04] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 194.84082126617432s
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.11176013946533s
+[2025-07-01 09:19:04] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 194.82239317893982s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 195.04775762557983s
+[2025-07-01 09:19:04] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 195.0560109615326s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.1278281211853s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.13366270065308s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.14656853675842s
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.76397037506104s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.77731561660767s
+[2025-07-01 09:19:04] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.71926474571228s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.7852520942688s
+[2025-07-01 09:19:04] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.14857816696167s
+[2025-07-01 09:19:04] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.92455291748047s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.81509160995483s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.4306833744049s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 182.78269171714783s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:19:04] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 183.22880291938782s
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 28 14336
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params

slurm/1038301.0.err ADDED Viewed

File without changes

slurm/1038301.0.out ADDED Viewed

	@@ -0,0 +1,112 @@

+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038301
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-02107
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 2048
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 8
+DEFAULT_LEARNING_RATE: 2e-5

slurm/1038303.0.err ADDED Viewed

The diff for this file is too large to render. See raw diff

slurm/1038303.0.out ADDED Viewed

	@@ -0,0 +1,792 @@

+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 4
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 5
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 7
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 6
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 1
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 3
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 0
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+SLURM_JOB_ID = 1038303
+SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
+RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
+OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
+NNODES = 8
+NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
+NODE_RANK = 2
+GPUS_PER_NODE = 8
+MASTER_ADDR = pool0-01504
+MASTER_PORT = 25001
+GLOBAL_TRAIN_BATCH_SIZE = 1024
+GRADIENT_ACCUMULATION_STEPS = 4
+PER_DEVICE_TRAIN_BATCH_SIZE = 4
+DEFAULT_LEARNING_RATE: 2e-5
+[2025-07-01 09:21:22,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:22,972] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,111] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,735] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,799] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,799] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,803] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,855] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:23,911] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,002] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,003] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,019] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,028] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,030] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,113] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,232] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,243] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,251] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,253] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,328] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,332] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,369] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,369] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,370] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,371] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,372] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,376] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,424] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,429] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,442] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,446] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,452] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,465] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,518] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,542] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,543] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,564] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,579] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:24,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,700] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,700] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,775] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,790] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,791] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,792] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,864] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,901] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:25,906] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,124] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,215] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,241] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,306] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,306] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,313] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-07-01 09:21:26,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:26,339] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:26,459] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:26,459] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:26,560] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:26,561] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:26,944] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:26,944] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,281] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,322] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,322] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,339] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,459] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,459] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,513] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,513] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,516] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,516] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,549] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,549] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,571] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,571] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,582] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,583] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,583] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-07-01 09:21:27,616] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,617] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,621] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,621] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,623] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,624] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,636] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,636] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,637] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,637] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,643] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,643] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,661] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,661] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,662] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,662] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,679] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,680] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,798] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,799] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,828] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,854] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,856] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,856] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,862] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,865] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,865] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,890] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,890] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,892] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,892] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,930] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,931] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,932] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,975] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,975] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,984] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,984] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,986] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,986] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,988] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,988] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:27,990] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:27,990] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,007] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,007] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,034] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,034] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,054] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,054] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,056] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,114] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,114] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,186] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,186] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,277] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,277] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,330] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,361] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,361] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,410] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:28,424] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:28,424] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,112] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,112] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,153] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,158] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,158] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,181] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,181] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,378] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,379] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,436] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,436] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,474] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,474] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,528] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,528] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,605] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,605] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,635] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,635] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,709] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,709] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,775] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,775] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,782] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,784] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,784] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,786] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:29,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2025-07-01 09:21:29,786] [INFO] [comm.py:594:init_distributed] cdb=None
+[2025-07-01 09:21:43,807] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
+[2025-07-01 09:21:52,746] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
+[2025-07-01 09:21:53,403] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[dist-0-of-64] LlavaLlamaModel(
+  (llm): Qwen2ForCausalLM(
+    (model): Qwen2Model(
+      (embed_tokens): Embedding(151648, 3584)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen2DecoderLayer(
+          (self_attn): Qwen2FlashAttention2(
+            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+            (rotary_emb): Qwen2RotaryEmbedding()
+          )
+          (mlp): Qwen2MLP(
+            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        )
+      )
+      (norm): Qwen2RMSNorm((0,), eps=1e-06)
+      (rotary_emb): Qwen2RotaryEmbedding()
+    )
+    (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
+  )
+  (vision_tower): SiglipVisionTower(
+    (vision_tower): SiglipVisionModel(
+      (vision_model): SiglipVisionTransformer(
+        (embeddings): SiglipVisionEmbeddings(
+          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
+          (position_embedding): Embedding(1024, 1152)
+        )
+        (encoder): SiglipEncoder(
+          (layers): ModuleList(
+            (0-26): 27 x SiglipEncoderLayer(
+              (self_attn): SiglipFlashAttention2(
+                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+              )
+              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+              (mlp): SiglipMLP(
+                (activation_fn): PytorchGELUTanh()
+                (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+              )
+              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+            )
+          )
+        )
+        (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+      )
+    )
+  )
+  (mm_projector): MultimodalProjector(
+    (layers): Sequential(
+      (0): DownSample3x3BlockFix()
+      (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
+      (2): Linear(in_features=10368, out_features=3456, bias=True)
+      (3): GELU(approximate='none')
+      (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
+      (5): Linear(in_features=3456, out_features=3584, bias=True)
+      (6): GELU(approximate='none')
+      (7): Linear(in_features=3584, out_features=3584, bias=True)
+    )
+  )
+)
+[dist-0-of-64] Tunable parameters:
+language model True
+[dist-0-of-64] vision tower True
+[dist-0-of-64] mm projector True
+trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
+[2025-07-01 09:24:26] Rank 51: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.73019170761108s
+[2025-07-01 09:24:26] Rank 15: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.88804817199707s
+[2025-07-01 09:24:26] Rank 61: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.51190781593323s
+[2025-07-01 09:24:26] Rank 18: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.18334817886353s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:26] Rank 35: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.17961239814758s
+[2025-07-01 09:24:26] Rank 30: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.50609588623047s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:26] Rank 42: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.07143235206604s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:26] Rank 50: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.9429168701172s
+[2025-07-01 09:24:26] Rank 16: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.6384243965149s
+[2025-07-01 09:24:26] Rank 0: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.61671090126038s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:26] Rank 11: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.24142980575562s
+[2025-07-01 09:24:26] Rank 3: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.68241500854492s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 59: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.93752551078796s
+[2025-07-01 09:24:27] Rank 36: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.47626900672913s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 43: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.4129433631897s
+[2025-07-01 09:24:27] Rank 39: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.0158293247223s
+[2025-07-01 09:24:27] Rank 38: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.80202722549438s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 10: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.43888783454895s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 60: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.1183044910431s
+[2025-07-01 09:24:27] Rank 49: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.2389600276947s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 57: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.06473851203918s
+[2025-07-01 09:24:27] Rank 63: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.11095666885376s
+[2025-07-01 09:24:27] Rank 41: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.51858401298523s
+[2025-07-01 09:24:27] Rank 52: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.30554270744324s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 40: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.5245840549469s
+[2025-07-01 09:24:27] Rank 21: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.82316184043884s
+[2025-07-01 09:24:27] Rank 37: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.645690202713s
+[2025-07-01 09:24:27] Rank 56: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.1373429298401s
+[2025-07-01 09:24:27] Rank 7: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.96211647987366s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 9: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.6913924217224s
+[2025-07-01 09:24:27] Rank 32: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.7236123085022s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 26: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.0112874507904s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 1: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.98196148872375s
+[2025-07-01 09:24:27] Rank 12: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.38428473472595s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 23: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.83022689819336s
+[2025-07-01 09:24:27] Rank 62: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.28459119796753s
+[2025-07-01 09:24:27] Rank 33: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.73978686332703s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 53: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.41727900505066s
+[2025-07-01 09:24:27] Rank 34: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 177.72356247901917s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 44: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.81957077980042s
+[2025-07-01 09:24:27] Rank 13: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.6111810207367s
+[2025-07-01 09:24:27] Rank 20: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.8662896156311s
+[2025-07-01 09:24:27] Rank 58: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.25026488304138s
+[2025-07-01 09:24:27] Rank 25: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.25618314743042s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 4: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.23809123039246s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 27: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.11056566238403s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 46: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.69872188568115s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 14: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.87972497940063s
+[2025-07-01 09:24:27] Rank 8: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.7657687664032s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 5: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.11463713645935s
+[2025-07-01 09:24:27] Rank 2: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.15511107444763s
+[2025-07-01 09:24:27] Rank 22: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.94539523124695s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 48: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.58666896820068s
+[2025-07-01 09:24:27] Rank 17: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.94371223449707s
+[2025-07-01 09:24:27] Rank 54: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.51171231269836s
+[2025-07-01 09:24:27] Rank 6: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 180.14709281921387s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 19: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.9675374031067s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 28: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.3642818927765s
+[2025-07-01 09:24:27] Rank 31: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.2817325592041s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 47: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.9938542842865s
+[2025-07-01 09:24:27] Rank 24: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.28741884231567s
+[2025-07-01 09:24:27] Rank 29: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 178.41006183624268s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[2025-07-01 09:24:27] Rank 55: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.64781546592712s
+[2025-07-01 09:24:27] Rank 45: Timer for terminate callback has been set.
+Total limit: 240min
+Pre terminate time: 10min elapsed_time: 179.78166794776917s
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+length of dataloader: 52 13312
+[GPU memory] before trainer 2.292407512664795
+Parameter Offload: Total persistent parameters: 771184 in 421 params
+{'loss': 1.2648, 'grad_norm': 130.75577214745246, 'learning_rate': 2e-05, 'epoch': 0.08}
+{'loss': 1.2283, 'grad_norm': 126.67616596972118, 'learning_rate': 1.9659258262890683e-05, 'epoch': 0.15}
+[2025-07-01 09:25:57,641] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
+{'loss': 0.4188, 'grad_norm': 56.84219933724937, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.23}
+{'loss': 2.4789, 'grad_norm': 89.42666319016989, 'learning_rate': 1.7071067811865477e-05, 'epoch': 0.31}
+{'loss': 0.7853, 'grad_norm': 72.41844398977439, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.38}
+{'loss': 8.2197, 'grad_norm': 799.0148731599335, 'learning_rate': 1.2588190451025209e-05, 'epoch': 0.46}
+{'loss': 0.3008, 'grad_norm': 34.525610403243014, 'learning_rate': 1e-05, 'epoch': 0.54}
+{'loss': 0.3999, 'grad_norm': 64.98250527603693, 'learning_rate': 7.411809548974792e-06, 'epoch': 0.62}
+{'loss': 0.2575, 'grad_norm': 11.46902235575636, 'learning_rate': 5.000000000000003e-06, 'epoch': 0.69}
+[2025-07-01 09:28:31,764] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
+{'loss': 0.3174, 'grad_norm': 42.63293180170212, 'learning_rate': 2.9289321881345257e-06, 'epoch': 0.77}
+{'loss': 0.3054, 'grad_norm': 40.64988981794197, 'learning_rate': 1.339745962155613e-06, 'epoch': 0.85}
+{'loss': 0.2827, 'grad_norm': 27.588182133457394, 'learning_rate': 3.4074173710931804e-07, 'epoch': 0.92}
+{'loss': 0.2751, 'grad_norm': 17.48926557604337, 'learning_rate': 0.0, 'epoch': 1.0}
+saving llm to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/llm
+saving vision_tower to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/vision_tower
+saving mm_projector to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/mm_projector
+{'train_runtime': 323.0635, 'train_samples_per_second': 41.206, 'train_steps_per_second': 0.04, 'train_loss': 1.2718768601234143, 'epoch': 1.0}
+saving llm to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm
+saving vision_tower to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower
+saving mm_projector to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector
+[1;34mwandb[0m:
+[1;34mwandb[0m: 🚀 View run [33mNVILA-Lite-8B-quantumn-qa-train[0m at: [34mhttps://wandb.ai/ligeng-zhu/vila/runs/NVILA-Lite-8B-quantumn-qa-train[0m