Ligeng-Zhu commited on
Commit
342f304
·
verified ·
1 Parent(s): 4fffd94

Upload files with `vila-upload`.

Browse files

Upload model/config.json
Upload model/trainer_state.json
Upload model/llm/merges.txt
Upload model/llm/generation_config.json
Upload model/llm/model.safetensors.index.json
Upload model/llm/config.json
Upload model/llm/tokenizer_config.json
Upload model/llm/tokenizer.json
Upload model/llm/model-00002-of-00004.safetensors
Upload model/llm/model-00003-of-00004.safetensors
Upload model/llm/vocab.json
Upload model/llm/special_tokens_map.json
Upload model/llm/model-00001-of-00004.safetensors
Upload model/llm/added_tokens.json
Upload model/llm/model-00004-of-00004.safetensors
Upload model/mm_projector/config.json
Upload model/mm_projector/model.safetensors
Upload model/vision_tower/config.json
Upload model/vision_tower/model.safetensors
Upload model/vision_tower/preprocessor_config.json
Upload slurm/1038294.0.err
Upload slurm/1038301.0.err
Upload slurm/1038294.0.out
Upload slurm/1038241.0.err
Upload slurm/1038255.0.out
Upload slurm/1038254.0.out
Upload slurm/1038241.0.out
Upload slurm/1038247.0.out
Upload slurm/1038255.0.err
Upload slurm/1038286.0.err
Upload slurm/1038254.0.err
Upload slurm/1038301.0.out
Upload slurm/1038303.0.err
Upload slurm/1038303.0.out
Upload slurm/1038247.0.err
Upload slurm/1038286.0.out

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model/llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
model/config.json ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Ubit": 100,
3
+ "_attn_implementation_autoset": true,
4
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model",
5
+ "architectures": [
6
+ "LlavaLlamaModel"
7
+ ],
8
+ "babit": "E5M2",
9
+ "bobit": "E5M2",
10
+ "bwbit": "E5M2",
11
+ "chat_template": null,
12
+ "col_blocksize": -1,
13
+ "col_blocksize_optimizer": 128,
14
+ "draw_distribution_backward": false,
15
+ "draw_distribution_forward": false,
16
+ "drop_path_rate": 0.0,
17
+ "dynamic_s2": false,
18
+ "epsilon": 1e-10,
19
+ "epsilon_optimizer": 1e-15,
20
+ "fabit": "E4M3",
21
+ "first_order_bit": null,
22
+ "first_order_quant_type": null,
23
+ "fobit": "E4M3",
24
+ "fps": 0.0,
25
+ "fwbit": "E4M3",
26
+ "group_size": -1,
27
+ "hidden_size": 3584,
28
+ "high_res_pos_embed": false,
29
+ "image_aspect_ratio": "dynamic",
30
+ "image_encoder": {
31
+ "_target_": "llava.model.encoders.BasicImageEncoder"
32
+ },
33
+ "interpolate_mode": "linear",
34
+ "llm_cfg": {
35
+ "_attn_implementation_autoset": false,
36
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm",
37
+ "add_cross_attention": false,
38
+ "architectures": [
39
+ "Qwen2ForCausalLM"
40
+ ],
41
+ "attention_dropout": 0.0,
42
+ "bad_words_ids": null,
43
+ "begin_suppress_tokens": null,
44
+ "bos_token_id": 151643,
45
+ "chunk_size_feed_forward": 0,
46
+ "cross_attention_hidden_size": null,
47
+ "decoder_start_token_id": null,
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "early_stopping": false,
51
+ "encoder_no_repeat_ngram_size": 0,
52
+ "eos_token_id": 151645,
53
+ "exponential_decay_length_penalty": null,
54
+ "finetuning_task": null,
55
+ "forced_bos_token_id": null,
56
+ "forced_eos_token_id": null,
57
+ "hidden_act": "silu",
58
+ "hidden_size": 3584,
59
+ "id2label": {
60
+ "0": "LABEL_0",
61
+ "1": "LABEL_1"
62
+ },
63
+ "initializer_range": 0.02,
64
+ "intermediate_size": 18944,
65
+ "is_decoder": false,
66
+ "is_encoder_decoder": false,
67
+ "label2id": {
68
+ "LABEL_0": 0,
69
+ "LABEL_1": 1
70
+ },
71
+ "length_penalty": 1.0,
72
+ "max_length": 20,
73
+ "max_position_embeddings": 32768,
74
+ "max_window_layers": 28,
75
+ "min_length": 0,
76
+ "model_max_length": 4096,
77
+ "model_type": "qwen2",
78
+ "no_repeat_ngram_size": 0,
79
+ "num_attention_heads": 28,
80
+ "num_beam_groups": 1,
81
+ "num_beams": 1,
82
+ "num_hidden_layers": 28,
83
+ "num_key_value_heads": 4,
84
+ "num_return_sequences": 1,
85
+ "output_attentions": false,
86
+ "output_hidden_states": false,
87
+ "output_scores": false,
88
+ "pad_token_id": null,
89
+ "prefix": null,
90
+ "problem_type": null,
91
+ "pruned_heads": {},
92
+ "remove_invalid_values": false,
93
+ "repetition_penalty": 1.0,
94
+ "return_dict": true,
95
+ "return_dict_in_generate": false,
96
+ "rms_norm_eps": 1e-06,
97
+ "rope_scaling": null,
98
+ "rope_theta": 1000000.0,
99
+ "sep_token_id": null,
100
+ "sliding_window": null,
101
+ "suppress_tokens": null,
102
+ "task_specific_params": null,
103
+ "temperature": 1.0,
104
+ "tf_legacy_loss": false,
105
+ "tie_encoder_decoder": false,
106
+ "tie_word_embeddings": false,
107
+ "tokenizer_class": null,
108
+ "tokenizer_model_max_length": 4096,
109
+ "tokenizer_padding_side": "right",
110
+ "top_k": 50,
111
+ "top_p": 1.0,
112
+ "torch_dtype": "bfloat16",
113
+ "torchscript": false,
114
+ "typical_p": 1.0,
115
+ "use_bfloat16": false,
116
+ "use_cache": true,
117
+ "use_sliding_window": false,
118
+ "vocab_size": 151659
119
+ },
120
+ "look_close_mode": "after_image",
121
+ "max_tiles": 12,
122
+ "min_blockunit_col": 4,
123
+ "min_blockunit_row": 4,
124
+ "min_tiles": 1,
125
+ "mlp_path": null,
126
+ "mm_hidden_size": 1152,
127
+ "mm_low_res_token_num": null,
128
+ "mm_projector": "mlp_downsample_3x3_fix",
129
+ "mm_projector_cfg": {
130
+ "_attn_implementation_autoset": false,
131
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector",
132
+ "add_cross_attention": false,
133
+ "architectures": [
134
+ "MultimodalProjector"
135
+ ],
136
+ "bad_words_ids": null,
137
+ "begin_suppress_tokens": null,
138
+ "bos_token_id": null,
139
+ "chunk_size_feed_forward": 0,
140
+ "cross_attention_hidden_size": null,
141
+ "decoder_start_token_id": null,
142
+ "diversity_penalty": 0.0,
143
+ "do_sample": false,
144
+ "early_stopping": false,
145
+ "encoder_no_repeat_ngram_size": 0,
146
+ "eos_token_id": null,
147
+ "exponential_decay_length_penalty": null,
148
+ "finetuning_task": null,
149
+ "forced_bos_token_id": null,
150
+ "forced_eos_token_id": null,
151
+ "id2label": {
152
+ "0": "LABEL_0",
153
+ "1": "LABEL_1"
154
+ },
155
+ "is_decoder": false,
156
+ "is_encoder_decoder": false,
157
+ "label2id": {
158
+ "LABEL_0": 0,
159
+ "LABEL_1": 1
160
+ },
161
+ "length_penalty": 1.0,
162
+ "max_length": 20,
163
+ "min_length": 0,
164
+ "mm_projector_type": "mlp_downsample_3x3_fix",
165
+ "model_type": "v2l_projector",
166
+ "no_repeat_ngram_size": 0,
167
+ "num_beam_groups": 1,
168
+ "num_beams": 1,
169
+ "num_return_sequences": 1,
170
+ "output_attentions": false,
171
+ "output_hidden_states": false,
172
+ "output_scores": false,
173
+ "pad_token_id": null,
174
+ "prefix": null,
175
+ "problem_type": null,
176
+ "pruned_heads": {},
177
+ "remove_invalid_values": false,
178
+ "repetition_penalty": 1.0,
179
+ "return_dict": true,
180
+ "return_dict_in_generate": false,
181
+ "sep_token_id": null,
182
+ "suppress_tokens": null,
183
+ "task_specific_params": null,
184
+ "temperature": 1.0,
185
+ "tf_legacy_loss": false,
186
+ "tie_encoder_decoder": false,
187
+ "tie_word_embeddings": true,
188
+ "tokenizer_class": null,
189
+ "top_k": 50,
190
+ "top_p": 1.0,
191
+ "torch_dtype": "bfloat16",
192
+ "torchscript": false,
193
+ "typical_p": 1.0,
194
+ "use_bfloat16": false
195
+ },
196
+ "mm_projector_lr": null,
197
+ "mm_scale_num": null,
198
+ "mm_use_bos_eos_tokens": true,
199
+ "mm_use_im_patch_token": false,
200
+ "mm_use_im_start_end": false,
201
+ "mm_vision_select_feature": "cls_patch",
202
+ "mm_vision_select_layer": -2,
203
+ "model_dtype": "torch.bfloat16",
204
+ "model_name_or_path": "Efficient-Large-Model/NVILA-Lite-8B",
205
+ "model_type": "llava_llama",
206
+ "num_look_close": 1,
207
+ "num_time_tokens": 0,
208
+ "num_token_look_close": null,
209
+ "num_video_frames": 8,
210
+ "pad_block": false,
211
+ "pad_to_multiple_of": 0,
212
+ "ps3": false,
213
+ "ps3_dynamic_aspect_ratio": false,
214
+ "ps3_grad_checkpointing": false,
215
+ "qchoice": "none",
216
+ "quantize_model": false,
217
+ "refine_attn_blocksize": false,
218
+ "refine_col_blocksize": 4,
219
+ "refine_ln_blocksize": false,
220
+ "refine_ln_blocksize_but_only_backward": false,
221
+ "refine_ln_blocksize_but_only_forward": false,
222
+ "refine_ln_pertoken": false,
223
+ "refine_mlp_blocksize": false,
224
+ "refine_residual_fp": false,
225
+ "refine_row_blocksize": 4,
226
+ "resume_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model",
227
+ "row_blocksize": -1,
228
+ "row_blocksize_optimizer": 1,
229
+ "s2": false,
230
+ "s2_max_split_size": 336,
231
+ "s2_resize_output_to_scale_idx": 0,
232
+ "s2_scales": "336,672,1008",
233
+ "second_order_bit": null,
234
+ "second_order_quant_type": null,
235
+ "soft_ce_std": 1.0,
236
+ "sound_mm_projector": "mlp",
237
+ "sound_mm_projector_cfg": null,
238
+ "sound_tower": "",
239
+ "sound_tower_cfg": null,
240
+ "speech_mm_projector": "mlp",
241
+ "speech_mm_projector_cfg": null,
242
+ "speech_tower": "",
243
+ "speech_tower_cfg": null,
244
+ "symm": true,
245
+ "time_token_format": "<t{t}>",
246
+ "time_token_ids": [],
247
+ "top_down_prompt_head_type": "mlp",
248
+ "transformers_version": "4.46.0",
249
+ "tune_language_model": true,
250
+ "tune_mm_projector": true,
251
+ "tune_vision_tower": true,
252
+ "unified_audio_encoder": true,
253
+ "use_quantize_optimizer": false,
254
+ "version": "auto",
255
+ "video_encoder": {
256
+ "_target_": "llava.model.encoders.BasicVideoEncoder"
257
+ },
258
+ "video_max_tiles": 1,
259
+ "vision_resolution": -1,
260
+ "vision_tower": "Efficient-Large-Model/paligemma-siglip-so400m-patch14-448",
261
+ "vision_tower_cfg": {
262
+ "_attn_implementation_autoset": false,
263
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower",
264
+ "add_cross_attention": false,
265
+ "architectures": [
266
+ "SiglipVisionModel"
267
+ ],
268
+ "attention_dropout": 0.0,
269
+ "bad_words_ids": null,
270
+ "begin_suppress_tokens": null,
271
+ "bos_token_id": null,
272
+ "chunk_size_feed_forward": 0,
273
+ "cross_attention_hidden_size": null,
274
+ "decoder_start_token_id": null,
275
+ "diversity_penalty": 0.0,
276
+ "do_sample": false,
277
+ "early_stopping": false,
278
+ "encoder_no_repeat_ngram_size": 0,
279
+ "eos_token_id": null,
280
+ "exponential_decay_length_penalty": null,
281
+ "finetuning_task": null,
282
+ "forced_bos_token_id": null,
283
+ "forced_eos_token_id": null,
284
+ "hidden_act": "gelu_pytorch_tanh",
285
+ "hidden_size": 1152,
286
+ "id2label": {
287
+ "0": "LABEL_0",
288
+ "1": "LABEL_1"
289
+ },
290
+ "image_size": 448,
291
+ "intermediate_size": 4304,
292
+ "is_decoder": false,
293
+ "is_encoder_decoder": false,
294
+ "label2id": {
295
+ "LABEL_0": 0,
296
+ "LABEL_1": 1
297
+ },
298
+ "layer_norm_eps": 1e-06,
299
+ "length_penalty": 1.0,
300
+ "max_length": 20,
301
+ "min_length": 0,
302
+ "model_type": "siglip_vision_model",
303
+ "no_repeat_ngram_size": 0,
304
+ "num_attention_heads": 16,
305
+ "num_beam_groups": 1,
306
+ "num_beams": 1,
307
+ "num_channels": 3,
308
+ "num_hidden_layers": 27,
309
+ "num_image_tokens": 256,
310
+ "num_return_sequences": 1,
311
+ "output_attentions": false,
312
+ "output_hidden_states": false,
313
+ "output_scores": false,
314
+ "pad_token_id": null,
315
+ "patch_size": 14,
316
+ "prefix": null,
317
+ "problem_type": null,
318
+ "projection_dim": 2048,
319
+ "projector_hidden_act": "gelu_fast",
320
+ "pruned_heads": {},
321
+ "remove_invalid_values": false,
322
+ "repetition_penalty": 1.0,
323
+ "return_dict": true,
324
+ "return_dict_in_generate": false,
325
+ "sep_token_id": null,
326
+ "suppress_tokens": null,
327
+ "task_specific_params": null,
328
+ "temperature": 1.0,
329
+ "tf_legacy_loss": false,
330
+ "tie_encoder_decoder": false,
331
+ "tie_word_embeddings": true,
332
+ "tokenizer_class": null,
333
+ "top_k": 50,
334
+ "top_p": 1.0,
335
+ "torch_dtype": "bfloat16",
336
+ "torchscript": false,
337
+ "typical_p": 1.0,
338
+ "use_bfloat16": false,
339
+ "vision_use_head": false
340
+ },
341
+ "vision_tower_lr": null,
342
+ "weight_memory_efficient": true,
343
+ "xvila_mode": true
344
+ }
model/llm/added_tokens.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<image>": 151649,
3
+ "<vila/sentinel>": 151648,
4
+ "<vila/video>": 151650,
5
+ "<|endoftext|>": 151643,
6
+ "<|im_end|>": 151645,
7
+ "<|im_start|>": 151644,
8
+ "<|image_bos|>": 151651,
9
+ "<|image_eos|>": 151652,
10
+ "<|sound_bos|>": 151657,
11
+ "<|sound_eos|>": 151658,
12
+ "<|speech_bos|>": 151655,
13
+ "<|speech_eos|>": 151656,
14
+ "<|video_bos|>": 151653,
15
+ "<|video_eos|>": 151654,
16
+ "[BOS]": 151646,
17
+ "[PAD]": 151647
18
+ }
model/llm/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 28,
15
+ "model_max_length": 4096,
16
+ "model_type": "qwen2",
17
+ "num_attention_heads": 28,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 4,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000.0,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": false,
25
+ "tokenizer_model_max_length": 4096,
26
+ "tokenizer_padding_side": "right",
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.46.0",
29
+ "use_cache": true,
30
+ "use_sliding_window": false,
31
+ "vocab_size": 151659
32
+ }
model/llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.46.0"
14
+ }
model/llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/llm/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d9e7cde1d0d04f346241f1a173c7c0a5259b77f263a2b6dc517fca2e39f4b08
3
+ size 4874757736
model/llm/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:512c3fa5bf6e95a27d74e9dc6dc290d84b6657163fbc5ab814fd95bc87aa08ea
3
+ size 4932751008
model/llm/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb3cb599b80fd7c4e189e24f795b0a30aa1eb5557243b81bc4b01dae085f8a51
3
+ size 4330865200
model/llm/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da688388a27fb804d273baf75b638c157292b03fadf5521baa58909d148b3aa6
3
+ size 1087091840
model/llm/model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 15225426944
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
model/llm/special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|sound_bos|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|sound_eos|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": {
19
+ "content": "[BOS]",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "eos_token": {
26
+ "content": "<|im_end|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "pad_token": {
33
+ "content": "[PAD]",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
model/llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c0a6b3376d17dcabb3fc580d5bad9e566dd613e5b02faf633da5fc1d6416c35
3
+ size 11420711
model/llm/tokenizer_config.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "[BOS]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<vila/sentinel>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<image>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<vila/video>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|image_bos|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|image_eos|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|video_bos|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|video_eos|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|speech_bos|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|speech_eos|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<|sound_bos|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "151658": {
125
+ "content": "<|sound_eos|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ }
132
+ },
133
+ "additional_special_tokens": [
134
+ "<|sound_bos|>",
135
+ "<|sound_eos|>"
136
+ ],
137
+ "bos_token": "[BOS]",
138
+ "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
139
+ "clean_up_tokenization_spaces": false,
140
+ "eos_token": "<|im_end|>",
141
+ "errors": "replace",
142
+ "legacy": false,
143
+ "model_max_length": 4096,
144
+ "pad_token": "[PAD]",
145
+ "padding_side": "right",
146
+ "split_special_tokens": false,
147
+ "tokenizer_class": "Qwen2Tokenizer",
148
+ "unk_token": null
149
+ }
model/llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model/mm_projector/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector",
3
+ "architectures": [
4
+ "MultimodalProjector"
5
+ ],
6
+ "mm_projector_type": "mlp_downsample_3x3_fix",
7
+ "model_type": "v2l_projector",
8
+ "torch_dtype": "bfloat16",
9
+ "transformers_version": "4.46.0"
10
+ }
model/mm_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f2a27121e53447b4e377deecb43822ee3ab885dde49b399304625ca15672cb
3
+ size 122203760
model/trainer_state.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 13,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08,
13
+ "grad_norm": 130.75577214745246,
14
+ "learning_rate": 2e-05,
15
+ "loss": 1.2648,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.15,
20
+ "grad_norm": 126.67616596972118,
21
+ "learning_rate": 1.9659258262890683e-05,
22
+ "loss": 1.2283,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.23,
27
+ "grad_norm": 56.84219933724937,
28
+ "learning_rate": 1.866025403784439e-05,
29
+ "loss": 0.4188,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.31,
34
+ "grad_norm": 89.42666319016989,
35
+ "learning_rate": 1.7071067811865477e-05,
36
+ "loss": 2.4789,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.38,
41
+ "grad_norm": 72.41844398977439,
42
+ "learning_rate": 1.5000000000000002e-05,
43
+ "loss": 0.7853,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.46,
48
+ "grad_norm": 799.0148731599335,
49
+ "learning_rate": 1.2588190451025209e-05,
50
+ "loss": 8.2197,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.54,
55
+ "grad_norm": 34.525610403243014,
56
+ "learning_rate": 1e-05,
57
+ "loss": 0.3008,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.62,
62
+ "grad_norm": 64.98250527603693,
63
+ "learning_rate": 7.411809548974792e-06,
64
+ "loss": 0.3999,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.69,
69
+ "grad_norm": 11.46902235575636,
70
+ "learning_rate": 5.000000000000003e-06,
71
+ "loss": 0.2575,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.77,
76
+ "grad_norm": 42.63293180170212,
77
+ "learning_rate": 2.9289321881345257e-06,
78
+ "loss": 0.3174,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.85,
83
+ "grad_norm": 40.64988981794197,
84
+ "learning_rate": 1.339745962155613e-06,
85
+ "loss": 0.3054,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.92,
90
+ "grad_norm": 27.588182133457394,
91
+ "learning_rate": 3.4074173710931804e-07,
92
+ "loss": 0.2827,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "grad_norm": 17.48926557604337,
98
+ "learning_rate": 0.0,
99
+ "loss": 0.2751,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 1.0,
104
+ "step": 13,
105
+ "total_flos": 0.0,
106
+ "train_loss": 1.2718768601234143,
107
+ "train_runtime": 323.0635,
108
+ "train_samples_per_second": 41.206,
109
+ "train_steps_per_second": 0.04
110
+ }
111
+ ],
112
+ "logging_steps": 1.0,
113
+ "max_steps": 13,
114
+ "num_input_tokens_seen": 0,
115
+ "num_train_epochs": 1,
116
+ "save_steps": 100,
117
+ "stateful_callbacks": {
118
+ "TrainerControl": {
119
+ "args": {
120
+ "should_epoch_stop": false,
121
+ "should_evaluate": false,
122
+ "should_log": false,
123
+ "should_save": false,
124
+ "should_training_stop": false
125
+ },
126
+ "attributes": {}
127
+ }
128
+ },
129
+ "total_flos": 0.0,
130
+ "train_batch_size": 4,
131
+ "trial_name": null,
132
+ "trial_params": null
133
+ }
model/vision_tower/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower",
3
+ "architectures": [
4
+ "SiglipVisionModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_size": 1152,
9
+ "image_size": 448,
10
+ "intermediate_size": 4304,
11
+ "layer_norm_eps": 1e-06,
12
+ "model_type": "siglip_vision_model",
13
+ "num_attention_heads": 16,
14
+ "num_channels": 3,
15
+ "num_hidden_layers": 27,
16
+ "num_image_tokens": 256,
17
+ "patch_size": 14,
18
+ "projection_dim": 2048,
19
+ "projector_hidden_act": "gelu_fast",
20
+ "torch_dtype": "bfloat16",
21
+ "transformers_version": "4.46.0",
22
+ "vision_use_head": false
23
+ }
model/vision_tower/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:079b035a80cb54de801d4748e8b56b65be4116925c8159e679ff45cdb46e26a5
3
+ size 826707904
model/vision_tower/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "SiglipImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "processor_class": "SiglipProcessor",
18
+ "resample": 3,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 448,
22
+ "width": 448
23
+ }
24
+ }
slurm/1038241.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038241.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038241
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
7
+ NODE_RANK = 2
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01504
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038241
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
21
+ NODE_RANK = 3
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-01504
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038241
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
35
+ NODE_RANK = 5
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-01504
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038241
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
49
+ NODE_RANK = 1
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-01504
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038241
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
63
+ NODE_RANK = 0
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-01504
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038241
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
77
+ NODE_RANK = 7
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01504
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038241
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
91
+ NODE_RANK = 4
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01504
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038241
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
105
+ NODE_RANK = 6
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01504
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 08:44:47,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 08:44:47,633] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
130
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
131
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
132
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
133
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
134
+ [2025-07-01 08:44:47,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
135
+ [2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
136
+ [2025-07-01 08:44:47,699] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
137
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
138
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 08:44:47,824] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 08:44:47,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 08:44:47,859] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 08:44:47,860] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 08:44:48,568] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 08:44:49,137] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 08:44:49,318] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 08:44:58,594] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
178
+ [2025-07-01 08:44:58,594] [INFO] [comm.py:594:init_distributed] cdb=None
179
+ [2025-07-01 08:44:58,612] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
180
+ [2025-07-01 08:44:58,612] [INFO] [comm.py:594:init_distributed] cdb=None
181
+ [2025-07-01 08:44:58,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
182
+ [2025-07-01 08:44:58,616] [INFO] [comm.py:594:init_distributed] cdb=None
183
+ [2025-07-01 08:44:58,617] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
184
+ [2025-07-01 08:44:58,617] [INFO] [comm.py:594:init_distributed] cdb=None
185
+ [2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
186
+ [2025-07-01 08:44:58,624] [INFO] [comm.py:594:init_distributed] cdb=None
187
+ [2025-07-01 08:44:58,623] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
188
+ [2025-07-01 08:44:58,623] [INFO] [comm.py:594:init_distributed] cdb=None
189
+ [2025-07-01 08:44:58,624] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
190
+ [2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None
191
+ [2025-07-01 08:44:58,625] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
192
+ [2025-07-01 08:44:58,625] [INFO] [comm.py:594:init_distributed] cdb=None
193
+ [2025-07-01 08:44:58,638] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
194
+ [2025-07-01 08:44:58,638] [INFO] [comm.py:594:init_distributed] cdb=None
195
+ [2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
196
+ [2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None
197
+ [2025-07-01 08:44:58,643] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
198
+ [2025-07-01 08:44:58,643] [INFO] [comm.py:594:init_distributed] cdb=None
199
+ [2025-07-01 08:44:58,644] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
200
+ [2025-07-01 08:44:58,644] [INFO] [comm.py:594:init_distributed] cdb=None
201
+ [2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
202
+ [2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None
203
+ [2025-07-01 08:44:58,645] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
204
+ [2025-07-01 08:44:58,645] [INFO] [comm.py:594:init_distributed] cdb=None
205
+ [2025-07-01 08:44:58,652] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
206
+ [2025-07-01 08:44:58,652] [INFO] [comm.py:594:init_distributed] cdb=None
207
+ [2025-07-01 08:44:58,653] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
208
+ [2025-07-01 08:44:58,653] [INFO] [comm.py:594:init_distributed] cdb=None
209
+ [2025-07-01 08:44:58,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
210
+ [2025-07-01 08:44:58,655] [INFO] [comm.py:594:init_distributed] cdb=None
211
+ [2025-07-01 08:44:58,656] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
212
+ [2025-07-01 08:44:58,656] [INFO] [comm.py:594:init_distributed] cdb=None
213
+ [2025-07-01 08:44:58,659] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
214
+ [2025-07-01 08:44:58,659] [INFO] [comm.py:594:init_distributed] cdb=None
215
+ [2025-07-01 08:44:58,661] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
216
+ [2025-07-01 08:44:58,661] [INFO] [comm.py:594:init_distributed] cdb=None
217
+ [2025-07-01 08:44:58,664] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
218
+ [2025-07-01 08:44:58,664] [INFO] [comm.py:594:init_distributed] cdb=None
219
+ [2025-07-01 08:44:58,671] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
220
+ [2025-07-01 08:44:58,671] [INFO] [comm.py:594:init_distributed] cdb=None
221
+ [2025-07-01 08:44:58,672] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
222
+ [2025-07-01 08:44:58,672] [INFO] [comm.py:594:init_distributed] cdb=None
223
+ [2025-07-01 08:44:58,675] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
224
+ [2025-07-01 08:44:58,675] [INFO] [comm.py:594:init_distributed] cdb=None
225
+ [2025-07-01 08:44:58,681] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
226
+ [2025-07-01 08:44:58,681] [INFO] [comm.py:594:init_distributed] cdb=None
227
+ [2025-07-01 08:44:58,682] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
228
+ [2025-07-01 08:44:58,682] [INFO] [comm.py:594:init_distributed] cdb=None
229
+ [2025-07-01 08:44:58,686] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
230
+ [2025-07-01 08:44:58,686] [INFO] [comm.py:594:init_distributed] cdb=None
231
+ [2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
232
+ [2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None
233
+ [2025-07-01 08:44:58,689] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
234
+ [2025-07-01 08:44:58,689] [INFO] [comm.py:594:init_distributed] cdb=None
235
+ [2025-07-01 08:44:58,697] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
236
+ [2025-07-01 08:44:58,697] [INFO] [comm.py:594:init_distributed] cdb=None
237
+ [2025-07-01 08:44:58,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
238
+ [2025-07-01 08:44:58,698] [INFO] [comm.py:594:init_distributed] cdb=None
239
+ [2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
240
+ [2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None
241
+ [2025-07-01 08:44:58,731] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
242
+ [2025-07-01 08:44:58,731] [INFO] [comm.py:594:init_distributed] cdb=None
243
+ [2025-07-01 08:44:58,755] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
244
+ [2025-07-01 08:44:58,755] [INFO] [comm.py:594:init_distributed] cdb=None
245
+ [2025-07-01 08:44:58,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
246
+ [2025-07-01 08:44:58,756] [INFO] [comm.py:594:init_distributed] cdb=None
247
+ [2025-07-01 08:44:58,781] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
248
+ [2025-07-01 08:44:58,781] [INFO] [comm.py:594:init_distributed] cdb=None
249
+ [2025-07-01 08:44:58,783] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
250
+ [2025-07-01 08:44:58,783] [INFO] [comm.py:594:init_distributed] cdb=None
251
+ [2025-07-01 08:44:58,784] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
252
+ [2025-07-01 08:44:58,784] [INFO] [comm.py:594:init_distributed] cdb=None
253
+ [2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
254
+ [2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None
255
+ [2025-07-01 08:44:58,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
256
+ [2025-07-01 08:44:58,786] [INFO] [comm.py:594:init_distributed] cdb=None
257
+ [2025-07-01 08:44:59,343] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
258
+ [2025-07-01 08:44:59,343] [INFO] [comm.py:594:init_distributed] cdb=None
259
+ [2025-07-01 08:44:59,344] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
260
+ [2025-07-01 08:44:59,344] [INFO] [comm.py:594:init_distributed] cdb=None
261
+ [2025-07-01 08:44:59,347] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
262
+ [2025-07-01 08:44:59,347] [INFO] [comm.py:594:init_distributed] cdb=None
263
+ [2025-07-01 08:44:59,348] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
264
+ [2025-07-01 08:44:59,348] [INFO] [comm.py:594:init_distributed] cdb=None
265
+ [2025-07-01 08:44:59,381] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
266
+ [2025-07-01 08:44:59,381] [INFO] [comm.py:594:init_distributed] cdb=None
267
+ [2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
268
+ [2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None
269
+ [2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
270
+ [2025-07-01 08:44:59,382] [INFO] [comm.py:594:init_distributed] cdb=None
271
+ [2025-07-01 08:44:59,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
272
+ [2025-07-01 08:44:59,383] [INFO] [comm.py:594:init_distributed] cdb=None
273
+ [2025-07-01 08:45:00,103] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
274
+ [2025-07-01 08:45:00,103] [INFO] [comm.py:594:init_distributed] cdb=None
275
+ [2025-07-01 08:45:00,104] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
276
+ [2025-07-01 08:45:00,104] [INFO] [comm.py:594:init_distributed] cdb=None
277
+ [2025-07-01 08:45:00,112] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
278
+ [2025-07-01 08:45:00,112] [INFO] [comm.py:594:init_distributed] cdb=None
279
+ [2025-07-01 08:45:00,133] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
280
+ [2025-07-01 08:45:00,133] [INFO] [comm.py:594:init_distributed] cdb=None
281
+ [2025-07-01 08:45:00,136] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
282
+ [2025-07-01 08:45:00,137] [INFO] [comm.py:594:init_distributed] cdb=None
283
+ [2025-07-01 08:45:00,143] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
284
+ [2025-07-01 08:45:00,143] [INFO] [comm.py:594:init_distributed] cdb=None
285
+ [2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
286
+ [2025-07-01 08:45:00,146] [INFO] [comm.py:594:init_distributed] cdb=None
287
+ [2025-07-01 08:45:00,146] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
288
+ [2025-07-01 08:45:00,147] [INFO] [comm.py:594:init_distributed] cdb=None
289
+ [2025-07-01 08:45:00,489] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
290
+ [2025-07-01 08:45:00,489] [INFO] [comm.py:594:init_distributed] cdb=None
291
+ [2025-07-01 08:45:00,520] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
292
+ [2025-07-01 08:45:00,520] [INFO] [comm.py:594:init_distributed] cdb=None
293
+ [2025-07-01 08:45:00,548] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
294
+ [2025-07-01 08:45:00,548] [INFO] [comm.py:594:init_distributed] cdb=None
295
+ [2025-07-01 08:45:00,548] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
296
+ [2025-07-01 08:45:00,550] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 08:45:00,550] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 08:45:00,582] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 08:45:00,582] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 08:45:00,588] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 08:45:00,588] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 08:45:00,590] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 08:45:00,590] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 08:45:00,600] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 08:45:00,600] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 08:45:15,555] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 08:45:31,263] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 08:45:32,600] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+
326
+
327
+
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
331
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
334
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
335
+
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ [dist-0-of-64] trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
347
+
348
+
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
355
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
356
+ LlavaLlamaModel(
357
+ (llm): Qwen2ForCausalLM(
358
+ (model): Qwen2Model(
359
+ (embed_tokens): Embedding(151648, 3584)
360
+ (layers): ModuleList(
361
+ (0-27): 28 x Qwen2DecoderLayer(
362
+ (self_attn): Qwen2FlashAttention2(
363
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
364
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
365
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
366
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
367
+ (rotary_emb): Qwen2RotaryEmbedding()
368
+ )
369
+ (mlp): Qwen2MLP(
370
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
371
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
372
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
373
+ (act_fn): SiLU()
374
+ )
375
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
376
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
377
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
378
+ )
379
+ )
380
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
381
+ (rotary_emb): Qwen2RotaryEmbedding()
382
+ )
383
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
384
+ )
385
+ (vision_tower): SiglipVisionTower(
386
+ (vision_tower): SiglipVisionModel(
387
+ (vision_model): SiglipVisionTransformer(
388
+ (embeddings): SiglipVisionEmbeddings(
389
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
390
+ (position_embedding): Embedding(1024, 1152)
391
+ )
392
+ (encoder): SiglipEncoder(
393
+ (layers): ModuleList(
394
+ (0-26): 27 x SiglipEncoderLayer(
395
+ (self_attn): SiglipFlashAttention2(
396
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
397
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
398
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
399
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
400
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
401
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
402
+ )
403
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
404
+ (mlp): SiglipMLP(
405
+ (activation_fn): PytorchGELUTanh()
406
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
407
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
408
+ )
409
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
410
+ )
411
+ )
412
+ )
413
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
414
+ )
415
+ )
416
+ )
417
+ (mm_projector): MultimodalProjector(
418
+ (layers): Sequential(
419
+ (0): DownSample3x3BlockFix()
420
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
421
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
422
+ (3): GELU(approximate='none')
423
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
424
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
425
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
426
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
427
+ (6): GELU(approximate='none')
428
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
429
+ )
430
+ )
431
+ )
432
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
433
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
434
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
435
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
436
+
437
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
438
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
439
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
440
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
441
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
442
+
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 08:48:06] Rank 32: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 187.84421372413635s
451
+ [2025-07-01 08:48:06] Rank 16: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 187.99013113975525s
454
+ [2025-07-01 08:48:06] Rank 60: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 188.01411652565002s
457
+ [2025-07-01 08:48:06] Rank 49: Timer for terminate callback has been set.
458
+ Total limit: 240min
459
+ Pre terminate time: 10min elapsed_time: 187.33525800704956s
460
+ [2025-07-01 08:48:06] Rank 3: Timer for terminate callback has been set.
461
+ Total limit: 240min
462
+ Pre terminate time: 10min elapsed_time: 186.23286867141724s
463
+ [2025-07-01 08:48:06] Rank 37: Timer for terminate callback has been set.
464
+ Total limit: 240min
465
+ Pre terminate time: 10min elapsed_time: 188.08282613754272s
466
+ length of dataloader: 28 14336
467
+ length of dataloader: 28 14336
468
+ [GPU memory] before trainer 2.292407512664795
469
+ [GPU memory] before trainer 2.292407512664795
470
+ [2025-07-01 08:48:06] Rank 40: Timer for terminate callback has been set.
471
+ Total limit: 240min
472
+ Pre terminate time: 10min elapsed_time: 188.24676775932312s
473
+ length of dataloader: 28 14336
474
+ [GPU memory] before trainer 2.292407512664795
475
+ [2025-07-01 08:48:06] Rank 62: Timer for terminate callback has been set.
476
+ Total limit: 240min
477
+ Pre terminate time: 10min elapsed_time: 188.2327139377594s
478
+ [2025-07-01 08:48:06] Rank 50: Timer for terminate callback has been set.
479
+ Total limit: 240min
480
+ Pre terminate time: 10min elapsed_time: 187.53005170822144s
481
+ [2025-07-01 08:48:06] Rank 25: Timer for terminate callback has been set.
482
+ Total limit: 240min
483
+ Pre terminate time: 10min elapsed_time: 188.26658296585083s
484
+ length of dataloader: 28 14336
485
+ [GPU memory] before trainer 2.292407512664795
486
+ length of dataloader: 28 14336
487
+ [GPU memory] before trainer 2.292407512664795
488
+ [2025-07-01 08:48:06] Rank 46: Timer for terminate callback has been set.
489
+ Total limit: 240min
490
+ Pre terminate time: 10min elapsed_time: 188.28350949287415s
491
+ length of dataloader: 28 14336
492
+ [GPU memory] before trainer 2.292407512664795
493
+ [2025-07-01 08:48:06] Rank 31: Timer for terminate callback has been set.
494
+ Total limit: 240min
495
+ Pre terminate time: 10min elapsed_time: 188.32588863372803s
496
+ length of dataloader: 28 14336
497
+ [GPU memory] before trainer 2.292407512664795
498
+ length of dataloader: 28 14336
499
+ [GPU memory] before trainer 2.292407512664795
500
+ [2025-07-01 08:48:06] Rank 0: Timer for terminate callback has been set.
501
+ Total limit: 240min
502
+ Pre terminate time: 10min elapsed_time: 186.52057933807373s
503
+ [2025-07-01 08:48:06] Rank 20: Timer for terminate callback has been set.
504
+ Total limit: 240min
505
+ Pre terminate time: 10min elapsed_time: 188.36052680015564s
506
+ [2025-07-01 08:48:06] Rank 21: Timer for terminate callback has been set.
507
+ Total limit: 240min
508
+ Pre terminate time: 10min elapsed_time: 188.3777801990509s
509
+ [2025-07-01 08:48:06] Rank 14: Timer for terminate callback has been set.
510
+ Total limit: 240min
511
+ Pre terminate time: 10min elapsed_time: 186.87918162345886s
512
+ [2025-07-01 08:48:06] Rank 13: Timer for terminate callback has been set.
513
+ Total limit: 240min
514
+ Pre terminate time: 10min elapsed_time: 186.88157558441162s
515
+ [2025-07-01 08:48:06] Rank 29: Timer for terminate callback has been set.
516
+ Total limit: 240min
517
+ Pre terminate time: 10min elapsed_time: 188.40087914466858s
518
+ length of dataloader: 28 14336
519
+ [GPU memory] before trainer 2.292407512664795
520
+ [2025-07-01 08:48:06] Rank 15: Timer for terminate callback has been set.
521
+ Total limit: 240min
522
+ Pre terminate time: 10min elapsed_time: 186.94210743904114s
523
+ length of dataloader: 28 14336
524
+ [GPU memory] before trainer 2.292407512664795
525
+ length of dataloader: 28 14336
526
+ length of dataloader: 28 14336
527
+ [GPU memory] before trainer 2.292407512664795
528
+ [GPU memory] before trainer 2.292407512664795
529
+ length of dataloader: 28 14336
530
+ [GPU memory] before trainer 2.292407512664795
531
+ length of dataloader: 28 14336
532
+ length of dataloader: 28 14336
533
+ length of dataloader:[GPU memory] before trainer 2.29240751266479528
534
+ 14336
535
+ [GPU memory] before trainer 2.292407512664795
536
+ [GPU memory] before trainer 2.292407512664795
537
+ [2025-07-01 08:48:07] Rank 33: Timer for terminate callback has been set.
538
+ Total limit: 240min
539
+ Pre terminate time: 10min elapsed_time: 188.36294674873352s
540
+ [2025-07-01 08:48:07] Rank 9: Timer for terminate callback has been set.
541
+ Total limit: 240min
542
+ Pre terminate time: 10min elapsed_time: 187.00075697898865s
543
+ [2025-07-01 08:48:07] Rank 53: Timer for terminate callback has been set.
544
+ Total limit: 240min
545
+ Pre terminate time: 10min elapsed_time: 187.78530764579773s
546
+ [2025-07-01 08:48:07] Rank 2: Timer for terminate callback has been set.
547
+ Total limit: 240min
548
+ Pre terminate time: 10min elapsed_time: 186.68620371818542s
549
+ [2025-07-01 08:48:07] Rank 26: Timer for terminate callback has been set.
550
+ Total limit: 240min
551
+ Pre terminate time: 10min elapsed_time: 188.52323293685913s
552
+ [2025-07-01 08:48:07] Rank 12: Timer for terminate callback has been set.
553
+ Total limit: 240min
554
+ Pre terminate time: 10min elapsed_time: 187.02464079856873s
555
+ [2025-07-01 08:48:07] Rank 58: Timer for terminate callback has been set.
556
+ Total limit: 240min
557
+ Pre terminate time: 10min elapsed_time: 188.52194261550903s
558
+ [2025-07-01 08:48:07] Rank 28: Timer for terminate callback has been set.
559
+ Total limit: 240min
560
+ Pre terminate time: 10min elapsed_time: 188.54804372787476s
561
+ [2025-07-01 08:48:07] Rank 30: Timer for terminate callback has been set.
562
+ Total limit: 240min
563
+ Pre terminate time: 10min elapsed_time: 188.5494029521942s
564
+ [2025-07-01 08:48:07] Rank 43: Timer for terminate callback has been set.
565
+ Total limit: 240min
566
+ Pre terminate time: 10min elapsed_time: 188.55047464370728s
567
+ [2025-07-01 08:48:07] Rank 22: Timer for terminate callback has been set.
568
+ Total limit: 240min
569
+ Pre terminate time: 10min elapsed_time: 188.5522198677063s
570
+ [2025-07-01 08:48:07] Rank 11: Timer for terminate callback has been set.
571
+ Total limit: 240min
572
+ Pre terminate time: 10min elapsed_time: 187.06330227851868s
573
+ [2025-07-01 08:48:07] Rank 18: Timer for terminate callback has been set.
574
+ Total limit: 240min
575
+ Pre terminate time: 10min elapsed_time: 188.56281304359436s
576
+ [2025-07-01 08:48:07] Rank 48: Timer for terminate callback has been set.
577
+ Total limit: 240min
578
+ Pre terminate time: 10min elapsed_time: 187.8359296321869s
579
+ [2025-07-01 08:48:07] Rank 8: Timer for terminate callback has been set.
580
+ Total limit: 240min
581
+ Pre terminate time: 10min elapsed_time: 187.07292366027832s
582
+ length of dataloader: 28 14336
583
+ [GPU memory] before trainer 2.292407512664795
584
+ [2025-07-01 08:48:07] Rank 36: Timer for terminate callback has been set.
585
+ Total limit: 240min
586
+ Pre terminate time: 10min elapsed_time: 188.44479870796204s
587
+ length of dataloader: 28 14336
588
+ length of dataloader: 28 14336
589
+ [GPU memory] before trainer 2.292407512664795
590
+ [GPU memory] before trainer 2.292407512664795
591
+ length of dataloader: 28 14336
592
+ [GPU memory] before trainer 2.292407512664795
593
+ length of dataloader: 28 14336
594
+ [2025-07-01 08:48:07] Rank 44: Timer for terminate callback has been set.
595
+ Total limit: 240min
596
+ Pre terminate time: 10min elapsed_time: 188.58538126945496s
597
+ [GPU memory] before trainer 2.292407512664795
598
+ length of dataloader: 28 14336
599
+ [GPU memory] before trainer 2.292407512664795
600
+ [2025-07-01 08:48:07] Rank 38: Timer for terminate callback has been set.
601
+ Total limit: 240min
602
+ Pre terminate time: 10min elapsed_time: 188.47206234931946s
603
+ [2025-07-01 08:48:07] Rank 19: Timer for terminate callback has been set.
604
+ Total limit: 240min
605
+ Pre terminate time: 10min elapsed_time: 188.60940408706665s
606
+ length of dataloader: 28 14336
607
+ [GPU memory] before trainer 2.292407512664795
608
+ length of dataloader: 28 14336
609
+ [GPU memory] before trainer 2.292407512664795
610
+ length of dataloader: 28 14336
611
+ [2025-07-01 08:48:07] Rank 45: Timer for terminate callback has been set.
612
+ Total limit: 240min
613
+ Pre terminate time: 10min elapsed_time: 188.61079692840576s
614
+ [GPU memory] before trainer 2.292407512664795
615
+ [2025-07-01 08:48:07] Rank 17: Timer for terminate callback has been set.
616
+ Total limit: 240min
617
+ Pre terminate time: 10min elapsed_time: 188.61975002288818s
618
+ length of dataloader: 28 14336
619
+ [GPU memory] before trainer 2.292407512664795
620
+ [2025-07-01 08:48:07] Rank 6: Timer for terminate callback has been set.
621
+ Total limit: 240min
622
+ Pre terminate time: 10min elapsed_time: 186.79642844200134s
623
+ length of dataloader: 28 14336
624
+ length of dataloader: 28 14336
625
+ [GPU memory] before trainer 2.292407512664795
626
+ [GPU memory] before trainer 2.292407512664795
627
+ length of dataloader: 28 14336
628
+ [GPU memory] before trainer 2.292407512664795
629
+ length of dataloader: 28 14336
630
+ [GPU memory] before trainer 2.292407512664795
631
+ length of dataloader: 28 14336
632
+ [GPU memory] before trainer 2.292407512664795
633
+ [2025-07-01 08:48:07] Rank 35: Timer for terminate callback has been set.
634
+ Total limit: 240min
635
+ Pre terminate time: 10min elapsed_time: 188.512140750885s
636
+ length of dataloader: 28 14336
637
+ [GPU memory] before trainer 2.292407512664795
638
+ [2025-07-01 08:48:07] Rank 27: Timer for terminate callback has been set.
639
+ Total limit: 240min
640
+ Pre terminate time: 10min elapsed_time: 188.64775276184082s
641
+ [2025-07-01 08:48:07] Rank 24: Timer for terminate callback has been set.
642
+ Total limit: 240min
643
+ Pre terminate time: 10min elapsed_time: 188.64974856376648s
644
+ [2025-07-01 08:48:07] Rank 1: Timer for terminate callback has been set.
645
+ Total limit: 240min
646
+ length of dataloader: 28 14336
647
+ Pre terminate time: 10min elapsed_time: 186.81638717651367s
648
+ [GPU memory] before trainer 2.292407512664795
649
+ length of dataloader: 28 14336
650
+ [GPU memory] before trainer 2.292407512664795
651
+ [2025-07-01 08:48:07] Rank 51: Timer for terminate callback has been set.
652
+ Total limit: 240min
653
+ Pre terminate time: 10min elapsed_time: 187.9340763092041s
654
+ length of dataloader: 28 14336
655
+ [GPU memory] before trainer 2.292407512664795
656
+ [2025-07-01 08:48:07] Rank 7: Timer for terminate callback has been set.
657
+ Total limit: 240min
658
+ Pre terminate time: 10min elapsed_time: 186.82988810539246s
659
+ [2025-07-01 08:48:07] Rank 55: Timer for terminate callback has been set.
660
+ Total limit: 240min
661
+ Pre terminate time: 10min elapsed_time: 187.9399230480194s
662
+ length of dataloader: 28 14336
663
+ [GPU memory] before trainer 2.292407512664795
664
+ [2025-07-01 08:48:07] Rank 54: Timer for terminate callback has been set.
665
+ Total limit: 240min
666
+ Pre terminate time: 10min elapsed_time: 187.96026849746704s
667
+ [2025-07-01 08:48:07] Rank 23: Timer for terminate callback has been set.
668
+ Total limit: 240min
669
+ Pre terminate time: 10min elapsed_time: 188.69248342514038s
670
+ length of dataloader: 28 14336
671
+ [GPU memory] before trainer 2.292407512664795
672
+ length of dataloader: 28 14336
673
+ [GPU memory] before trainer 2.292407512664795
674
+ length of dataloader: 28 14336
675
+ [GPU memory] before trainer 2.292407512664795
676
+ [2025-07-01 08:48:07] Rank 57: Timer for terminate callback has been set.
677
+ Total limit: 240min
678
+ Pre terminate time: 10min elapsed_time: 188.67919492721558s
679
+ [2025-07-01 08:48:07] Rank 52: Timer for terminate callback has been set.
680
+ Total limit: 240min
681
+ Pre terminate time: 10min elapsed_time: 187.97831630706787s
682
+ length of dataloader: 28 14336
683
+ [GPU memory] before trainer 2.292407512664795
684
+ [2025-07-01 08:48:07] Rank 61: Timer for terminate callback has been set.
685
+ Total limit: 240min
686
+ Pre terminate time: 10min elapsed_time: 188.687602519989s
687
+ [2025-07-01 08:48:07] Rank 4: Timer for terminate callback has been set.
688
+ Total limit: 240min
689
+ Pre terminate time: 10min elapsed_time: 186.87889289855957s
690
+ [2025-07-01 08:48:07] Rank 42: Timer for terminate callback has been set.
691
+ Total limit: 240min
692
+ Pre terminate time: 10min elapsed_time: 188.7198281288147s
693
+ [2025-07-01 08:48:07] Rank 5: Timer for terminate callback has been set.
694
+ Total limit: 240min
695
+ Pre terminate time: 10min elapsed_time: 186.88499283790588s
696
+ length of dataloader: 28 14336
697
+ [GPU memory] before trainer 2.292407512664795
698
+ [2025-07-01 08:48:07] Rank 10: Timer for terminate callback has been set.
699
+ Total limit: 240min
700
+ length of dataloader: 28 14336
701
+ Pre terminate time: 10min elapsed_time: 187.23191928863525s
702
+ [GPU memory] before trainer 2.292407512664795
703
+ length of dataloader: 28 14336
704
+ [GPU memory] before trainer 2.292407512664795
705
+ length of dataloader: 28 14336
706
+ [GPU memory] before trainer 2.292407512664795
707
+ [2025-07-01 08:48:07] Rank 47: Timer for terminate callback has been set.
708
+ Total limit: 240min
709
+ Pre terminate time: 10min elapsed_time: 188.74053859710693s
710
+ length of dataloader: 28 14336
711
+ [GPU memory] before trainer 2.292407512664795
712
+ [2025-07-01 08:48:07] Rank 63: Timer for terminate callback has been set.
713
+ Total limit: 240min
714
+ Pre terminate time: 10min elapsed_time: 188.7193853855133s
715
+ [2025-07-01 08:48:07] Rank 34: Timer for terminate callback has been set.
716
+ Total limit: 240min
717
+ Pre terminate time: 10min elapsed_time: 188.61402535438538s
718
+ length of dataloader: 28 14336
719
+ [2025-07-01 08:48:07] Rank 59: Timer for terminate callback has been set.
720
+ Total limit: 240min
721
+ Pre terminate time: 10min elapsed_time: 188.72772979736328s
722
+ [GPU memory] before trainer 2.292407512664795
723
+ length of dataloader: 28 14336
724
+ [GPU memory] before trainer 2.292407512664795
725
+ length of dataloader: 28 14336
726
+ [GPU memory] before trainer 2.292407512664795
727
+ [2025-07-01 08:48:07] Rank 41: Timer for terminate callback has been set.
728
+ Total limit: 240min
729
+ Pre terminate time: 10min elapsed_time: 188.77644872665405s
730
+ length of dataloader: 28 14336
731
+ [GPU memory] before trainer 2.292407512664795
732
+ length of dataloader: 28 14336
733
+ [GPU memory] before trainer 2.292407512664795
734
+ length of dataloader: 28 14336
735
+ [GPU memory] before trainer 2.292407512664795
736
+ length of dataloader: 28 14336
737
+ [GPU memory] before trainer 2.292407512664795
738
+ length of dataloader: 28 14336
739
+ [GPU memory] before trainer 2.292407512664795
740
+ length of dataloader: 28 14336
741
+ [GPU memory] before trainer 2.292407512664795
742
+ length of dataloader: 28 14336
743
+ [GPU memory] before trainer 2.292407512664795
744
+ length of dataloader: 28 14336
745
+ [GPU memory] before trainer 2.292407512664795
746
+ length of dataloader: 28 14336
747
+ [GPU memory] before trainer 2.292407512664795
748
+ [2025-07-01 08:48:07] Rank 56: Timer for terminate callback has been set.
749
+ Total limit: 240min
750
+ Pre terminate time: 10min elapsed_time: 188.7942099571228s
751
+ length of dataloader: 28 14336
752
+ [GPU memory] before trainer 2.292407512664795
753
+ length of dataloader: 28 14336
754
+ [GPU memory] before trainer 2.292407512664795
755
+ length of dataloader: 28 14336
756
+ [GPU memory] before trainer 2.292407512664795
757
+ [2025-07-01 08:48:07] Rank 39: Timer for terminate callback has been set.
758
+ Total limit: 240min
759
+ Pre terminate time: 10min elapsed_time: 188.70067310333252s
760
+ length of dataloader: 28 14336
761
+ [GPU memory] before trainer 2.292407512664795
762
+ length of dataloader: 28 14336
763
+ [GPU memory] before trainer 2.292407512664795
764
+ length of dataloader: 28 14336
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038247.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038247.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038247
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
7
+ NODE_RANK = 3
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01504
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038247
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
21
+ NODE_RANK = 6
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-01504
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038247
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
35
+ NODE_RANK = 5
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-01504
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038247
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
49
+ NODE_RANK = 4
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-01504
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038247
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
63
+ NODE_RANK = 2
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-01504
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038247
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
77
+ NODE_RANK = 7
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01504
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038247
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
91
+ NODE_RANK = 0
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01504
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038247
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
105
+ NODE_RANK = 1
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01504
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 08:49:27,028] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 08:49:27,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 08:49:27,386] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 08:49:27,390] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 08:49:27,399] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 08:49:27,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 08:49:27,449] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 08:49:27,451] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 08:49:27,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 08:49:27,527] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 08:49:27,584] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 08:49:27,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 08:49:27,643] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 08:49:27,658] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 08:49:27,659] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 08:49:27,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
130
+ [2025-07-01 08:49:27,694] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
131
+ [2025-07-01 08:49:27,696] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
132
+ [2025-07-01 08:49:27,697] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
133
+ [2025-07-01 08:49:27,698] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
134
+ [2025-07-01 08:49:27,736] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
135
+ [2025-07-01 08:49:27,738] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
136
+ [2025-07-01 08:49:27,739] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
137
+ [2025-07-01 08:49:27,745] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
138
+ [2025-07-01 08:49:28,093] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 08:49:28,213] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 08:49:28,214] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 08:49:28,215] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 08:49:28,219] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 08:49:28,222] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 08:49:28,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 08:49:28,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 08:49:28,314] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 08:49:28,326] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 08:49:28,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 08:49:28,385] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 08:49:28,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 08:49:28,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 08:49:28,419] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 08:49:28,464] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 08:49:28,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 08:49:28,602] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 08:49:28,603] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 08:49:28,604] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 08:49:28,608] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 08:49:29,434] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 08:49:29,469] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 08:49:29,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 08:49:29,481] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 08:49:29,510] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 08:49:29,523] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 08:49:29,599] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 08:49:29,600] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 08:49:30,356] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 08:49:30,357] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 08:49:30,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
173
+ [2025-07-01 08:49:30,393] [INFO] [comm.py:594:init_distributed] cdb=None
174
+ [2025-07-01 08:49:30,462] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 08:49:30,468] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 08:49:30,470] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 08:49:30,472] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
178
+ [2025-07-01 08:49:30,479] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
179
+ [2025-07-01 08:49:30,627] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
180
+ [2025-07-01 08:49:30,627] [INFO] [comm.py:594:init_distributed] cdb=None
181
+ [2025-07-01 08:49:30,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
182
+ [2025-07-01 08:49:30,702] [INFO] [comm.py:594:init_distributed] cdb=None
183
+ [2025-07-01 08:49:30,728] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
184
+ [2025-07-01 08:49:30,728] [INFO] [comm.py:594:init_distributed] cdb=None
185
+ [2025-07-01 08:49:30,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
186
+ [2025-07-01 08:49:30,741] [INFO] [comm.py:594:init_distributed] cdb=None
187
+ [2025-07-01 08:49:30,803] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
188
+ [2025-07-01 08:49:30,804] [INFO] [comm.py:594:init_distributed] cdb=None
189
+ [2025-07-01 08:49:30,820] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
190
+ [2025-07-01 08:49:30,820] [INFO] [comm.py:594:init_distributed] cdb=None
191
+ [2025-07-01 08:49:30,824] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
192
+ [2025-07-01 08:49:30,824] [INFO] [comm.py:594:init_distributed] cdb=None
193
+ [2025-07-01 08:49:30,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
194
+ [2025-07-01 08:49:30,828] [INFO] [comm.py:594:init_distributed] cdb=None
195
+ [2025-07-01 08:49:30,840] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
196
+ [2025-07-01 08:49:30,840] [INFO] [comm.py:594:init_distributed] cdb=None
197
+ [2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
198
+ [2025-07-01 08:49:30,857] [INFO] [comm.py:594:init_distributed] cdb=None
199
+ [2025-07-01 08:49:30,857] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
200
+ [2025-07-01 08:49:30,858] [INFO] [comm.py:594:init_distributed] cdb=None
201
+ [2025-07-01 08:49:30,985] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
202
+ [2025-07-01 08:49:30,985] [INFO] [comm.py:594:init_distributed] cdb=None
203
+ [2025-07-01 08:49:30,991] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
204
+ [2025-07-01 08:49:30,991] [INFO] [comm.py:594:init_distributed] cdb=None
205
+ [2025-07-01 08:49:30,993] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
206
+ [2025-07-01 08:49:30,993] [INFO] [comm.py:594:init_distributed] cdb=None
207
+ [2025-07-01 08:49:30,996] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
208
+ [2025-07-01 08:49:30,996] [INFO] [comm.py:594:init_distributed] cdb=None
209
+ [2025-07-01 08:49:31,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
210
+ [2025-07-01 08:49:31,151] [INFO] [comm.py:594:init_distributed] cdb=None
211
+ [2025-07-01 08:49:31,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
212
+ [2025-07-01 08:49:31,173] [INFO] [comm.py:594:init_distributed] cdb=None
213
+ [2025-07-01 08:49:31,178] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
214
+ [2025-07-01 08:49:31,178] [INFO] [comm.py:594:init_distributed] cdb=None
215
+ [2025-07-01 08:49:31,179] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
216
+ [2025-07-01 08:49:31,179] [INFO] [comm.py:594:init_distributed] cdb=None
217
+ [2025-07-01 08:49:31,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
218
+ [2025-07-01 08:49:31,300] [INFO] [comm.py:594:init_distributed] cdb=None
219
+ [2025-07-01 08:49:31,300] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
220
+ [2025-07-01 08:49:31,328] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
221
+ [2025-07-01 08:49:31,328] [INFO] [comm.py:594:init_distributed] cdb=None
222
+ [2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
223
+ [2025-07-01 08:49:31,375] [INFO] [comm.py:594:init_distributed] cdb=None
224
+ [2025-07-01 08:49:31,375] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
225
+ [2025-07-01 08:49:31,376] [INFO] [comm.py:594:init_distributed] cdb=None
226
+ [2025-07-01 08:49:31,389] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
227
+ [2025-07-01 08:49:31,390] [INFO] [comm.py:594:init_distributed] cdb=None
228
+ [2025-07-01 08:49:31,458] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
229
+ [2025-07-01 08:49:31,458] [INFO] [comm.py:594:init_distributed] cdb=None
230
+ [2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
231
+ [2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
232
+ [2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
233
+ [2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
234
+ [2025-07-01 08:49:31,626] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
235
+ [2025-07-01 08:49:31,626] [INFO] [comm.py:594:init_distributed] cdb=None
236
+ [2025-07-01 08:49:31,632] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
237
+ [2025-07-01 08:49:31,632] [INFO] [comm.py:594:init_distributed] cdb=None
238
+ [2025-07-01 08:49:31,639] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
239
+ [2025-07-01 08:49:31,639] [INFO] [comm.py:594:init_distributed] cdb=None
240
+ [2025-07-01 08:49:31,655] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
241
+ [2025-07-01 08:49:31,655] [INFO] [comm.py:594:init_distributed] cdb=None
242
+ [2025-07-01 08:49:31,662] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
243
+ [2025-07-01 08:49:31,662] [INFO] [comm.py:594:init_distributed] cdb=None
244
+ [2025-07-01 08:49:31,665] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
245
+ [2025-07-01 08:49:31,665] [INFO] [comm.py:594:init_distributed] cdb=None
246
+ [2025-07-01 08:49:31,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
247
+ [2025-07-01 08:49:31,702] [INFO] [comm.py:594:init_distributed] cdb=None
248
+ [2025-07-01 08:49:31,720] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
249
+ [2025-07-01 08:49:31,720] [INFO] [comm.py:594:init_distributed] cdb=None
250
+ [2025-07-01 08:49:31,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
251
+ [2025-07-01 08:49:31,727] [INFO] [comm.py:594:init_distributed] cdb=None
252
+ [2025-07-01 08:49:31,756] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
253
+ [2025-07-01 08:49:31,756] [INFO] [comm.py:594:init_distributed] cdb=None
254
+ [2025-07-01 08:49:31,807] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
255
+ [2025-07-01 08:49:31,807] [INFO] [comm.py:594:init_distributed] cdb=None
256
+ [2025-07-01 08:49:31,853] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
257
+ [2025-07-01 08:49:31,853] [INFO] [comm.py:594:init_distributed] cdb=None
258
+ [2025-07-01 08:49:31,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
259
+ [2025-07-01 08:49:31,864] [INFO] [comm.py:594:init_distributed] cdb=None
260
+ [2025-07-01 08:49:31,887] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
261
+ [2025-07-01 08:49:31,887] [INFO] [comm.py:594:init_distributed] cdb=None
262
+ [2025-07-01 08:49:32,074] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
263
+ [2025-07-01 08:49:32,074] [INFO] [comm.py:594:init_distributed] cdb=None
264
+ [2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
265
+ [2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None
266
+ [2025-07-01 08:49:32,091] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
267
+ [2025-07-01 08:49:32,091] [INFO] [comm.py:594:init_distributed] cdb=None
268
+ [2025-07-01 08:49:32,105] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
269
+ [2025-07-01 08:49:32,105] [INFO] [comm.py:594:init_distributed] cdb=None
270
+ [2025-07-01 08:49:32,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
271
+ [2025-07-01 08:49:32,108] [INFO] [comm.py:594:init_distributed] cdb=None
272
+ [2025-07-01 08:49:32,109] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
273
+ [2025-07-01 08:49:32,109] [INFO] [comm.py:594:init_distributed] cdb=None
274
+ [2025-07-01 08:49:32,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
275
+ [2025-07-01 08:49:32,831] [INFO] [comm.py:594:init_distributed] cdb=None
276
+ [2025-07-01 08:49:32,871] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
277
+ [2025-07-01 08:49:32,871] [INFO] [comm.py:594:init_distributed] cdb=None
278
+ [2025-07-01 08:49:32,881] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
279
+ [2025-07-01 08:49:32,881] [INFO] [comm.py:594:init_distributed] cdb=None
280
+ [2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
281
+ [2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None
282
+ [2025-07-01 08:49:32,882] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
283
+ [2025-07-01 08:49:32,882] [INFO] [comm.py:594:init_distributed] cdb=None
284
+ [2025-07-01 08:49:32,886] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
285
+ [2025-07-01 08:49:32,886] [INFO] [comm.py:594:init_distributed] cdb=None
286
+ [2025-07-01 08:49:32,889] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
287
+ [2025-07-01 08:49:32,890] [INFO] [comm.py:594:init_distributed] cdb=None
288
+ [2025-07-01 08:49:32,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
289
+ [2025-07-01 08:49:32,891] [INFO] [comm.py:594:init_distributed] cdb=None
290
+ [2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
291
+ [2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
292
+ [2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
293
+ [2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
294
+ [2025-07-01 08:49:33,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
295
+ [2025-07-01 08:49:33,702] [INFO] [comm.py:594:init_distributed] cdb=None
296
+ [2025-07-01 08:49:33,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 08:49:33,791] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 08:49:33,924] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 08:49:33,924] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 08:49:34,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 08:49:34,044] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 08:49:34,108] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 08:49:34,108] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 08:49:34,138] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 08:49:34,138] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 08:49:48,774] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 08:49:57,388] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 08:49:58,024] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
326
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
327
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
331
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
334
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
335
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
347
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
348
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
355
+
356
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
357
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
358
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
359
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
360
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
361
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
362
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
363
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
364
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
365
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
366
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
367
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
368
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
369
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
370
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
371
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
372
+ [dist-0-of-64] LlavaLlamaModel(
373
+ (llm): Qwen2ForCausalLM(
374
+ (model): Qwen2Model(
375
+ (embed_tokens): Embedding(151648, 3584)
376
+ (layers): ModuleList(
377
+ (0-27): 28 x Qwen2DecoderLayer(
378
+ (self_attn): Qwen2FlashAttention2(
379
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
380
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
381
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
382
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
383
+ (rotary_emb): Qwen2RotaryEmbedding()
384
+ )
385
+ (mlp): Qwen2MLP(
386
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
387
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
388
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
389
+ (act_fn): SiLU()
390
+ )
391
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
392
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
393
+ )
394
+ )
395
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
396
+ (rotary_emb): Qwen2RotaryEmbedding()
397
+ )
398
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
399
+ )
400
+ (vision_tower): SiglipVisionTower(
401
+ (vision_tower): SiglipVisionModel(
402
+ (vision_model): SiglipVisionTransformer(
403
+ (embeddings): SiglipVisionEmbeddings(
404
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
405
+ (position_embedding): Embedding(1024, 1152)
406
+ )
407
+ (encoder): SiglipEncoder(
408
+ (layers): ModuleList(
409
+ (0-26): 27 x SiglipEncoderLayer(
410
+ (self_attn): SiglipFlashAttention2(
411
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
412
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
413
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
414
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
415
+ )
416
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
417
+ (mlp): SiglipMLP(
418
+ (activation_fn): PytorchGELUTanh()
419
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
420
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
421
+ )
422
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
423
+ )
424
+ )
425
+ )
426
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
427
+ )
428
+ )
429
+ )
430
+ (mm_projector): MultimodalProjector(
431
+ (layers): Sequential(
432
+ (0): DownSample3x3BlockFix()
433
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
434
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
435
+ (3): GELU(approximate='none')
436
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
437
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
438
+ (6): GELU(approximate='none')
439
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
440
+ )
441
+ )
442
+ )
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 08:52:31] Rank 41: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 179.37434482574463s
451
+ [2025-07-01 08:52:31] Rank 6: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 180.40121841430664s
454
+ [2025-07-01 08:52:31] Rank 21: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 180.3818507194519s
457
+ length of dataloader: 28 14336
458
+ [GPU memory] before trainer 2.292407512664795
459
+ [2025-07-01 08:52:31] Rank 25: Timer for terminate callback has been set.
460
+ Total limit: 240min
461
+ Pre terminate time: 10min elapsed_time: 179.95334601402283s
462
+ length of dataloader: 28 14336
463
+ [GPU memory] before trainer 2.292407512664795
464
+ length of dataloader: 28 14336
465
+ [GPU memory] before trainer 2.292407512664795
466
+ length of dataloader: 28 14336
467
+ [GPU memory] before trainer 2.292407512664795
468
+ [2025-07-01 08:52:31] Rank 58: Timer for terminate callback has been set.
469
+ Total limit: 240min
470
+ Pre terminate time: 10min elapsed_time: 177.84617686271667s
471
+ [2025-07-01 08:52:31] Rank 36: Timer for terminate callback has been set.
472
+ Total limit: 240min
473
+ Pre terminate time: 10min elapsed_time: 179.8410665988922s
474
+ [2025-07-01 08:52:31] Rank 38: Timer for terminate callback has been set.
475
+ Total limit: 240min
476
+ Pre terminate time: 10min elapsed_time: 179.93657898902893s
477
+ [2025-07-01 08:52:31] Rank 45: Timer for terminate callback has been set.
478
+ Total limit: 240min
479
+ Pre terminate time: 10min elapsed_time: 179.68102931976318s
480
+ [2025-07-01 08:52:31] Rank 55: Timer for terminate callback has been set.
481
+ Total limit: 240min
482
+ Pre terminate time: 10min elapsed_time: 180.8664586544037s
483
+ [2025-07-01 08:52:31] Rank 14: Timer for terminate callback has been set.
484
+ Total limit: 240min
485
+ Pre terminate time: 10min elapsed_time: 178.74036169052124s
486
+ length of dataloader: 28 14336
487
+ [GPU memory] before trainer 2.292407512664795
488
+ [2025-07-01 08:52:31] Rank 56: Timer for terminate callback has been set.
489
+ Total limit: 240min
490
+ Pre terminate time: 10min elapsed_time: 177.8835060596466s
491
+ length of dataloader: 28 14336
492
+ [GPU memory] before trainer 2.292407512664795
493
+ length of dataloader: 28 14336
494
+ [GPU memory] before trainer 2.292407512664795
495
+ length of dataloader: 28 14336
496
+ [GPU memory] before trainer 2.292407512664795
497
+ length of dataloader: 28 14336
498
+ [GPU memory] before trainer 2.292407512664795
499
+ length of dataloader: 28 14336
500
+ [GPU memory] before trainer 2.292407512664795
501
+ [2025-07-01 08:52:31] Rank 16: Timer for terminate callback has been set.
502
+ Total limit: 240min
503
+ Pre terminate time: 10min elapsed_time: 180.6106789112091s
504
+ [2025-07-01 08:52:31] Rank 28: Timer for terminate callback has been set.
505
+ Total limit: 240min
506
+ Pre terminate time: 10min elapsed_time: 180.10080122947693s
507
+ length of dataloader: 28 14336
508
+ [GPU memory] before trainer 2.292407512664795
509
+ [2025-07-01 08:52:31] Rank 51: Timer for terminate callback has been set.
510
+ Total limit: 240min
511
+ Pre terminate time: 10min elapsed_time: 181.0412666797638s
512
+ [2025-07-01 08:52:31] Rank 5: Timer for terminate callback has been set.
513
+ Total limit: 240min
514
+ Pre terminate time: 10min elapsed_time: 180.63762664794922s
515
+ [2025-07-01 08:52:31] Rank 11: Timer for terminate callback has been set.
516
+ Total limit: 240min
517
+ Pre terminate time: 10min elapsed_time: 178.8860149383545s
518
+ length of dataloader: 28 14336
519
+ [GPU memory] before trainer 2.292407512664795
520
+ [2025-07-01 08:52:31] Rank 12: Timer for terminate callback has been set.
521
+ Total limit: 240min
522
+ Pre terminate time: 10min elapsed_time: 178.91869187355042s
523
+ length of dataloader: 28 14336
524
+ [GPU memory] before trainer 2.292407512664795
525
+ [2025-07-01 08:52:31] Rank 49: Timer for terminate callback has been set.
526
+ Total limit: 240min
527
+ Pre terminate time: 10min elapsed_time: 181.07620072364807s
528
+ [2025-07-01 08:52:31] Rank 54: Timer for terminate callback has been set.
529
+ Total limit: 240min
530
+ Pre terminate time: 10min elapsed_time: 181.11947393417358s
531
+ length of dataloader: 28 14336
532
+ [GPU memory] before trainer 2.292407512664795
533
+ length of dataloader: 28 14336
534
+ [GPU memory] before trainer 2.292407512664795
535
+ length of dataloader: 28 14336
536
+ [GPU memory] before trainer 2.292407512664795
537
+ [2025-07-01 08:52:31] Rank 15: Timer for terminate callback has been set.
538
+ Total limit: 240min
539
+ Pre terminate time: 10min elapsed_time: 178.96962904930115s
540
+ length of dataloader: 28 14336
541
+ [GPU memory] before trainer 2.292407512664795
542
+ [2025-07-01 08:52:31] Rank 53: Timer for terminate callback has been set.
543
+ Total limit: 240min
544
+ Pre terminate time: 10min elapsed_time: 181.16217613220215s
545
+ length of dataloader: 28 14336
546
+ [GPU memory] before trainer 2.292407512664795
547
+ length of dataloader: 28 14336
548
+ [GPU memory] before trainer 2.292407512664795
549
+ [2025-07-01 08:52:31] Rank 48: Timer for terminate callback has been set.
550
+ Total limit: 240min
551
+ Pre terminate time: 10min elapsed_time: 181.2186541557312s
552
+ length of dataloader: 28 14336
553
+ [GPU memory] before trainer 2.292407512664795
554
+ [2025-07-01 08:52:31] Rank 52: Timer for terminate callback has been set.
555
+ Total limit: 240min
556
+ Pre terminate time: 10min elapsed_time: 181.21066880226135s
557
+ [2025-07-01 08:52:31] Rank 31: Timer for terminate callback has been set.
558
+ Total limit: 240min
559
+ Pre terminate time: 10min elapsed_time: 180.3190746307373s
560
+ [2025-07-01 08:52:31] Rank 8: Timer for terminate callback has been set.
561
+ Total limit: 240min
562
+ Pre terminate time: 10min elapsed_time: 179.07767581939697s
563
+ [2025-07-01 08:52:31] Rank 24: Timer for terminate callback has been set.
564
+ Total limit: 240min
565
+ Pre terminate time: 10min elapsed_time: 180.3272933959961s
566
+ [2025-07-01 08:52:31] Rank 1: Timer for terminate callback has been set.
567
+ Total limit: 240min
568
+ Pre terminate time: 10min elapsed_time: 180.84223079681396s
569
+ [2025-07-01 08:52:31] Rank 3: Timer for terminate callback has been set.
570
+ Total limit: 240min
571
+ Pre terminate time: 10min elapsed_time: 181.00526809692383s
572
+ [2025-07-01 08:52:31] Rank 9: Timer for terminate callback has been set.
573
+ Total limit: 240min
574
+ Pre terminate time: 10min elapsed_time: 179.134206533432s
575
+ [2025-07-01 08:52:31] Rank 33: Timer for terminate callback has been set.
576
+ Total limit: 240min
577
+ Pre terminate time: 10min elapsed_time: 180.2782702445984s
578
+ [2025-07-01 08:52:31] Rank 32: Timer for terminate callback has been set.
579
+ Total limit: 240min
580
+ Pre terminate time: 10min elapsed_time: 180.3015902042389s
581
+ length of dataloader: 28 14336
582
+ [GPU memory] before trainer 2.292407512664795
583
+ [2025-07-01 08:52:31] Rank 50: Timer for terminate callback has been set.
584
+ Total limit: 240min
585
+ Pre terminate time: 10min elapsed_time: 181.3520963191986s
586
+ [2025-07-01 08:52:31] Rank 39: Timer for terminate callback has been set.
587
+ Total limit: 240min
588
+ Pre terminate time: 10min elapsed_time: 180.27661395072937s
589
+ [2025-07-01 08:52:31] Rank 40: Timer for terminate callback has been set.
590
+ Total limit: 240min
591
+ Pre terminate time: 10min elapsed_time: 180.0057246685028s
592
+ length of dataloader: 28 14336
593
+ [GPU memory] before trainer 2.292407512664795
594
+ [2025-07-01 08:52:31] Rank 20: Timer for terminate callback has been set.
595
+ Total limit: 240min
596
+ Pre terminate time: 10min elapsed_time: 180.9391541481018s
597
+ [2025-07-01 08:52:31] Rank 37: Timer for terminate callback has been set.
598
+ Total limit: 240min
599
+ Pre terminate time: 10min elapsed_time: 180.3413987159729s
600
+ [2025-07-01 08:52:31] Rank 34: Timer for terminate callback has been set.
601
+ Total limit: 240min
602
+ Pre terminate time: 10min elapsed_time: 180.33079957962036s
603
+ length of dataloader: 28 14336
604
+ [GPU memory] before trainer 2.292407512664795
605
+ [2025-07-01 08:52:31] Rank 22: Timer for terminate callback has been set.
606
+ Total limit: 240min
607
+ Pre terminate time: 10min elapsed_time: 181.61974906921387s
608
+ [2025-07-01 08:52:31] Rank 13: Timer for terminate callback has been set.
609
+ Total limit: 240min
610
+ Pre terminate time: 10min elapsed_time: 179.15128827095032s
611
+ length of dataloader: 28 14336
612
+ [GPU memory] before trainer 2.292407512664795
613
+ length of dataloader: 28 14336
614
+ [GPU memory] before trainer 2.292407512664795
615
+ [2025-07-01 08:52:31] Rank 43: Timer for terminate callback has been set.
616
+ Total limit: 240min
617
+ Pre terminate time: 10min elapsed_time: 180.01913499832153s
618
+ [2025-07-01 08:52:31] Rank 10: Timer for terminate callback has been set.
619
+ Total limit: 240min
620
+ Pre terminate time: 10min elapsed_time: 179.16416096687317s
621
+ length of dataloader: 28 14336
622
+ [GPU memory] before trainer 2.292407512664795
623
+ length of dataloader: 28 14336
624
+ [GPU memory] before trainer 2.292407512664795
625
+ [2025-07-01 08:52:31] Rank 23: Timer for terminate callback has been set.
626
+ Total limit: 240min
627
+ Pre terminate time: 10min elapsed_time: 181.1800184249878s
628
+ length of dataloader: 28 14336
629
+ [GPU memory] before trainer 2.292407512664795
630
+ [2025-07-01 08:52:31] Rank 46: Timer for terminate callback has been set.
631
+ Total limit: 240min
632
+ Pre terminate time: 10min elapsed_time: 181.30903506278992s
633
+ length of dataloader: 28 14336
634
+ [GPU memory] before trainer 2.292407512664795
635
+ length of dataloader: 28 14336
636
+ [GPU memory] before trainer 2.292407512664795
637
+ [2025-07-01 08:52:32] Rank 29: Timer for terminate callback has been set.
638
+ Total limit: 240min
639
+ Pre terminate time: 10min elapsed_time: 180.40558052062988s
640
+ [2025-07-01 08:52:32] Rank 47: Timer for terminate callback has been set.
641
+ Total limit: 240min
642
+ Pre terminate time: 10min elapsed_time: 180.4238715171814s
643
+ length of dataloader: 28 14336
644
+ [GPU memory] before trainer 2.292407512664795
645
+ length of dataloader: 28 14336
646
+ [GPU memory] before trainer 2.292407512664795
647
+ [2025-07-01 08:52:32] Rank 27: Timer for terminate callback has been set.
648
+ Total limit: 240min
649
+ Pre terminate time: 10min elapsed_time: 180.43949127197266s
650
+ length of dataloader: 28 14336
651
+ [GPU memory] before trainer 2.292407512664795
652
+ [2025-07-01 08:52:32] Rank 17: Timer for terminate callback has been set.
653
+ Total limit: 240min
654
+ Pre terminate time: 10min elapsed_time: 180.98406744003296s
655
+ [2025-07-01 08:52:32] Rank 61: Timer for terminate callback has been set.
656
+ Total limit: 240min
657
+ Pre terminate time: 10min elapsed_time: 178.336608171463s
658
+ [2025-07-01 08:52:32] Rank 57: Timer for terminate callback has been set.
659
+ Total limit: 240min
660
+ Pre terminate time: 10min elapsed_time: 178.334801197052s
661
+ length of dataloader: 28 14336
662
+ [GPU memory] before trainer 2.292407512664795
663
+ [2025-07-01 08:52:32] Rank 35: Timer for terminate callback has been set.
664
+ Total limit: 240min
665
+ Pre terminate time: 10min elapsed_time: 180.47004318237305s
666
+ [2025-07-01 08:52:32] Rank 42: Timer for terminate callback has been set.
667
+ Total limit: 240min
668
+ Pre terminate time: 10min elapsed_time: 180.17485332489014s
669
+ length of dataloader: 28 14336
670
+ [GPU memory] before trainer 2.292407512664795
671
+ [2025-07-01 08:52:32] Rank 63: Timer for terminate callback has been set.
672
+ Total limit: 240min
673
+ Pre terminate time: 10min elapsed_time: 178.42643857002258s
674
+ length of dataloader: 28 14336
675
+ [GPU memory] before trainer 2.292407512664795
676
+ length of dataloader: 28 14336
677
+ [GPU memory] before trainer 2.292407512664795
678
+ length of dataloader: 28 14336
679
+ [GPU memory] before trainer 2.292407512664795
680
+ [2025-07-01 08:52:32] Rank 60: Timer for terminate callback has been set.
681
+ Total limit: 240min
682
+ Pre terminate time: 10min elapsed_time: 178.29783725738525s
683
+ length of dataloader: 28 14336
684
+ [GPU memory] before trainer 2.292407512664795
685
+ length of dataloader: 28 14336
686
+ [GPU memory] before trainer 2.292407512664795
687
+ length of dataloader: 28 14336
688
+ [GPU memory] before trainer 2.292407512664795
689
+ [2025-07-01 08:52:32] Rank 19: Timer for terminate callback has been set.
690
+ Total limit: 240min
691
+ Pre terminate time: 10min elapsed_time: 181.14956998825073s
692
+ length of dataloader: 28 14336
693
+ [GPU memory] before trainer 2.292407512664795
694
+ [2025-07-01 08:52:32] Rank 59: Timer for terminate callback has been set.
695
+ Total limit: 240min
696
+ Pre terminate time: 10min elapsed_time: 178.32719588279724s
697
+ [2025-07-01 08:52:32] Rank 18: Timer for terminate callback has been set.
698
+ Total limit: 240min
699
+ Pre terminate time: 10min elapsed_time: 181.05258059501648s
700
+ length of dataloader: 28 14336
701
+ [GPU memory] before trainer 2.292407512664795
702
+ [2025-07-01 08:52:32] Rank 26: Timer for terminate callback has been set.
703
+ Total limit: 240min
704
+ Pre terminate time: 10min elapsed_time: 180.52194571495056s
705
+ length of dataloader: 28 14336
706
+ [GPU memory] before trainer 2.292407512664795
707
+ length of dataloader: 28 14336
708
+ [GPU memory] before trainer 2.292407512664795
709
+ length of dataloader: 28 14336
710
+ [GPU memory] before trainer 2.292407512664795
711
+ [2025-07-01 08:52:32] Rank 30: Timer for terminate callback has been set.
712
+ Total limit: 240min
713
+ Pre terminate time: 10min elapsed_time: 180.52348446846008s
714
+ length of dataloader: 28 14336
715
+ [GPU memory] before trainer 2.292407512664795
716
+ [2025-07-01 08:52:32] Rank 62: Timer for terminate callback has been set.
717
+ Total limit: 240min
718
+ Pre terminate time: 10min elapsed_time: 178.48913526535034s
719
+ length of dataloader: 28 14336
720
+ [GPU memory] before trainer 2.292407512664795
721
+ length of dataloader: 28 14336
722
+ [GPU memory] before trainer 2.292407512664795
723
+ [2025-07-01 08:52:32] Rank 2: Timer for terminate callback has been set.
724
+ Total limit: 240min
725
+ Pre terminate time: 10min elapsed_time: 181.0904836654663s
726
+ length of dataloader: 28 14336
727
+ [GPU memory] before trainer 2.292407512664795
728
+ length of dataloader: 28 14336
729
+ [GPU memory] before trainer 2.292407512664795
730
+ length of dataloader: 28 14336
731
+ [GPU memory] before trainer 2.292407512664795
732
+ length of dataloader: 28 14336
733
+ [GPU memory] before trainer 2.292407512664795
734
+ length of dataloader: 28 14336
735
+ length of dataloader: 28 14336
736
+ [GPU memory] before trainer 2.292407512664795
737
+ [GPU memory] before trainer 2.292407512664795
738
+ [2025-07-01 08:52:32] Rank 7: Timer for terminate callback has been set.
739
+ Total limit: 240min
740
+ Pre terminate time: 10min elapsed_time: 181.38074707984924s
741
+ [2025-07-01 08:52:32] Rank 0: Timer for terminate callback has been set.
742
+ Total limit: 240min
743
+ Pre terminate time: 10min elapsed_time: 181.12232398986816s
744
+ [2025-07-01 08:52:32] Rank 4: Timer for terminate callback has been set.
745
+ Total limit: 240min
746
+ Pre terminate time: 10min elapsed_time: 181.12325024604797s
747
+ length of dataloader: 28 14336
748
+ [GPU memory] before trainer 2.292407512664795
749
+ length of dataloader: 28 14336
750
+ [GPU memory] before trainer 2.292407512664795
751
+ length of dataloader: 28 14336
752
+ [GPU memory] before trainer 2.292407512664795
753
+ length of dataloader: 28 14336
754
+ [GPU memory] before trainer 2.292407512664795
755
+ length of dataloader: 28 14336
756
+ [GPU memory] before trainer 2.292407512664795
757
+ length of dataloader: 28 14336
758
+ [GPU memory] before trainer 2.292407512664795
759
+ length of dataloader: 28 14336
760
+ [GPU memory] before trainer 2.292407512664795
761
+ length of dataloader: 28 14336
762
+ [GPU memory] before trainer 2.292407512664795
763
+ [2025-07-01 08:52:32] Rank 44: Timer for terminate callback has been set.
764
+ Total limit: 240min
765
+ Pre terminate time: 10min elapsed_time: 180.46628999710083s
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038254.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038254.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038254
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
7
+ NODE_RANK = 4
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01504
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038254
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
21
+ NODE_RANK = 3
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-01504
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038254
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ SLURM_JOB_ID = 1038254
35
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
36
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
37
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
38
+ NNODES = 8
39
+ SLURM_JOB_ID = 1038254
40
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
41
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
42
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
43
+ NNODES = 8
44
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
45
+ NODE_RANK = 0
46
+ GPUS_PER_NODE = 8
47
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
48
+ NODE_RANK = 6
49
+ GPUS_PER_NODE = 8
50
+ MASTER_ADDR = pool0-01504
51
+ MASTER_PORT = 25001
52
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
53
+ GRADIENT_ACCUMULATION_STEPS = 4
54
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
55
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
56
+ NODE_RANK = 5
57
+ DEFAULT_LEARNING_RATE: 2e-5
58
+ GPUS_PER_NODE = 8
59
+ MASTER_ADDR = pool0-01504
60
+ MASTER_PORT = 25001
61
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
62
+ GRADIENT_ACCUMULATION_STEPS = 4
63
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
64
+ DEFAULT_LEARNING_RATE: 2e-5
65
+ MASTER_ADDR = pool0-01504
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038254
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
77
+ NODE_RANK = 1
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01504
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038254
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
91
+ NODE_RANK = 2
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01504
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038254
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01504 pool0-01641 pool0-02076 pool0-01787 pool0-02443 pool0-02338 pool0-02349 pool0-02374
105
+ NODE_RANK = 7
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01504
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 08:54:00,867] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 08:54:00,974] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 08:54:01,467] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 08:54:01,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 08:54:01,572] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 08:54:01,576] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 08:54:01,592] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 08:54:01,612] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 08:54:01,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 08:54:01,769] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 08:54:01,775] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 08:54:01,779] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 08:54:01,783] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 08:54:01,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 08:54:01,880] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
130
+ [2025-07-01 08:54:01,960] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
131
+ [2025-07-01 08:54:01,962] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
132
+ [2025-07-01 08:54:02,013] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
133
+ [2025-07-01 08:54:02,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
134
+ [2025-07-01 08:54:02,035] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
135
+ [2025-07-01 08:54:02,048] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
136
+ [2025-07-01 08:54:02,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
137
+ [2025-07-01 08:54:02,099] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
138
+ [2025-07-01 08:54:02,101] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 08:54:02,102] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 08:54:02,114] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 08:54:02,145] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 08:54:02,187] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 08:54:02,188] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 08:54:02,189] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 08:54:02,193] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 08:54:02,194] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 08:54:02,265] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 08:54:02,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 08:54:02,383] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 08:54:02,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 08:54:02,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 08:54:02,656] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 08:54:02,664] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 08:54:02,693] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 08:54:02,731] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 08:54:02,756] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 08:54:02,776] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 08:54:02,782] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 08:54:02,785] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 08:54:02,788] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 08:54:02,791] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 08:54:02,835] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 08:54:02,839] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 08:54:02,841] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 08:54:02,851] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 08:54:03,212] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 08:54:03,217] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 08:54:03,245] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 08:54:03,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 08:54:03,254] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 08:54:04,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
178
+ [2025-07-01 08:54:04,150] [INFO] [comm.py:594:init_distributed] cdb=None
179
+ [2025-07-01 08:54:04,274] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
180
+ [2025-07-01 08:54:04,274] [INFO] [comm.py:594:init_distributed] cdb=None
181
+ [2025-07-01 08:54:04,839] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
182
+ [2025-07-01 08:54:04,839] [INFO] [comm.py:594:init_distributed] cdb=None
183
+ [2025-07-01 08:54:04,847] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
184
+ [2025-07-01 08:54:04,847] [INFO] [comm.py:594:init_distributed] cdb=None
185
+ [2025-07-01 08:54:04,895] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
186
+ [2025-07-01 08:54:04,895] [INFO] [comm.py:594:init_distributed] cdb=None
187
+ [2025-07-01 08:54:04,913] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
188
+ [2025-07-01 08:54:04,913] [INFO] [comm.py:594:init_distributed] cdb=None
189
+ [2025-07-01 08:54:04,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
190
+ [2025-07-01 08:54:04,932] [INFO] [comm.py:594:init_distributed] cdb=None
191
+ [2025-07-01 08:54:05,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
192
+ [2025-07-01 08:54:05,229] [INFO] [comm.py:594:init_distributed] cdb=None
193
+ [2025-07-01 08:54:05,229] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
194
+ [2025-07-01 08:54:05,261] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
195
+ [2025-07-01 08:54:05,261] [INFO] [comm.py:594:init_distributed] cdb=None
196
+ [2025-07-01 08:54:05,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
197
+ [2025-07-01 08:54:05,276] [INFO] [comm.py:594:init_distributed] cdb=None
198
+ [2025-07-01 08:54:05,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
199
+ [2025-07-01 08:54:05,281] [INFO] [comm.py:594:init_distributed] cdb=None
200
+ [2025-07-01 08:54:05,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
201
+ [2025-07-01 08:54:05,286] [INFO] [comm.py:594:init_distributed] cdb=None
202
+ [2025-07-01 08:54:05,287] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
203
+ [2025-07-01 08:54:05,287] [INFO] [comm.py:594:init_distributed] cdb=None
204
+ [2025-07-01 08:54:05,292] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
205
+ [2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None
206
+ [2025-07-01 08:54:05,293] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
207
+ [2025-07-01 08:54:05,293] [INFO] [comm.py:594:init_distributed] cdb=None
208
+ [2025-07-01 08:54:05,295] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
209
+ [2025-07-01 08:54:05,295] [INFO] [comm.py:594:init_distributed] cdb=None
210
+ [2025-07-01 08:54:05,297] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
211
+ [2025-07-01 08:54:05,297] [INFO] [comm.py:594:init_distributed] cdb=None
212
+ [2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
213
+ [2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None
214
+ [2025-07-01 08:54:05,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
215
+ [2025-07-01 08:54:05,303] [INFO] [comm.py:594:init_distributed] cdb=None
216
+ [2025-07-01 08:54:05,340] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
217
+ [2025-07-01 08:54:05,340] [INFO] [comm.py:594:init_distributed] cdb=None
218
+ [2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
219
+ [2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None
220
+ [2025-07-01 08:54:05,391] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
221
+ [2025-07-01 08:54:05,391] [INFO] [comm.py:594:init_distributed] cdb=None
222
+ [2025-07-01 08:54:05,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
223
+ [2025-07-01 08:54:05,392] [INFO] [comm.py:594:init_distributed] cdb=None
224
+ [2025-07-01 08:54:05,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
225
+ [2025-07-01 08:54:05,393] [INFO] [comm.py:594:init_distributed] cdb=None
226
+ [2025-07-01 08:54:05,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
227
+ [2025-07-01 08:54:05,395] [INFO] [comm.py:594:init_distributed] cdb=None
228
+ [2025-07-01 08:54:05,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
229
+ [2025-07-01 08:54:05,429] [INFO] [comm.py:594:init_distributed] cdb=None
230
+ [2025-07-01 08:54:05,462] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
231
+ [2025-07-01 08:54:05,462] [INFO] [comm.py:594:init_distributed] cdb=None
232
+ [2025-07-01 08:54:05,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
233
+ [2025-07-01 08:54:05,478] [INFO] [comm.py:594:init_distributed] cdb=None
234
+ [2025-07-01 08:54:05,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
235
+ [2025-07-01 08:54:05,539] [INFO] [comm.py:594:init_distributed] cdb=None
236
+ [2025-07-01 08:54:05,575] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
237
+ [2025-07-01 08:54:05,575] [INFO] [comm.py:594:init_distributed] cdb=None
238
+ [2025-07-01 08:54:05,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
239
+ [2025-07-01 08:54:05,685] [INFO] [comm.py:594:init_distributed] cdb=None
240
+ [2025-07-01 08:54:05,698] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
241
+ [2025-07-01 08:54:05,698] [INFO] [comm.py:594:init_distributed] cdb=None
242
+ [2025-07-01 08:54:05,801] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
243
+ [2025-07-01 08:54:05,801] [INFO] [comm.py:594:init_distributed] cdb=None
244
+ [2025-07-01 08:54:05,816] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
245
+ [2025-07-01 08:54:05,817] [INFO] [comm.py:594:init_distributed] cdb=None
246
+ [2025-07-01 08:54:05,843] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
247
+ [2025-07-01 08:54:05,843] [INFO] [comm.py:594:init_distributed] cdb=None
248
+ [2025-07-01 08:54:05,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
249
+ [2025-07-01 08:54:05,862] [INFO] [comm.py:594:init_distributed] cdb=None
250
+ [2025-07-01 08:54:05,965] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
251
+ [2025-07-01 08:54:05,965] [INFO] [comm.py:594:init_distributed] cdb=None
252
+ [2025-07-01 08:54:05,994] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
253
+ [2025-07-01 08:54:05,995] [INFO] [comm.py:594:init_distributed] cdb=None
254
+ [2025-07-01 08:54:05,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
255
+ [2025-07-01 08:54:05,997] [INFO] [comm.py:594:init_distributed] cdb=None
256
+ [2025-07-01 08:54:06,071] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
257
+ [2025-07-01 08:54:06,071] [INFO] [comm.py:594:init_distributed] cdb=None
258
+ [2025-07-01 08:54:06,124] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
259
+ [2025-07-01 08:54:06,124] [INFO] [comm.py:594:init_distributed] cdb=None
260
+ [2025-07-01 08:54:06,126] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
261
+ [2025-07-01 08:54:06,126] [INFO] [comm.py:594:init_distributed] cdb=None
262
+ [2025-07-01 08:54:06,131] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
263
+ [2025-07-01 08:54:06,131] [INFO] [comm.py:594:init_distributed] cdb=None
264
+ [2025-07-01 08:54:06,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
265
+ [2025-07-01 08:54:06,149] [INFO] [comm.py:594:init_distributed] cdb=None
266
+ [2025-07-01 08:54:06,173] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
267
+ [2025-07-01 08:54:06,173] [INFO] [comm.py:594:init_distributed] cdb=None
268
+ [2025-07-01 08:54:06,186] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
269
+ [2025-07-01 08:54:06,186] [INFO] [comm.py:594:init_distributed] cdb=None
270
+ [2025-07-01 08:54:06,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
271
+ [2025-07-01 08:54:06,217] [INFO] [comm.py:594:init_distributed] cdb=None
272
+ [2025-07-01 08:54:06,236] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
273
+ [2025-07-01 08:54:06,236] [INFO] [comm.py:594:init_distributed] cdb=None
274
+ [2025-07-01 08:54:06,269] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
275
+ [2025-07-01 08:54:06,269] [INFO] [comm.py:594:init_distributed] cdb=None
276
+ [2025-07-01 08:54:06,278] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
277
+ [2025-07-01 08:54:06,278] [INFO] [comm.py:594:init_distributed] cdb=None
278
+ [2025-07-01 08:54:06,285] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
279
+ [2025-07-01 08:54:06,285] [INFO] [comm.py:594:init_distributed] cdb=None
280
+ [2025-07-01 08:54:06,289] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
281
+ [2025-07-01 08:54:06,289] [INFO] [comm.py:594:init_distributed] cdb=None
282
+ [2025-07-01 08:54:06,291] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
283
+ [2025-07-01 08:54:06,291] [INFO] [comm.py:594:init_distributed] cdb=None
284
+ [2025-07-01 08:54:06,299] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
285
+ [2025-07-01 08:54:06,299] [INFO] [comm.py:594:init_distributed] cdb=None
286
+ [2025-07-01 08:54:06,302] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
287
+ [2025-07-01 08:54:06,302] [INFO] [comm.py:594:init_distributed] cdb=None
288
+ [2025-07-01 08:54:06,313] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
289
+ [2025-07-01 08:54:06,313] [INFO] [comm.py:594:init_distributed] cdb=None
290
+ [2025-07-01 08:54:06,539] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
291
+ [2025-07-01 08:54:06,539] [INFO] [comm.py:594:init_distributed] cdb=None
292
+ [2025-07-01 08:54:06,666] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
293
+ [2025-07-01 08:54:06,666] [INFO] [comm.py:594:init_distributed] cdb=None
294
+ [2025-07-01 08:54:06,687] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
295
+ [2025-07-01 08:54:06,687] [INFO] [comm.py:594:init_distributed] cdb=None
296
+ [2025-07-01 08:54:06,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 08:54:06,691] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 08:54:06,711] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 08:54:06,711] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 08:54:06,729] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 08:54:06,730] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 08:54:06,735] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 08:54:06,735] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 08:54:06,776] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 08:54:06,776] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 08:54:23,138] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 08:54:31,001] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 08:54:31,607] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
326
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
327
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
331
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
334
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
335
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
347
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
348
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
355
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
356
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
357
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
358
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
359
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
360
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
361
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
362
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
363
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
364
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
365
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
366
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
367
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
368
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
369
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
370
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
371
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
372
+ [dist-0-of-64] LlavaLlamaModel(
373
+ (llm): Qwen2ForCausalLM(
374
+ (model): Qwen2Model(
375
+ (embed_tokens): Embedding(151648, 3584)
376
+ (layers): ModuleList(
377
+ (0-27): 28 x Qwen2DecoderLayer(
378
+ (self_attn): Qwen2FlashAttention2(
379
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
380
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
381
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
382
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
383
+ (rotary_emb): Qwen2RotaryEmbedding()
384
+ )
385
+ (mlp): Qwen2MLP(
386
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
387
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
388
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
389
+ (act_fn): SiLU()
390
+ )
391
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
392
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
393
+ )
394
+ )
395
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
396
+ (rotary_emb): Qwen2RotaryEmbedding()
397
+ )
398
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
399
+ )
400
+ (vision_tower): SiglipVisionTower(
401
+ (vision_tower): SiglipVisionModel(
402
+ (vision_model): SiglipVisionTransformer(
403
+ (embeddings): SiglipVisionEmbeddings(
404
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
405
+ (position_embedding): Embedding(1024, 1152)
406
+ )
407
+ (encoder): SiglipEncoder(
408
+ (layers): ModuleList(
409
+ (0-26): 27 x SiglipEncoderLayer(
410
+ (self_attn): SiglipFlashAttention2(
411
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
412
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
413
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
414
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
415
+ )
416
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
417
+ (mlp): SiglipMLP(
418
+ (activation_fn): PytorchGELUTanh()
419
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
420
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
421
+ )
422
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
423
+ )
424
+ )
425
+ )
426
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
427
+ )
428
+ )
429
+ )
430
+ (mm_projector): MultimodalProjector(
431
+ (layers): Sequential(
432
+ (0): DownSample3x3BlockFix()
433
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
434
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
435
+ (3): GELU(approximate='none')
436
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
437
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
438
+ (6): GELU(approximate='none')
439
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
440
+ )
441
+ )
442
+ )
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 08:57:05] Rank 62: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 179.40018558502197s
451
+ [2025-07-01 08:57:05] Rank 18: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 178.6438548564911s
454
+ [2025-07-01 08:57:05] Rank 1: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 179.9796986579895s
457
+ length of dataloader: 28 14336
458
+ [GPU memory] before trainer 2.292407512664795
459
+ [2025-07-01 08:57:05] Rank 40: Timer for terminate callback has been set.
460
+ Total limit: 240min
461
+ Pre terminate time: 10min elapsed_time: 179.92801594734192s
462
+ [2025-07-01 08:57:05] Rank 13: Timer for terminate callback has been set.
463
+ Total limit: 240min
464
+ Pre terminate time: 10min elapsed_time: 180.29648852348328s
465
+ [2025-07-01 08:57:05] Rank 50: Timer for terminate callback has been set.
466
+ Total limit: 240min
467
+ Pre terminate time: 10min elapsed_time: 179.9351155757904s
468
+ length of dataloader: 28 14336
469
+ [GPU memory] before trainer 2.292407512664795
470
+ length of dataloader: 28 14336
471
+ [GPU memory] before trainer 2.292407512664795
472
+ length of dataloader: 28 14336
473
+ [GPU memory] before trainer 2.292407512664795
474
+ length of dataloader: 28 14336
475
+ [GPU memory] before trainer 2.292407512664795
476
+ length of dataloader: 28 14336
477
+ [GPU memory] before trainer 2.292407512664795
478
+ [2025-07-01 08:57:05] Rank 8: Timer for terminate callback has been set.
479
+ Total limit: 240min
480
+ Pre terminate time: 10min elapsed_time: 180.53496980667114s
481
+ [2025-07-01 08:57:05] Rank 34: Timer for terminate callback has been set.
482
+ Total limit: 240min
483
+ Pre terminate time: 10min elapsed_time: 179.31973552703857s
484
+ [2025-07-01 08:57:05] Rank 6: Timer for terminate callback has been set.
485
+ Total limit: 240min
486
+ Pre terminate time: 10min elapsed_time: 180.11812615394592s
487
+ [2025-07-01 08:57:05] Rank 37: Timer for terminate callback has been set.
488
+ Total limit: 240min
489
+ Pre terminate time: 10min elapsed_time: 179.35892605781555s
490
+ [2025-07-01 08:57:05] Rank 30: Timer for terminate callback has been set.
491
+ Total limit: 240min
492
+ Pre terminate time: 10min elapsed_time: 179.2960913181305s
493
+ length of dataloader: 28 14336
494
+ [GPU memory] before trainer 2.292407512664795
495
+ [2025-07-01 08:57:05] Rank 21: Timer for terminate callback has been set.
496
+ Total limit: 240min
497
+ Pre terminate time: 10min elapsed_time: 178.88449788093567s
498
+ length of dataloader: 28 14336
499
+ [GPU memory] before trainer 2.292407512664795
500
+ length of dataloader: 28 14336
501
+ [GPU memory] before trainer 2.292407512664795
502
+ length of dataloader: 28 14336
503
+ [GPU memory] before trainer 2.292407512664795
504
+ length of dataloader: 28 14336
505
+ [GPU memory] before trainer 2.292407512664795
506
+ [2025-07-01 08:57:05] Rank 14: Timer for terminate callback has been set.
507
+ Total limit: 240min
508
+ Pre terminate time: 10min elapsed_time: 180.5244917869568s
509
+ [2025-07-01 08:57:05] Rank 51: Timer for terminate callback has been set.
510
+ Total limit: 240min
511
+ Pre terminate time: 10min elapsed_time: 180.05884408950806s
512
+ [2025-07-01 08:57:05] Rank 24: Timer for terminate callback has been set.
513
+ Total limit: 240min
514
+ Pre terminate time: 10min elapsed_time: 179.45844531059265s
515
+ length of dataloader: 28 14336
516
+ [GPU memory] before trainer 2.292407512664795
517
+ [2025-07-01 08:57:05] Rank 59: Timer for terminate callback has been set.
518
+ Total limit: 240min
519
+ Pre terminate time: 10min elapsed_time: 179.76254534721375s
520
+ length of dataloader: 28 14336
521
+ [GPU memory] before trainer 2.292407512664795
522
+ [2025-07-01 08:57:05] Rank 46: Timer for terminate callback has been set.
523
+ Total limit: 240min
524
+ Pre terminate time: 10min elapsed_time: 180.31962299346924s
525
+ [2025-07-01 08:57:05] Rank 27: Timer for terminate callback has been set.
526
+ Total limit: 240min
527
+ Pre terminate time: 10min elapsed_time: 179.5118260383606s
528
+ length of dataloader: 28 14336
529
+ [GPU memory] before trainer 2.292407512664795
530
+ [2025-07-01 08:57:05] Rank 31: Timer for terminate callback has been set.
531
+ Total limit: 240min
532
+ Pre terminate time: 10min elapsed_time: 179.44188499450684s
533
+ [2025-07-01 08:57:05] Rank 42: Timer for terminate callback has been set.
534
+ Total limit: 240min
535
+ Pre terminate time: 10min elapsed_time: 181.3304316997528s
536
+ [2025-07-01 08:57:05] Rank 12: Timer for terminate callback has been set.
537
+ Total limit: 240min
538
+ Pre terminate time: 10min elapsed_time: 180.74464964866638s
539
+ length of dataloader: 28 14336
540
+ [GPU memory] before trainer 2.292407512664795
541
+ [2025-07-01 08:57:05] Rank 44: Timer for terminate callback has been set.
542
+ Total limit: 240min
543
+ Pre terminate time: 10min elapsed_time: 180.2750883102417s
544
+ [2025-07-01 08:57:05] Rank 22: Timer for terminate callback has been set.
545
+ Total limit: 240min
546
+ Pre terminate time: 10min elapsed_time: 179.0400447845459s
547
+ [2025-07-01 08:57:05] Rank 41: Timer for terminate callback has been set.
548
+ Total limit: 240min
549
+ Pre terminate time: 10min elapsed_time: 180.36503434181213s
550
+ [2025-07-01 08:57:05] Rank 29: Timer for terminate callback has been set.
551
+ Total limit: 240min
552
+ Pre terminate time: 10min elapsed_time: 179.6715350151062s
553
+ [2025-07-01 08:57:05] Rank 9: Timer for terminate callback has been set.
554
+ Total limit: 240min
555
+ Pre terminate time: 10min elapsed_time: 180.794335603714s
556
+ length of dataloader: 28 14336
557
+ [GPU memory] before trainer 2.292407512664795
558
+ [2025-07-01 08:57:05] Rank 47: Timer for terminate callback has been set.
559
+ Total limit: 240min
560
+ Pre terminate time: 10min elapsed_time: 180.25037503242493s
561
+ [2025-07-01 08:57:05] Rank 53: Timer for terminate callback has been set.
562
+ Total limit: 240min
563
+ Pre terminate time: 10min elapsed_time: 180.32664608955383s
564
+ [2025-07-01 08:57:05] Rank 26: Timer for terminate callback has been set.
565
+ Total limit: 240min
566
+ Pre terminate time: 10min elapsed_time: 179.59264469146729s
567
+ length of dataloader: 28 14336
568
+ [GPU memory] before trainer 2.292407512664795
569
+ [2025-07-01 08:57:05] Rank 0: Timer for terminate callback has been set.
570
+ Total limit: 240min
571
+ Pre terminate time: 10min elapsed_time: 180.4588851928711s
572
+ length of dataloader: 28 14336
573
+ [GPU memory] before trainer 2.292407512664795
574
+ length of dataloader: 28 14336
575
+ [GPU memory] before trainer 2.292407512664795
576
+ [2025-07-01 08:57:05] Rank 25: Timer for terminate callback has been set.
577
+ Total limit: 240min
578
+ Pre terminate time: 10min elapsed_time: 179.56515669822693s
579
+ length of dataloader: 28 14336
580
+ [GPU memory] before trainer 2.292407512664795
581
+ length of dataloader: 28 14336
582
+ [GPU memory] before trainer 2.292407512664795
583
+ length of dataloader: 28 14336
584
+ [GPU memory] before trainer 2.292407512664795
585
+ [2025-07-01 08:57:05] Rank 10: Timer for terminate callback has been set.
586
+ Total limit: 240min
587
+ Pre terminate time: 10min elapsed_time: 180.8198959827423s
588
+ [2025-07-01 08:57:05] Rank 33: Timer for terminate callback has been set.
589
+ Total limit: 240min
590
+ Pre terminate time: 10min elapsed_time: 179.61562252044678s
591
+ [2025-07-01 08:57:05] Rank 55: Timer for terminate callback has been set.
592
+ Total limit: 240min
593
+ Pre terminate time: 10min elapsed_time: 180.15942478179932s
594
+ length of dataloader: 28 14336
595
+ [2025-07-01 08:57:05] Rank 39: Timer for terminate callback has been set.
596
+ Total limit: 240min
597
+ [GPU memory] before trainer 2.292407512664795
598
+ Pre terminate time: 10min elapsed_time: 179.74457502365112s
599
+ [2025-07-01 08:57:05] Rank 15: Timer for terminate callback has been set.
600
+ Total limit: 240min
601
+ Pre terminate time: 10min elapsed_time: 181.5756447315216s
602
+ length of dataloader: 28 14336
603
+ [GPU memory] before trainer 2.292407512664795
604
+ length of dataloader: 28 14336
605
+ [GPU memory] before trainer 2.292407512664795
606
+ length of dataloader: 28 14336
607
+ [GPU memory] before trainer 2.292407512664795
608
+ [2025-07-01 08:57:05] Rank 49: Timer for terminate callback has been set.
609
+ Total limit: 240min
610
+ Pre terminate time: 10min elapsed_time: 180.26583528518677s
611
+ length of dataloader: 28 14336
612
+ [GPU memory] before trainer 2.292407512664795
613
+ [2025-07-01 08:57:05] Rank 11: Timer for terminate callback has been set.
614
+ Total limit: 240min
615
+ Pre terminate time: 10min elapsed_time: 180.88644003868103s
616
+ [2025-07-01 08:57:05] Rank 60: Timer for terminate callback has been set.
617
+ Total limit: 240min
618
+ Pre terminate time: 10min elapsed_time: 179.95006847381592s
619
+ [2025-07-01 08:57:05] Rank 61: Timer for terminate callback has been set.
620
+ Total limit: 240min
621
+ Pre terminate time: 10min elapsed_time: 180.19447827339172s
622
+ length of dataloader: 28 14336
623
+ length of dataloader: 28 14336
624
+ [GPU memory] before trainer 2.292407512664795
625
+ [GPU memory] before trainer 2.292407512664795
626
+ [2025-07-01 08:57:05] Rank 45: Timer for terminate callback has been set.
627
+ Total limit: 240min
628
+ Pre terminate time: 10min elapsed_time: 180.3910529613495s
629
+ [2025-07-01 08:57:05] Rank 58: Timer for terminate callback has been set.
630
+ Total limit: 240min
631
+ Pre terminate time: 10min elapsed_time: 180.2317771911621s
632
+ [2025-07-01 08:57:05] Rank 32: Timer for terminate callback has been set.
633
+ Total limit: 240min
634
+ Pre terminate time: 10min elapsed_time: 180.3425772190094s
635
+ length of dataloader: 28 14336
636
+ [GPU memory] before trainer 2.292407512664795
637
+ [2025-07-01 08:57:05] Rank 36: Timer for terminate callback has been set.
638
+ Total limit: 240min
639
+ Pre terminate time: 10min elapsed_time: 179.71157217025757s
640
+ [2025-07-01 08:57:05] Rank 28: Timer for terminate callback has been set.
641
+ Total limit: 240min
642
+ Pre terminate time: 10min elapsed_time: 179.64928483963013s
643
+ [2025-07-01 08:57:05] Rank 20: Timer for terminate callback has been set.
644
+ Total limit: 240min
645
+ Pre terminate time: 10min elapsed_time: 179.196674823761s
646
+ length of dataloader: 28 14336
647
+ [GPU memory] before trainer 2.292407512664795
648
+ [2025-07-01 08:57:05] Rank 57: Timer for terminate callback has been set.
649
+ Total limit: 240min
650
+ Pre terminate time: 10min elapsed_time: 180.0103840827942s
651
+ [2025-07-01 08:57:05] Rank 2: Timer for terminate callback has been set.
652
+ Total limit: 240min
653
+ Pre terminate time: 10min elapsed_time: 180.53355860710144s
654
+ [2025-07-01 08:57:05] Rank 19: Timer for terminate callback has been set.
655
+ Total limit: 240min
656
+ Pre terminate time: 10min elapsed_time: 179.20799660682678s
657
+ length of dataloader: 28 14336
658
+ length of dataloader: [GPU memory] before trainer28 14336
659
+ 2.292407512664795
660
+ [GPU memory] before trainer 2.292407512664795
661
+ [2025-07-01 08:57:05] Rank 43: Timer for terminate callback has been set.
662
+ Total limit: 240min
663
+ Pre terminate time: 10min elapsed_time: 180.38389587402344s
664
+ length of dataloader: 28 14336
665
+ [GPU memory] before trainer 2.292407512664795
666
+ length of dataloader: 28 14336
667
+ [GPU memory] before trainer 2.292407512664795
668
+ length of dataloader: 28 14336
669
+ [GPU memory] before trainer 2.292407512664795
670
+ length of dataloader: 28 14336
671
+ [GPU memory] before trainer 2.292407512664795
672
+ [2025-07-01 08:57:05] Rank 3: Timer for terminate callback has been set.
673
+ Total limit: 240min
674
+ Pre terminate time: 10min elapsed_time: 180.54649567604065s
675
+ [2025-07-01 08:57:05] Rank 52: Timer for terminate callback has been set.
676
+ Total limit: 240min
677
+ Pre terminate time: 10min elapsed_time: 180.3105661869049s
678
+ [2025-07-01 08:57:05] Rank 5: Timer for terminate callback has been set.
679
+ Total limit: 240min
680
+ Pre terminate time: 10min elapsed_time: 180.55701327323914s
681
+ [2025-07-01 08:57:05] Rank 54: Timer for terminate callback has been set.
682
+ Total limit: 240min
683
+ Pre terminate time: 10min elapsed_time: 180.3739037513733s
684
+ length of dataloader: 28 14336
685
+ [GPU memory] before trainer 2.292407512664795
686
+ [2025-07-01 08:57:05] Rank 17: Timer for terminate callback has been set.
687
+ Total limit: 240min
688
+ Pre terminate time: 10min elapsed_time: 179.306396484375s
689
+ [2025-07-01 08:57:05] Rank 16: Timer for terminate callback has been set.
690
+ Total limit: 240min
691
+ Pre terminate time: 10min elapsed_time: 179.22698402404785s
692
+ length of dataloader: 28 14336
693
+ [GPU memory] before trainer 2.292407512664795
694
+ length of dataloader: 28 14336
695
+ [GPU memory] before trainer 2.292407512664795
696
+ [2025-07-01 08:57:05] Rank 4: Timer for terminate callback has been set.
697
+ Total limit: 240min
698
+ Pre terminate time: 10min elapsed_time: 180.58894228935242s
699
+ length of dataloader: 28 14336
700
+ length of dataloader: 28 14336
701
+ [GPU memory] before trainer 2.292407512664795
702
+ [GPU memory] before trainer 2.292407512664795
703
+ [2025-07-01 08:57:05] Rank 38: Timer for terminate callback has been set.
704
+ Total limit: 240min
705
+ Pre terminate time: 10min elapsed_time: 179.79484272003174s
706
+ length of dataloader: 28 14336
707
+ [GPU memory] before trainer 2.292407512664795
708
+ length of dataloader: 28 14336
709
+ [GPU memory] before trainer 2.292407512664795
710
+ length of dataloader: 28 14336
711
+ [GPU memory] before trainer 2.292407512664795
712
+ length of dataloader: 28 14336
713
+ [GPU memory] before trainer 2.292407512664795
714
+ [2025-07-01 08:57:05] Rank 23: Timer for terminate callback has been set.
715
+ Total limit: 240min
716
+ Pre terminate time: 10min elapsed_time: 179.23841524124146s
717
+ length of dataloader: 28 14336
718
+ [GPU memory] before trainer 2.292407512664795
719
+ length of dataloader: 28 14336
720
+ [GPU memory] before trainer 2.292407512664795
721
+ length of dataloader: 28 14336
722
+ [GPU memory] before trainer 2.292407512664795
723
+ length of dataloader: 28 14336
724
+ [GPU memory] before trainer 2.292407512664795
725
+ [2025-07-01 08:57:05] Rank 35: Timer for terminate callback has been set.
726
+ Total limit: 240min
727
+ Pre terminate time: 10min elapsed_time: 179.92344903945923s
728
+ length of dataloader: 28 14336
729
+ [GPU memory] before trainer 2.292407512664795
730
+ length of dataloader: 28 14336
731
+ [GPU memory] before trainer 2.292407512664795
732
+ length of dataloader: 28 14336
733
+ [GPU memory] before trainer 2.292407512664795
734
+ length of dataloader: 28 14336
735
+ [GPU memory] before trainer 2.292407512664795
736
+ [2025-07-01 08:57:05] Rank 56: Timer for terminate callback has been set.
737
+ Total limit: 240min
738
+ Pre terminate time: 10min elapsed_time: 180.1525583267212s
739
+ length of dataloader: 28 14336
740
+ [GPU memory] before trainer 2.292407512664795
741
+ length of dataloader: 28 14336
742
+ [GPU memory] before trainer 2.292407512664795
743
+ length of dataloader: 28 14336
744
+ [GPU memory] before trainer 2.292407512664795
745
+ length of dataloader: 28 14336
746
+ [GPU memory] before trainer 2.292407512664795
747
+ [2025-07-01 08:57:05] Rank 48: Timer for terminate callback has been set.
748
+ Total limit: 240min
749
+ Pre terminate time: 10min elapsed_time: 180.58931589126587s
750
+ length of dataloader: 28 14336
751
+ [GPU memory] before trainer 2.292407512664795
752
+ length of dataloader: 28 14336
753
+ [GPU memory] before trainer 2.292407512664795
754
+ length of dataloader: 28 14336
755
+ [GPU memory] before trainer 2.292407512664795
756
+ length of dataloader: 28 14336
757
+ [GPU memory] before trainer 2.292407512664795
758
+ [2025-07-01 08:57:05] Rank 7: Timer for terminate callback has been set.
759
+ Total limit: 240min
760
+ Pre terminate time: 10min elapsed_time: 180.75982356071472s
761
+ [2025-07-01 08:57:06] Rank 63: Timer for terminate callback has been set.
762
+ Total limit: 240min
763
+ Pre terminate time: 10min elapsed_time: 180.29284620285034s
764
+ length of dataloader: 28 14336
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038255.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038255.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038255
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
7
+ NODE_RANK = 1
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01868
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038255
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ SLURM_JOB_ID = 1038255
21
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
22
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
23
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
24
+ NNODES = 8
25
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
26
+ NODE_RANK = 5
27
+ GPUS_PER_NODE = 8
28
+ MASTER_ADDR = pool0-01868
29
+ MASTER_PORT = 25001
30
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
31
+ GRADIENT_ACCUMULATION_STEPS = 4
32
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
33
+ DEFAULT_LEARNING_RATE: 2e-5
34
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
35
+ NODE_RANK = 6
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-01868
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038255
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
49
+ NODE_RANK = 4
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-01868
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038255
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
63
+ NODE_RANK = 7
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-01868
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038255
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
77
+ NODE_RANK = 2
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01868
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038255
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
91
+ NODE_RANK = 0
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01868
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038255
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01868 pool0-01869 pool0-01894 pool0-01900 pool0-01911 pool0-01921 pool0-02007 pool0-02015
105
+ NODE_RANK = 3
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01868
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 08:58:36,726] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 08:58:36,743] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
130
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
131
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
132
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
133
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
134
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
135
+ [2025-07-01 08:58:36,807] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
136
+ [2025-07-01 08:58:36,808] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
137
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
138
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 08:58:36,934] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 08:58:36,969] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 08:58:36,970] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 08:58:37,174] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 08:58:37,482] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 08:58:37,483] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 08:58:37,827] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 08:58:37,828] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 08:58:37,997] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 08:58:37,998] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 08:58:49,040] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
178
+ [2025-07-01 08:58:49,040] [INFO] [comm.py:594:init_distributed] cdb=None
179
+ [2025-07-01 08:58:49,044] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
180
+ [2025-07-01 08:58:49,044] [INFO] [comm.py:594:init_distributed] cdb=None
181
+ [2025-07-01 08:58:49,050] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
182
+ [2025-07-01 08:58:49,050] [INFO] [comm.py:594:init_distributed] cdb=None
183
+ [2025-07-01 08:58:49,055] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
184
+ [2025-07-01 08:58:49,055] [INFO] [comm.py:594:init_distributed] cdb=None
185
+ [2025-07-01 08:58:49,088] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
186
+ [2025-07-01 08:58:49,088] [INFO] [comm.py:594:init_distributed] cdb=None
187
+ [2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
188
+ [2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
189
+ [2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
190
+ [2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
191
+ [2025-07-01 08:58:49,089] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
192
+ [2025-07-01 08:58:49,089] [INFO] [comm.py:594:init_distributed] cdb=None
193
+ [2025-07-01 08:58:49,198] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
194
+ [2025-07-01 08:58:49,198] [INFO] [comm.py:594:init_distributed] cdb=None
195
+ [2025-07-01 08:58:49,204] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
196
+ [2025-07-01 08:58:49,204] [INFO] [comm.py:594:init_distributed] cdb=None
197
+ [2025-07-01 08:58:49,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
198
+ [2025-07-01 08:58:49,206] [INFO] [comm.py:594:init_distributed] cdb=None
199
+ [2025-07-01 08:58:49,208] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
200
+ [2025-07-01 08:58:49,208] [INFO] [comm.py:594:init_distributed] cdb=None
201
+ [2025-07-01 08:58:49,211] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
202
+ [2025-07-01 08:58:49,211] [INFO] [comm.py:594:init_distributed] cdb=None
203
+ [2025-07-01 08:58:49,219] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
204
+ [2025-07-01 08:58:49,219] [INFO] [comm.py:594:init_distributed] cdb=None
205
+ [2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
206
+ [2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None
207
+ [2025-07-01 08:58:49,221] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
208
+ [2025-07-01 08:58:49,221] [INFO] [comm.py:594:init_distributed] cdb=None
209
+ [2025-07-01 08:58:49,222] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
210
+ [2025-07-01 08:58:49,222] [INFO] [comm.py:594:init_distributed] cdb=None
211
+ [2025-07-01 08:58:49,226] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
212
+ [2025-07-01 08:58:49,226] [INFO] [comm.py:594:init_distributed] cdb=None
213
+ [2025-07-01 08:58:49,227] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
214
+ [2025-07-01 08:58:49,227] [INFO] [comm.py:594:init_distributed] cdb=None
215
+ [2025-07-01 08:58:49,227] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
216
+ [2025-07-01 08:58:49,228] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
217
+ [2025-07-01 08:58:49,228] [INFO] [comm.py:594:init_distributed] cdb=None
218
+ [2025-07-01 08:58:49,239] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
219
+ [2025-07-01 08:58:49,239] [INFO] [comm.py:594:init_distributed] cdb=None
220
+ [2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
221
+ [2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None
222
+ [2025-07-01 08:58:49,243] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
223
+ [2025-07-01 08:58:49,243] [INFO] [comm.py:594:init_distributed] cdb=None
224
+ [2025-07-01 08:58:49,249] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
225
+ [2025-07-01 08:58:49,249] [INFO] [comm.py:594:init_distributed] cdb=None
226
+ [2025-07-01 08:58:49,252] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
227
+ [2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None
228
+ [2025-07-01 08:58:49,253] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
229
+ [2025-07-01 08:58:49,253] [INFO] [comm.py:594:init_distributed] cdb=None
230
+ [2025-07-01 08:58:49,257] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
231
+ [2025-07-01 08:58:49,257] [INFO] [comm.py:594:init_distributed] cdb=None
232
+ [2025-07-01 08:58:49,260] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
233
+ [2025-07-01 08:58:49,260] [INFO] [comm.py:594:init_distributed] cdb=None
234
+ [2025-07-01 08:58:49,272] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
235
+ [2025-07-01 08:58:49,272] [INFO] [comm.py:594:init_distributed] cdb=None
236
+ [2025-07-01 08:58:49,283] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
237
+ [2025-07-01 08:58:49,283] [INFO] [comm.py:594:init_distributed] cdb=None
238
+ [2025-07-01 08:58:49,286] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
239
+ [2025-07-01 08:58:49,286] [INFO] [comm.py:594:init_distributed] cdb=None
240
+ [2025-07-01 08:58:49,288] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
241
+ [2025-07-01 08:58:49,288] [INFO] [comm.py:594:init_distributed] cdb=None
242
+ [2025-07-01 08:58:49,300] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
243
+ [2025-07-01 08:58:49,300] [INFO] [comm.py:594:init_distributed] cdb=None
244
+ [2025-07-01 08:58:49,304] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
245
+ [2025-07-01 08:58:49,304] [INFO] [comm.py:594:init_distributed] cdb=None
246
+ [2025-07-01 08:58:49,316] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
247
+ [2025-07-01 08:58:49,316] [INFO] [comm.py:594:init_distributed] cdb=None
248
+ [2025-07-01 08:58:49,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
249
+ [2025-07-01 08:58:49,339] [INFO] [comm.py:594:init_distributed] cdb=None
250
+ [2025-07-01 08:58:49,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
251
+ [2025-07-01 08:58:49,349] [INFO] [comm.py:594:init_distributed] cdb=None
252
+ [2025-07-01 08:58:49,595] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
253
+ [2025-07-01 08:58:49,595] [INFO] [comm.py:594:init_distributed] cdb=None
254
+ [2025-07-01 08:58:49,640] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
255
+ [2025-07-01 08:58:49,640] [INFO] [comm.py:594:init_distributed] cdb=None
256
+ [2025-07-01 08:58:49,684] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
257
+ [2025-07-01 08:58:49,684] [INFO] [comm.py:594:init_distributed] cdb=None
258
+ [2025-07-01 08:58:49,925] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
259
+ [2025-07-01 08:58:49,925] [INFO] [comm.py:594:init_distributed] cdb=None
260
+ [2025-07-01 08:58:49,984] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
261
+ [2025-07-01 08:58:49,984] [INFO] [comm.py:594:init_distributed] cdb=None
262
+ [2025-07-01 08:58:49,997] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
263
+ [2025-07-01 08:58:49,997] [INFO] [comm.py:594:init_distributed] cdb=None
264
+ [2025-07-01 08:58:50,001] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
265
+ [2025-07-01 08:58:50,001] [INFO] [comm.py:594:init_distributed] cdb=None
266
+ [2025-07-01 08:58:50,007] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
267
+ [2025-07-01 08:58:50,007] [INFO] [comm.py:594:init_distributed] cdb=None
268
+ [2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
269
+ [2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None
270
+ [2025-07-01 08:58:50,009] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
271
+ [2025-07-01 08:58:50,009] [INFO] [comm.py:594:init_distributed] cdb=None
272
+ [2025-07-01 08:58:50,010] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
273
+ [2025-07-01 08:58:50,010] [INFO] [comm.py:594:init_distributed] cdb=None
274
+ [2025-07-01 08:58:50,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
275
+ [2025-07-01 08:58:50,691] [INFO] [comm.py:594:init_distributed] cdb=None
276
+ [2025-07-01 08:58:50,692] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
277
+ [2025-07-01 08:58:50,692] [INFO] [comm.py:594:init_distributed] cdb=None
278
+ [2025-07-01 08:58:50,723] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
279
+ [2025-07-01 08:58:50,723] [INFO] [comm.py:594:init_distributed] cdb=None
280
+ [2025-07-01 08:58:50,741] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
281
+ [2025-07-01 08:58:50,741] [INFO] [comm.py:594:init_distributed] cdb=None
282
+ [2025-07-01 08:58:50,745] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
283
+ [2025-07-01 08:58:50,745] [INFO] [comm.py:594:init_distributed] cdb=None
284
+ [2025-07-01 08:58:50,754] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
285
+ [2025-07-01 08:58:50,754] [INFO] [comm.py:594:init_distributed] cdb=None
286
+ [2025-07-01 08:58:50,758] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
287
+ [2025-07-01 08:58:50,758] [INFO] [comm.py:594:init_distributed] cdb=None
288
+ [2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
289
+ [2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None
290
+ [2025-07-01 08:58:50,760] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
291
+ [2025-07-01 08:58:50,760] [INFO] [comm.py:594:init_distributed] cdb=None
292
+ [2025-07-01 08:58:50,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
293
+ [2025-07-01 08:58:50,782] [INFO] [comm.py:594:init_distributed] cdb=None
294
+ [2025-07-01 08:58:50,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
295
+ [2025-07-01 08:58:50,821] [INFO] [comm.py:594:init_distributed] cdb=None
296
+ [2025-07-01 08:58:50,831] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 08:58:50,831] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 08:58:50,834] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 08:58:50,834] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 08:58:50,855] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 08:58:50,855] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 08:58:50,860] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 08:58:50,860] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 08:58:50,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 08:58:50,864] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 08:59:04,797] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 08:59:22,183] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 08:59:23,443] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
326
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
327
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
331
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+
334
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
335
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
347
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
348
+
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+
355
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
356
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
357
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
358
+
359
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
360
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
361
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
362
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
363
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
364
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
365
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
366
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
367
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
368
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
369
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
370
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
371
+
372
+ [dist-0-of-64] LlavaLlamaModel(
373
+ (llm): Qwen2ForCausalLM(
374
+ (model): Qwen2Model(
375
+ (embed_tokens): Embedding(151648, 3584)
376
+ (layers): ModuleList(
377
+ (0-27): 28 x Qwen2DecoderLayer(
378
+ (self_attn): Qwen2FlashAttention2(
379
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
380
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
381
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
382
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
383
+ (rotary_emb): Qwen2RotaryEmbedding()
384
+ )
385
+ (mlp): Qwen2MLP(
386
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
387
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
388
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
389
+ (act_fn): SiLU()
390
+ )
391
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
392
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
393
+ )
394
+ )
395
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
396
+ (rotary_emb): Qwen2RotaryEmbedding()
397
+ )
398
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
399
+ )
400
+ (vision_tower): SiglipVisionTower(
401
+ (vision_tower): SiglipVisionModel(
402
+ (vision_model): SiglipVisionTransformer(
403
+ (embeddings): SiglipVisionEmbeddings(
404
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
405
+ (position_embedding): Embedding(1024, 1152)
406
+ )
407
+ (encoder): SiglipEncoder(
408
+ (layers): ModuleList(
409
+ (0-26): 27 x SiglipEncoderLayer(
410
+ (self_attn): SiglipFlashAttention2(
411
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
412
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
413
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
414
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
415
+ )
416
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
417
+ (mlp): SiglipMLP(
418
+ (activation_fn): PytorchGELUTanh()
419
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
420
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
421
+ )
422
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
423
+ )
424
+ )
425
+ )
426
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
427
+ )
428
+ )
429
+ )
430
+ (mm_projector): MultimodalProjector(
431
+ (layers): Sequential(
432
+ (0): DownSample3x3BlockFix()
433
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
434
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
435
+ (3): GELU(approximate='none')
436
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
437
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
438
+ (6): GELU(approximate='none')
439
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
440
+ )
441
+ )
442
+ )
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 09:01:57] Rank 31: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 186.7240424156189s
451
+ [2025-07-01 09:01:57] Rank 3: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 188.2856583595276s
454
+ [2025-07-01 09:01:57] Rank 32: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 188.3059437274933s
457
+ [2025-07-01 09:01:57] Rank 63: Timer for terminate callback has been set.
458
+ Total limit: 240min
459
+ Pre terminate time: 10min elapsed_time: 188.31311774253845s
460
+ [2025-07-01 09:01:57] Rank 18: Timer for terminate callback has been set.
461
+ Total limit: 240min
462
+ Pre terminate time: 10min elapsed_time: 186.8426342010498s
463
+ [2025-07-01 09:01:57] Rank 9: Timer for terminate callback has been set.
464
+ Total limit: 240min
465
+ Pre terminate time: 10min elapsed_time: 188.27460193634033s
466
+ [2025-07-01 09:01:57] Rank 52: Timer for terminate callback has been set.
467
+ Total limit: 240min
468
+ Pre terminate time: 10min elapsed_time: 188.58034896850586s
469
+ length of dataloader: 28 14336
470
+ length of dataloader: 28 14336
471
+ [GPU memory] before trainer 2.292407512664795
472
+ [GPU memory] before trainer 2.292407512664795
473
+ length of dataloader: 28 14336
474
+ [GPU memory] before trainer 2.292407512664795
475
+ length of dataloader: 28 14336
476
+ [GPU memory] before trainer 2.292407512664795
477
+ length of dataloader: 28 14336
478
+ [GPU memory] before trainer 2.292407512664795
479
+ [2025-07-01 09:01:57] Rank 33: Timer for terminate callback has been set.
480
+ Total limit: 240min
481
+ Pre terminate time: 10min elapsed_time: 188.53948974609375s
482
+ length of dataloader: 28 14336
483
+ [GPU memory] before trainer 2.292407512664795
484
+ [2025-07-01 09:01:57] Rank 47: Timer for terminate callback has been set.
485
+ Total limit: 240min
486
+ Pre terminate time: 10min elapsed_time: 187.8767204284668s
487
+ length of dataloader: 28 14336
488
+ [GPU memory] before trainer 2.292407512664795
489
+ length of dataloader: 28 14336
490
+ [GPU memory] before trainer 2.292407512664795
491
+ [2025-07-01 09:01:57] Rank 1: Timer for terminate callback has been set.
492
+ Total limit: 240min
493
+ Pre terminate time: 10min elapsed_time: 188.6350953578949s
494
+ [2025-07-01 09:01:57] Rank 19: Timer for terminate callback has been set.
495
+ Total limit: 240min
496
+ Pre terminate time: 10min elapsed_time: 187.17372345924377s
497
+ [2025-07-01 09:01:57] Rank 46: Timer for terminate callback has been set.
498
+ Total limit: 240min
499
+ Pre terminate time: 10min elapsed_time: 187.946551322937s
500
+ [2025-07-01 09:01:57] Rank 44: Timer for terminate callback has been set.
501
+ Total limit: 240min
502
+ Pre terminate time: 10min elapsed_time: 187.947163105011s
503
+ [2025-07-01 09:01:57] Rank 30: Timer for terminate callback has been set.
504
+ Total limit: 240min
505
+ Pre terminate time: 10min elapsed_time: 187.1679859161377s
506
+ length of dataloader: 28 14336
507
+ [GPU memory] before trainer 2.292407512664795
508
+ [2025-07-01 09:01:57] Rank 61: Timer for terminate callback has been set.
509
+ Total limit: 240min
510
+ Pre terminate time: 10min elapsed_time: 188.73403882980347s
511
+ length of dataloader: 28 14336
512
+ [GPU memory] before trainer 2.292407512664795
513
+ [2025-07-01 09:01:57] Rank 51: Timer for terminate callback has been set.
514
+ Total limit: 240min
515
+ Pre terminate time: 10min elapsed_time: 188.93320965766907s
516
+ [2025-07-01 09:01:57] Rank 58: Timer for terminate callback has been set.
517
+ Total limit: 240min
518
+ Pre terminate time: 10min elapsed_time: 188.77459192276s
519
+ [2025-07-01 09:01:57] Rank 12: Timer for terminate callback has been set.
520
+ Total limit: 240min
521
+ Pre terminate time: 10min elapsed_time: 188.71449184417725s
522
+ length of dataloader: 28 14336
523
+ [GPU memory] before trainer 2.292407512664795
524
+ [2025-07-01 09:01:57] Rank 48: Timer for terminate callback has been set.
525
+ Total limit: 240min
526
+ [2025-07-01 09:01:57] Rank 36: Timer for terminate callback has been set.
527
+ Total limit: 240min
528
+ Pre terminate time: 10min elapsed_time: 188.8108024597168s
529
+ Pre terminate time: 10min elapsed_time: 188.9705455303192s
530
+ [2025-07-01 09:01:57] Rank 39: Timer for terminate callback has been set.
531
+ Total limit: 240min
532
+ Pre terminate time: 10min elapsed_time: 188.81623101234436s
533
+ length of dataloader: 28 14336
534
+ [GPU memory] before trainer 2.292407512664795
535
+ [2025-07-01 09:01:57] Rank 14: Timer for terminate callback has been set.
536
+ Total limit: 240min
537
+ Pre terminate time: 10min elapsed_time: 188.76175379753113s
538
+ [2025-07-01 09:01:57] Rank 60: Timer for terminate callback has been set.
539
+ Total limit: 240min
540
+ Pre terminate time: 10min elapsed_time: 188.82697677612305s
541
+ length of dataloader:length of dataloader: 28 2814336
542
+ 14336
543
+ length of dataloader: 28 14336
544
+ [GPU memory] before trainer [GPU memory] before trainer 2.292407512664795
545
+ 2.292407512664795
546
+ [GPU memory] before trainer 2.292407512664795
547
+ [2025-07-01 09:01:57] Rank 50: Timer for terminate callback has been set.
548
+ Total limit: 240min
549
+ Pre terminate time: 10min elapsed_time: 189.00980305671692s
550
+ [2025-07-01 09:01:57] Rank 29: Timer for terminate callback has been set.
551
+ Total limit: 240min
552
+ Pre terminate time: 10min elapsed_time: 187.3063566684723s
553
+ length of dataloader: 28 14336
554
+ [GPU memory] before trainer 2.292407512664795
555
+ [2025-07-01 09:01:57] Rank 37: Timer for terminate callback has been set.
556
+ Total limit: 240min
557
+ Pre terminate time: 10min elapsed_time: 188.86100125312805s
558
+ length of dataloader: 28 14336
559
+ [2025-07-01 09:01:57] Rank 59: Timer for terminate callback has been set.
560
+ Total limit: 240min
561
+ Pre terminate time: 10min elapsed_time: 188.8586766719818s
562
+ [GPU memory] before trainer 2.292407512664795
563
+ length of dataloader: 28 14336
564
+ [GPU memory] before trainer 2.292407512664795
565
+ [2025-07-01 09:01:58] Rank 62: Timer for terminate callback has been set.
566
+ Total limit: 240min
567
+ Pre terminate time: 10min elapsed_time: 188.87530517578125s
568
+ [2025-07-01 09:01:58] Rank 23: Timer for terminate callback has been set.
569
+ Total limit: 240min
570
+ Pre terminate time: 10min elapsed_time: 187.39407753944397s
571
+ [2025-07-01 09:01:58] Rank 54: Timer for terminate callback has been set.
572
+ Total limit: 240min
573
+ Pre terminate time: 10min elapsed_time: 189.04404830932617s
574
+ [2025-07-01 09:01:58] Rank 57: Timer for terminate callback has been set.
575
+ Total limit: 240min
576
+ Pre terminate time: 10min elapsed_time: 188.88598775863647s
577
+ length of dataloader: 28 14336
578
+ [GPU memory] before trainer 2.292407512664795
579
+ length of dataloader: 28 14336
580
+ [GPU memory] before trainer 2.292407512664795
581
+ length of dataloader: 28 14336
582
+ [GPU memory] before trainer 2.292407512664795
583
+ [2025-07-01 09:01:58] Rank 28: Timer for terminate callback has been set.
584
+ Total limit: 240min
585
+ Pre terminate time: 10min elapsed_time: 187.35117411613464s
586
+ length of dataloader: 28 14336
587
+ [GPU memory] before trainer 2.292407512664795
588
+ length of dataloader: 28 14336
589
+ [GPU memory] before trainer 2.292407512664795
590
+ [2025-07-01 09:01:58] Rank 25: Timer for terminate callback has been set.
591
+ Total limit: 240min
592
+ Pre terminate time: 10min elapsed_time: 187.3726806640625s
593
+ [2025-07-01 09:01:58] Rank 49: Timer for terminate callback has been set.
594
+ Total limit: 240min
595
+ Pre terminate time: 10min elapsed_time: 189.0832874774933s
596
+ [2025-07-01 09:01:58] Rank 7: Timer for terminate callback has been set.
597
+ Total limit: 240min
598
+ Pre terminate time: 10min elapsed_time: 188.92747592926025s
599
+ [2025-07-01 09:01:58] Rank 55: Timer for terminate callback has been set.
600
+ Total limit: 240min
601
+ Pre terminate time: 10min elapsed_time: 189.09379124641418s
602
+ [2025-07-01 09:01:58] Rank 43: Timer for terminate callback has been set.
603
+ Total limit: 240min
604
+ Pre terminate time: 10min elapsed_time: 188.19229888916016s
605
+ length of dataloader: 28 14336
606
+ [GPU memory] before trainer 2.292407512664795
607
+ length of dataloader: 28 14336
608
+ [GPU memory] before trainer 2.292407512664795
609
+ [2025-07-01 09:01:58] Rank 41: Timer for terminate callback has been set.
610
+ Total limit: 240min
611
+ Pre terminate time: 10min elapsed_time: 188.19884490966797s
612
+ [2025-07-01 09:01:58] Rank 40: Timer for terminate callback has been set.
613
+ Total limit: 240min
614
+ Pre terminate time: 10min elapsed_time: 188.20034885406494s
615
+ length of dataloader: 28 14336
616
+ [GPU memory] before trainer 2.292407512664795
617
+ [2025-07-01 09:01:58] Rank 45: Timer for terminate callback has been set.
618
+ Total limit: 240min
619
+ Pre terminate time: 10min elapsed_time: 188.20093441009521s
620
+ [2025-07-01 09:01:58] Rank 24: Timer for terminate callback has been set.
621
+ Total limit: 240min
622
+ Pre terminate time: 10min elapsed_time: 187.39897632598877s
623
+ length of dataloader: 28 14336
624
+ [GPU memory] before trainer 2.292407512664795
625
+ [2025-07-01 09:01:58] Rank 42: Timer for terminate callback has been set.
626
+ Total limit: 240min
627
+ Pre terminate time: 10min elapsed_time: 188.20916652679443s
628
+ [2025-07-01 09:01:58] Rank 17: Timer for terminate callback has been set.
629
+ Total limit: 240min
630
+ Pre terminate time: 10min elapsed_time: 187.46580815315247s
631
+ length of dataloader: 28 14336
632
+ [GPU memory] before trainer 2.292407512664795
633
+ length of dataloader: 28 14336
634
+ [GPU memory] before trainer 2.292407512664795
635
+ length of dataloader: 28 14336
636
+ [GPU memory] before trainer 2.292407512664795
637
+ length of dataloader: 28 14336
638
+ [2025-07-01 09:01:58] Rank 22: Timer for terminate callback has been set.
639
+ Total limit: 240min
640
+ Pre terminate time: 10min elapsed_time: 187.48894619941711s
641
+ [GPU memory] before trainer 2.292407512664795
642
+ [2025-07-01 09:01:58] Rank 34: Timer for terminate callback has been set.
643
+ Total limit: 240min
644
+ Pre terminate time: 10min elapsed_time: 188.97889137268066s
645
+ [2025-07-01 09:01:58] Rank 35: Timer for terminate callback has been set.
646
+ Total limit: 240min
647
+ Pre terminate time: 10min elapsed_time: 188.97930574417114s
648
+ [2025-07-01 09:01:58] Rank 6: Timer for terminate callback has been set.
649
+ Total limit: 240min
650
+ Pre terminate time: 10min elapsed_time: 188.98443937301636s
651
+ length of dataloader: 28 14336
652
+ [GPU memory] before trainer 2.292407512664795
653
+ [2025-07-01 09:01:58] Rank 16: Timer for terminate callback has been set.
654
+ Total limit: 240min
655
+ Pre terminate time: 10min elapsed_time: 187.5020468235016s
656
+ [2025-07-01 09:01:58] Rank 8: Timer for terminate callback has been set.
657
+ Total limit: 240min
658
+ Pre terminate time: 10min elapsed_time: 188.9302875995636s
659
+ [2025-07-01 09:01:58] Rank 10: Timer for terminate callback has been set.
660
+ Total limit: 240min
661
+ Pre terminate time: 10min elapsed_time: 188.93019914627075s
662
+ [2025-07-01 09:01:58] Rank 11: Timer for terminate callback has been set.
663
+ Total limit: 240min
664
+ Pre terminate time: 10min elapsed_time: 188.9377384185791s
665
+ [2025-07-01 09:01:58] Rank 15: Timer for terminate callback has been set.
666
+ Total limit: 240min
667
+ Pre terminate time: 10min elapsed_time: 188.93913388252258s
668
+ [2025-07-01 09:01:58] Rank 5: Timer for terminate callback has been set.
669
+ Total limit: 240min
670
+ Pre terminate time: 10min elapsed_time: 189.00355291366577s
671
+ [2025-07-01 09:01:58] Rank 56: Timer for terminate callback has been set.
672
+ Total limit: 240min
673
+ Pre terminate time: 10min elapsed_time: 189.0038776397705s
674
+ length of dataloader: 28 14336
675
+ [GPU memory] before trainer 2.292407512664795
676
+ length of dataloader: 28 14336
677
+ [GPU memory] before trainer 2.292407512664795
678
+ [2025-07-01 09:01:58] Rank 21: Timer for terminate callback has been set.
679
+ Total limit: 240min
680
+ Pre terminate time: 10min elapsed_time: 187.52710509300232s
681
+ [2025-07-01 09:01:58] Rank 0: Timer for terminate callback has been set.
682
+ Total limit: 240min
683
+ Pre terminate time: 10min elapsed_time: 189.0127944946289s
684
+ length of dataloader: 28 14336
685
+ [GPU memory] before trainer 2.292407512664795
686
+ length of dataloader: 28 14336
687
+ [GPU memory] before trainer 2.292407512664795
688
+ [2025-07-01 09:01:58] Rank 38: Timer for terminate callback has been set.
689
+ Total limit: 240min
690
+ Pre terminate time: 10min elapsed_time: 189.02126288414001s
691
+ [2025-07-01 09:01:58] Rank 20: Timer for terminate callback has been set.
692
+ Total limit: 240min
693
+ Pre terminate time: 10min elapsed_time: 187.5401885509491s
694
+ length of dataloader: 28 14336
695
+ [GPU memory] before trainer 2.292407512664795
696
+ length of dataloader: 28 14336
697
+ [GPU memory] before trainer 2.292407512664795
698
+ length of dataloader: 28 14336
699
+ [GPU memory] before trainer 2.292407512664795
700
+ length of dataloader: 28 14336
701
+ [GPU memory] before trainer 2.292407512664795
702
+ length of dataloader: 28 14336
703
+ [GPU memory] before trainer 2.292407512664795
704
+ length of dataloader: 28 14336
705
+ [GPU memory] before trainer 2.292407512664795
706
+ [2025-07-01 09:01:58] Rank 53: Timer for terminate callback has been set.
707
+ Total limit: 240min
708
+ Pre terminate time: 10min elapsed_time: 189.20625829696655s
709
+ length of dataloader: 28 14336
710
+ [GPU memory] before trainer 2.292407512664795
711
+ [2025-07-01 09:01:58] Rank 4: Timer for terminate callback has been set.
712
+ Total limit: 240min
713
+ Pre terminate time: 10min elapsed_time: 189.05517554283142s
714
+ length of dataloader: 28 14336
715
+ [GPU memory] before trainer 2.292407512664795
716
+ length of dataloader: 28 14336
717
+ [GPU memory] before trainer 2.292407512664795length of dataloader:
718
+ 28 14336
719
+ [GPU memory] before trainer 2.292407512664795
720
+ [2025-07-01 09:01:58] Rank 2: Timer for terminate callback has been set.
721
+ Total limit: 240min
722
+ Pre terminate time: 10min elapsed_time: 189.06267762184143s
723
+ length of dataloader: 28 14336
724
+ [GPU memory] before trainer 2.292407512664795
725
+ length of dataloader: 28 14336
726
+ [GPU memory] before trainer 2.292407512664795
727
+ [2025-07-01 09:01:58] Rank 13: Timer for terminate callback has been set.
728
+ Total limit: 240min
729
+ Pre terminate time: 10min elapsed_time: 189.01914143562317s
730
+ length of dataloader: 28 14336
731
+ [GPU memory] before trainer 2.292407512664795
732
+ length of dataloader: 28 14336
733
+ [GPU memory] before trainer 2.292407512664795
734
+ length of dataloader: 28 14336
735
+ length of dataloader: 28 14336
736
+ [GPU memory] before trainer 2.292407512664795
737
+ [GPU memory] before trainer 2.292407512664795
738
+ length of dataloader: 28 14336
739
+ [GPU memory] before trainer 2.292407512664795
740
+ length of dataloader: 28 14336
741
+ [GPU memory] before trainer 2.292407512664795
742
+ length of dataloader: 28 14336
743
+ [GPU memory] before trainer 2.292407512664795
744
+ [2025-07-01 09:01:58] Rank 26: Timer for terminate callback has been set.
745
+ Total limit: 240min
746
+ Pre terminate time: 10min elapsed_time: 187.55401301383972s
747
+ length of dataloader: 28 14336
748
+ [GPU memory] before trainer 2.292407512664795
749
+ length of dataloader: 28 14336
750
+ [GPU memory] before trainer 2.292407512664795
751
+ length of dataloader: 28 14336
752
+ [GPU memory] before trainer 2.292407512664795
753
+ [2025-07-01 09:01:58] Rank 27: Timer for terminate callback has been set.
754
+ Total limit: 240min
755
+ Pre terminate time: 10min elapsed_time: 187.57627320289612s
756
+ length of dataloader: 28 14336
757
+ [GPU memory] before trainer 2.292407512664795
758
+ length of dataloader: 28 14336
759
+ [GPU memory] before trainer 2.292407512664795
760
+ length of dataloader: 28 14336
761
+ [GPU memory] before trainer 2.292407512664795
762
+ length of dataloader: 28 14336
763
+ [GPU memory] before trainer 2.292407512664795
764
+ length of dataloader: 28 14336
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038286.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038286.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038286
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
7
+ NODE_RANK = 5
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-02124
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038286
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
21
+ NODE_RANK = 6
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-02124
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038286
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
35
+ NODE_RANK = 4
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-02124
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038286
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ SLURM_JOB_ID = 1038286
49
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
50
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
51
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
52
+ NNODES = 8
53
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
54
+ NODE_RANK = 3
55
+ GPUS_PER_NODE = 8
56
+ MASTER_ADDR = pool0-02124
57
+ MASTER_PORT = 25001
58
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
59
+ GRADIENT_ACCUMULATION_STEPS = 4
60
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
61
+ DEFAULT_LEARNING_RATE: 2e-5
62
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
63
+ NODE_RANK = 2
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-02124
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038286
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
77
+ NODE_RANK = 1
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-02124
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038286
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
91
+ NODE_RANK = 7
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-02124
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038286
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-02124 pool0-02152 pool0-02160 pool0-02193 pool0-02639 pool0-02605 pool0-02007 pool0-02015
105
+ NODE_RANK = 0
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-02124
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 09:10:30,122] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 09:10:30,706] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 09:10:30,713] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 09:10:30,744] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 09:10:30,752] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 09:10:30,755] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 09:10:30,762] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 09:10:30,764] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 09:10:30,784] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 09:10:30,825] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 09:10:30,850] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 09:10:30,857] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 09:10:30,884] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 09:10:30,886] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 09:10:30,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 09:10:33,393] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
130
+ [2025-07-01 09:10:33,394] [INFO] [comm.py:594:init_distributed] cdb=None
131
+ [2025-07-01 09:10:33,920] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
132
+ [2025-07-01 09:10:33,920] [INFO] [comm.py:594:init_distributed] cdb=None
133
+ [2025-07-01 09:10:33,962] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
134
+ [2025-07-01 09:10:33,962] [INFO] [comm.py:594:init_distributed] cdb=None
135
+ [2025-07-01 09:10:34,066] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
136
+ [2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
137
+ [2025-07-01 09:10:34,067] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
138
+ [2025-07-01 09:10:34,067] [INFO] [comm.py:594:init_distributed] cdb=None
139
+ [2025-07-01 09:10:34,083] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
140
+ [2025-07-01 09:10:34,083] [INFO] [comm.py:594:init_distributed] cdb=None
141
+ [2025-07-01 09:10:34,164] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
142
+ [2025-07-01 09:10:34,164] [INFO] [comm.py:594:init_distributed] cdb=None
143
+ [2025-07-01 09:10:34,165] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
144
+ [2025-07-01 09:10:34,165] [INFO] [comm.py:594:init_distributed] cdb=None
145
+ [2025-07-01 09:10:34,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
146
+ [2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
147
+ [2025-07-01 09:10:34,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
148
+ [2025-07-01 09:10:34,169] [INFO] [comm.py:594:init_distributed] cdb=None
149
+ [2025-07-01 09:10:34,175] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
150
+ [2025-07-01 09:10:34,175] [INFO] [comm.py:594:init_distributed] cdb=None
151
+ [2025-07-01 09:10:34,205] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
152
+ [2025-07-01 09:10:34,205] [INFO] [comm.py:594:init_distributed] cdb=None
153
+ [2025-07-01 09:10:34,206] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
154
+ [2025-07-01 09:10:34,206] [INFO] [comm.py:594:init_distributed] cdb=None
155
+ [2025-07-01 09:10:34,214] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
156
+ [2025-07-01 09:10:34,214] [INFO] [comm.py:594:init_distributed] cdb=None
157
+ [2025-07-01 09:10:34,217] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
158
+ [2025-07-01 09:10:34,217] [INFO] [comm.py:594:init_distributed] cdb=None
159
+ [2025-07-01 09:10:34,296] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
160
+ [2025-07-01 09:10:34,296] [INFO] [comm.py:594:init_distributed] cdb=None
161
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 09:10:35,443] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 09:10:35,444] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
178
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
179
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
180
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
181
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
182
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
183
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
184
+ [2025-07-01 09:10:35,504] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
185
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
186
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
187
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
188
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
189
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
190
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
191
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
192
+ [2025-07-01 09:10:35,544] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
193
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
194
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
195
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
196
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
197
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
198
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
199
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
200
+ [2025-07-01 09:10:36,054] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
201
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
202
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
203
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
204
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
205
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
206
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
207
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
208
+ [2025-07-01 09:10:37,641] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
209
+ [2025-07-01 09:10:46,125] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
210
+ [2025-07-01 09:10:46,125] [INFO] [comm.py:594:init_distributed] cdb=None
211
+ [2025-07-01 09:10:46,149] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
212
+ [2025-07-01 09:10:46,149] [INFO] [comm.py:594:init_distributed] cdb=None
213
+ [2025-07-01 09:10:46,150] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
214
+ [2025-07-01 09:10:46,150] [INFO] [comm.py:594:init_distributed] cdb=None
215
+ [2025-07-01 09:10:46,151] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
216
+ [2025-07-01 09:10:46,151] [INFO] [comm.py:594:init_distributed] cdb=None
217
+ [2025-07-01 09:10:46,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
218
+ [2025-07-01 09:10:46,152] [INFO] [comm.py:594:init_distributed] cdb=None
219
+ [2025-07-01 09:10:46,160] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
220
+ [2025-07-01 09:10:46,160] [INFO] [comm.py:594:init_distributed] cdb=None
221
+ [2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
222
+ [2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
223
+ [2025-07-01 09:10:46,168] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
224
+ [2025-07-01 09:10:46,168] [INFO] [comm.py:594:init_distributed] cdb=None
225
+ [2025-07-01 09:10:46,370] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
226
+ [2025-07-01 09:10:46,370] [INFO] [comm.py:594:init_distributed] cdb=None
227
+ [2025-07-01 09:10:46,376] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
228
+ [2025-07-01 09:10:46,376] [INFO] [comm.py:594:init_distributed] cdb=None
229
+ [2025-07-01 09:10:46,382] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
230
+ [2025-07-01 09:10:46,382] [INFO] [comm.py:594:init_distributed] cdb=None
231
+ [2025-07-01 09:10:46,386] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
232
+ [2025-07-01 09:10:46,386] [INFO] [comm.py:594:init_distributed] cdb=None
233
+ [2025-07-01 09:10:46,387] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
234
+ [2025-07-01 09:10:46,387] [INFO] [comm.py:594:init_distributed] cdb=None
235
+ [2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
236
+ [2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
237
+ [2025-07-01 09:10:46,394] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
238
+ [2025-07-01 09:10:46,394] [INFO] [comm.py:594:init_distributed] cdb=None
239
+ [2025-07-01 09:10:46,396] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
240
+ [2025-07-01 09:10:46,397] [INFO] [comm.py:594:init_distributed] cdb=None
241
+ [2025-07-01 09:10:46,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
242
+ [2025-07-01 09:10:46,409] [INFO] [comm.py:594:init_distributed] cdb=None
243
+ [2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
244
+ [2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
245
+ [2025-07-01 09:10:46,413] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
246
+ [2025-07-01 09:10:46,413] [INFO] [comm.py:594:init_distributed] cdb=None
247
+ [2025-07-01 09:10:46,416] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
248
+ [2025-07-01 09:10:46,416] [INFO] [comm.py:594:init_distributed] cdb=None
249
+ [2025-07-01 09:10:46,422] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
250
+ [2025-07-01 09:10:46,422] [INFO] [comm.py:594:init_distributed] cdb=None
251
+ [2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
252
+ [2025-07-01 09:10:46,428] [INFO] [comm.py:594:init_distributed] cdb=None
253
+ [2025-07-01 09:10:46,428] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
254
+ [2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
255
+ [2025-07-01 09:10:46,429] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
256
+ [2025-07-01 09:10:46,429] [INFO] [comm.py:594:init_distributed] cdb=None
257
+ [2025-07-01 09:10:46,493] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
258
+ [2025-07-01 09:10:46,493] [INFO] [comm.py:594:init_distributed] cdb=None
259
+ [2025-07-01 09:10:46,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
260
+ [2025-07-01 09:10:46,502] [INFO] [comm.py:594:init_distributed] cdb=None
261
+ [2025-07-01 09:10:46,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
262
+ [2025-07-01 09:10:46,515] [INFO] [comm.py:594:init_distributed] cdb=None
263
+ [2025-07-01 09:10:46,533] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
264
+ [2025-07-01 09:10:46,533] [INFO] [comm.py:594:init_distributed] cdb=None
265
+ [2025-07-01 09:10:46,534] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
266
+ [2025-07-01 09:10:46,534] [INFO] [comm.py:594:init_distributed] cdb=None
267
+ [2025-07-01 09:10:46,551] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
268
+ [2025-07-01 09:10:46,551] [INFO] [comm.py:594:init_distributed] cdb=None
269
+ [2025-07-01 09:10:46,554] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
270
+ [2025-07-01 09:10:46,554] [INFO] [comm.py:594:init_distributed] cdb=None
271
+ [2025-07-01 09:10:46,555] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
272
+ [2025-07-01 09:10:46,555] [INFO] [comm.py:594:init_distributed] cdb=None
273
+ [2025-07-01 09:10:46,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
274
+ [2025-07-01 09:10:46,787] [INFO] [comm.py:594:init_distributed] cdb=None
275
+ [2025-07-01 09:10:46,819] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
276
+ [2025-07-01 09:10:46,820] [INFO] [comm.py:594:init_distributed] cdb=None
277
+ [2025-07-01 09:10:46,821] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
278
+ [2025-07-01 09:10:46,821] [INFO] [comm.py:594:init_distributed] cdb=None
279
+ [2025-07-01 09:10:46,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
280
+ [2025-07-01 09:10:46,828] [INFO] [comm.py:594:init_distributed] cdb=None
281
+ [2025-07-01 09:10:46,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
282
+ [2025-07-01 09:10:46,854] [INFO] [comm.py:594:init_distributed] cdb=None
283
+ [2025-07-01 09:10:46,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
284
+ [2025-07-01 09:10:46,862] [INFO] [comm.py:594:init_distributed] cdb=None
285
+ [2025-07-01 09:10:46,866] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
286
+ [2025-07-01 09:10:46,866] [INFO] [comm.py:594:init_distributed] cdb=None
287
+ [2025-07-01 09:10:46,868] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
288
+ [2025-07-01 09:10:46,869] [INFO] [comm.py:594:init_distributed] cdb=None
289
+ [2025-07-01 09:10:48,303] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
290
+ [2025-07-01 09:10:48,303] [INFO] [comm.py:594:init_distributed] cdb=None
291
+ [2025-07-01 09:10:48,325] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
292
+ [2025-07-01 09:10:48,325] [INFO] [comm.py:594:init_distributed] cdb=None
293
+ [2025-07-01 09:10:48,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
294
+ [2025-07-01 09:10:48,331] [INFO] [comm.py:594:init_distributed] cdb=None
295
+ [2025-07-01 09:10:48,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
296
+ [2025-07-01 09:10:48,468] [INFO] [comm.py:594:init_distributed] cdb=None
297
+ [2025-07-01 09:10:48,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
298
+ [2025-07-01 09:10:48,478] [INFO] [comm.py:594:init_distributed] cdb=None
299
+ [2025-07-01 09:10:48,480] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
300
+ [2025-07-01 09:10:48,480] [INFO] [comm.py:594:init_distributed] cdb=None
301
+ [2025-07-01 09:10:48,480] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
302
+ [2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 09:10:48,511] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 09:10:48,511] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 09:11:04,481] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 09:11:20,374] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 09:11:21,706] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
326
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
327
+
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+
331
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
334
+
335
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+
347
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
348
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
355
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
356
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
357
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
358
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
359
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
360
+
361
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
362
+
363
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
364
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
365
+
366
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
367
+
368
+
369
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
370
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
371
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
372
+ [dist-0-of-64] LlavaLlamaModel(
373
+ (llm): Qwen2ForCausalLM(
374
+ (model): Qwen2Model(
375
+ (embed_tokens): Embedding(151648, 3584)
376
+ (layers): ModuleList(
377
+ (0-27): 28 x Qwen2DecoderLayer(
378
+ (self_attn): Qwen2FlashAttention2(
379
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
380
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
381
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
382
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
383
+ (rotary_emb): Qwen2RotaryEmbedding()
384
+ )
385
+ (mlp): Qwen2MLP(
386
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
387
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
388
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
389
+ (act_fn): SiLU()
390
+ )
391
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
392
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
393
+ )
394
+ )
395
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
396
+ (rotary_emb): Qwen2RotaryEmbedding()
397
+ )
398
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
399
+ )
400
+ (vision_tower): SiglipVisionTower(
401
+ (vision_tower): SiglipVisionModel(
402
+ (vision_model): SiglipVisionTransformer(
403
+ (embeddings): SiglipVisionEmbeddings(
404
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
405
+ (position_embedding): Embedding(1024, 1152)
406
+ )
407
+ (encoder): SiglipEncoder(
408
+ (layers): ModuleList(
409
+ (0-26): 27 x SiglipEncoderLayer(
410
+ (self_attn): SiglipFlashAttention2(
411
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
412
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
413
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
414
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
415
+ )
416
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
417
+ (mlp): SiglipMLP(
418
+ (activation_fn): PytorchGELUTanh()
419
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
420
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
421
+ )
422
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
423
+ )
424
+ )
425
+ )
426
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
427
+ )
428
+ )
429
+ )
430
+ (mm_projector): MultimodalProjector(
431
+ (layers): Sequential(
432
+ (0): DownSample3x3BlockFix()
433
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
434
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
435
+ (3): GELU(approximate='none')
436
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
437
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
438
+ (6): GELU(approximate='none')
439
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
440
+ )
441
+ )
442
+ )
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 09:13:55] Rank 15: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 189.35445713996887s
451
+ [2025-07-01 09:13:55] Rank 55: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 201.59668064117432s
454
+ [2025-07-01 09:13:55] Rank 43: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 189.26995539665222s
457
+ [2025-07-01 09:13:55] Rank 19: Timer for terminate callback has been set.
458
+ Total limit: 240min
459
+ Pre terminate time: 10min elapsed_time: 189.44619512557983s
460
+ length of dataloader: 28 14336
461
+ [GPU memory] before trainer 2.292407512664795
462
+ [2025-07-01 09:13:55] Rank 27: Timer for terminate callback has been set.
463
+ Total limit: 240min
464
+ Pre terminate time: 10min elapsed_time: 189.14272332191467s
465
+ length of dataloader: 28 14336
466
+ [GPU memory] before trainer 2.292407512664795
467
+ length of dataloader: 28 14336
468
+ [GPU memory] before trainer 2.292407512664795
469
+ [2025-07-01 09:13:55] Rank 57: Timer for terminate callback has been set.
470
+ Total limit: 240min
471
+ Pre terminate time: 10min elapsed_time: 201.8264696598053s
472
+ [2025-07-01 09:13:55] Rank 7: Timer for terminate callback has been set.
473
+ Total limit: 240min
474
+ Pre terminate time: 10min elapsed_time: 187.70187664031982s
475
+ length of dataloader: 28 14336
476
+ [GPU memory] before trainer 2.292407512664795
477
+ length of dataloader: 28 14336
478
+ [GPU memory] before trainer 2.292407512664795
479
+ [2025-07-01 09:13:55] Rank 8: Timer for terminate callback has been set.
480
+ Total limit: 240min
481
+ Pre terminate time: 10min elapsed_time: 189.69908165931702s
482
+ [2025-07-01 09:13:55] Rank 21: Timer for terminate callback has been set.
483
+ Total limit: 240min
484
+ Pre terminate time: 10min elapsed_time: 189.7061002254486s
485
+ [2025-07-01 09:13:56] Rank 46: Timer for terminate callback has been set.
486
+ Total limit: 240min
487
+ Pre terminate time: 10min elapsed_time: 189.5839924812317s
488
+ [2025-07-01 09:13:56] Rank 24: Timer for terminate callback has been set.
489
+ Total limit: 240min
490
+ Pre terminate time: 10min elapsed_time: 189.28924942016602s
491
+ [2025-07-01 09:13:56] Rank 35: Timer for terminate callback has been set.
492
+ Total limit: 240min
493
+ Pre terminate time: 10min elapsed_time: 189.95243191719055s
494
+ [2025-07-01 09:13:56] Rank 1: Timer for terminate callback has been set.
495
+ Total limit: 240min
496
+ Pre terminate time: 10min elapsed_time: 187.78522086143494s
497
+ length of dataloader: 28 14336
498
+ [GPU memory] before trainer 2.292407512664795
499
+ length of dataloader: 28 14336
500
+ [GPU memory] before trainer 2.292407512664795
501
+ length of dataloader: 28 14336
502
+ [GPU memory] before trainer 2.292407512664795
503
+ length of dataloader: 28 14336
504
+ [GPU memory] before trainer 2.292407512664795
505
+ length of dataloader: 28 14336
506
+ [GPU memory] before trainer 2.292407512664795
507
+ [2025-07-01 09:13:56] Rank 50: Timer for terminate callback has been set.
508
+ Total limit: 240min
509
+ Pre terminate time: 10min elapsed_time: 202.2034502029419s
510
+ [2025-07-01 09:13:56] Rank 31: Timer for terminate callback has been set.
511
+ Total limit: 240min
512
+ Pre terminate time: 10min elapsed_time: 189.37554931640625s
513
+ [2025-07-01 09:13:56] Rank 56: Timer for terminate callback has been set.
514
+ Total limit: 240min
515
+ Pre terminate time: 10min elapsed_time: 202.09286189079285s
516
+ [2025-07-01 09:13:56] Rank 3: Timer for terminate callback has been set.
517
+ Total limit: 240min
518
+ Pre terminate time: 10min elapsed_time: 187.87985610961914s
519
+ [2025-07-01 09:13:56] Rank 60: Timer for terminate callback has been set.
520
+ Total limit: 240min
521
+ Pre terminate time: 10min elapsed_time: 202.09541821479797s
522
+ [2025-07-01 09:13:56] Rank 9: Timer for terminate callback has been set.
523
+ Total limit: 240min
524
+ Pre terminate time: 10min elapsed_time: 189.82429432868958s
525
+ [2025-07-01 09:13:56] Rank 38: Timer for terminate callback has been set.
526
+ Total limit: 240min
527
+ Pre terminate time: 10min elapsed_time: 190.08812403678894s
528
+ length of dataloader: 28 14336
529
+ length of dataloader: 28 14336
530
+ [GPU memory] before trainer 2.292407512664795
531
+ [GPU memory] before trainer 2.292407512664795
532
+ [2025-07-01 09:13:56] Rank 20: Timer for terminate callback has been set.
533
+ Total limit: 240min
534
+ Pre terminate time: 10min elapsed_time: 189.87151789665222s
535
+ [2025-07-01 09:13:56] Rank 18: Timer for terminate callback has been set.
536
+ Total limit: 240min
537
+ Pre terminate time: 10min elapsed_time: 189.87728786468506s
538
+ length of dataloader: 28 14336
539
+ [GPU memory] before trainer 2.292407512664795
540
+ [2025-07-01 09:13:56] Rank 17: Timer for terminate callback has been set.
541
+ Total limit: 240min
542
+ Pre terminate time: 10min elapsed_time: 189.8816032409668s
543
+ length of dataloader: 28 14336
544
+ [GPU memory] before trainer 2.292407512664795
545
+ [2025-07-01 09:13:56] Rank 5: Timer for terminate callback has been set.
546
+ Total limit: 240min
547
+ Pre terminate time: 10min elapsed_time: 187.94594931602478s
548
+ [2025-07-01 09:13:56] Rank 4: Timer for terminate callback has been set.
549
+ Total limit: 240min
550
+ Pre terminate time: 10min elapsed_time: 187.95590615272522s
551
+ [2025-07-01 09:13:56] Rank 41: Timer for terminate callback has been set.
552
+ Total limit: 240min
553
+ Pre terminate time: 10min elapsed_time: 189.7688856124878s
554
+ length of dataloader: 28 14336
555
+ [GPU memory] before trainer 2.292407512664795
556
+ [2025-07-01 09:13:56] Rank 36: Timer for terminate callback has been set.
557
+ Total limit: 240min
558
+ Pre terminate time: 10min elapsed_time: 190.14091515541077s
559
+ length of dataloader: 28 14336
560
+ [GPU memory] before trainer 2.292407512664795
561
+ length of dataloader: 28 14336
562
+ [GPU memory] before trainer 2.292407512664795
563
+ [2025-07-01 09:13:56] Rank 62: Timer for terminate callback has been set.
564
+ Total limit: 240min
565
+ Pre terminate time: 10min elapsed_time: 202.1776213645935s
566
+ length of dataloader: 28 14336
567
+ [GPU memory] before trainer 2.292407512664795
568
+ [2025-07-01 09:13:56] Rank 22: Timer for terminate callback has been set.
569
+ Total limit: 240min
570
+ Pre terminate time: 10min elapsed_time: 189.92305088043213s
571
+ [2025-07-01 09:13:56] Rank 12: Timer for terminate callback has been set.
572
+ Total limit: 240min
573
+ Pre terminate time: 10min elapsed_time: 189.92479276657104s
574
+ [2025-07-01 09:13:56] Rank 26: Timer for terminate callback has been set.
575
+ Total limit: 240min
576
+ Pre terminate time: 10min elapsed_time: 189.50398349761963s
577
+ [2025-07-01 09:13:56] Rank 30: Timer for terminate callback has been set.
578
+ Total limit: 240min
579
+ Pre terminate time: 10min elapsed_time: 189.50814604759216s
580
+ [2025-07-01 09:13:56] Rank 33: Timer for terminate callback has been set.
581
+ Total limit: 240min
582
+ Pre terminate time: 10min elapsed_time: 190.1808216571808s
583
+ length of dataloader: 28 14336
584
+ length of dataloader: 28 14336
585
+ [GPU memory] before trainer 2.292407512664795
586
+ [GPU memory] before trainer 2.292407512664795
587
+ [2025-07-01 09:13:56] Rank 59: Timer for terminate callback has been set.
588
+ Total limit: 240min
589
+ Pre terminate time: 10min elapsed_time: 202.2262580394745s
590
+ [2025-07-01 09:13:56] Rank 16: Timer for terminate callback has been set.
591
+ Total limit: 240min
592
+ Pre terminate time: 10min elapsed_time: 189.95579552650452s
593
+ length of dataloader: 28 14336
594
+ [GPU memory] before trainer 2.292407512664795
595
+ length of dataloader: 28 14336
596
+ [GPU memory] before trainer 2.292407512664795
597
+ length of dataloader: 28 14336
598
+ [GPU memory] before trainer 2.292407512664795
599
+ length of dataloader: 28 14336
600
+ [GPU memory] before trainer 2.292407512664795
601
+ [2025-07-01 09:13:56] Rank 53: Timer for terminate callback has been set.
602
+ Total limit: 240min
603
+ Pre terminate time: 10min elapsed_time: 202.1831030845642s
604
+ length of dataloader: 28 14336
605
+ [GPU memory] before trainer 2.292407512664795
606
+ length of dataloader: 28 14336
607
+ [GPU memory] before trainer 2.292407512664795
608
+ length of dataloader: 28 14336
609
+ [GPU memory] before trainer 2.292407512664795
610
+ length of dataloader: 28 14336
611
+ [GPU memory] before trainer 2.292407512664795
612
+ [2025-07-01 09:13:56] Rank 13: Timer for terminate callback has been set.
613
+ Total limit: 240min
614
+ Pre terminate time: 10min elapsed_time: 189.99509859085083s
615
+ [2025-07-01 09:13:56] Rank 23: Timer for terminate callback has been set.
616
+ Total limit: 240min
617
+ Pre terminate time: 10min elapsed_time: 189.99686932563782s
618
+ [2025-07-01 09:13:56] Rank 45: Timer for terminate callback has been set.
619
+ Total limit: 240min
620
+ Pre terminate time: 10min elapsed_time: 189.87132096290588s
621
+ [2025-07-01 09:13:56] Rank 6: Timer for terminate callback has been set.
622
+ Total limit: 240min
623
+ Pre terminate time: 10min elapsed_time: 188.06090354919434s
624
+ length of dataloader: 28 14336
625
+ [GPU memory] before trainer 2.292407512664795
626
+ length of dataloader: 28 14336
627
+ [2025-07-01 09:13:56] Rank 61: Timer for terminate callback has been set.
628
+ Total limit: 240min
629
+ Pre terminate time: 10min elapsed_time: 202.2874138355255s
630
+ [GPU memory] before trainer 2.292407512664795
631
+ [2025-07-01 09:13:56] Rank 54: Timer for terminate callback has been set.
632
+ Total limit: 240min
633
+ Pre terminate time: 10min elapsed_time: 202.3776957988739s
634
+ length of dataloader: 28 14336
635
+ [GPU memory] before trainer 2.292407512664795
636
+ [2025-07-01 09:13:56] Rank 14: Timer for terminate callback has been set.
637
+ Total limit: 240min
638
+ Pre terminate time: 10min elapsed_time: 190.0164692401886s
639
+ [2025-07-01 09:13:56] Rank 42: Timer for terminate callback has been set.
640
+ Total limit: 240min
641
+ Pre terminate time: 10min elapsed_time: 189.89093255996704s
642
+ length of dataloader: 28 14336
643
+ [GPU memory] before trainer 2.292407512664795
644
+ [2025-07-01 09:13:56] Rank 47: Timer for terminate callback has been set.
645
+ Total limit: 240min
646
+ Pre terminate time: 10min elapsed_time: 189.8917055130005s
647
+ [2025-07-01 09:13:56] Rank 25: Timer for terminate callback has been set.
648
+ Total limit: 240min
649
+ Pre terminate time: 10min elapsed_time: 189.60104870796204s
650
+ [2025-07-01 09:13:56] Rank 58: Timer for terminate callback has been set.
651
+ Total limit: 240min
652
+ Pre terminate time: 10min elapsed_time: 202.30275464057922s
653
+ length of dataloader: 28 14336
654
+ [GPU memory] before trainer 2.292407512664795
655
+ [2025-07-01 09:13:56] Rank 28: Timer for terminate callback has been set.
656
+ Total limit: 240min
657
+ Pre terminate time: 10min elapsed_time: 189.60884761810303s
658
+ length of dataloader: 28 14336
659
+ [GPU memory] before trainer 2.292407512664795
660
+ length of dataloader: 28 14336
661
+ [GPU memory] before trainer 2.292407512664795
662
+ [2025-07-01 09:13:56] Rank 0: Timer for terminate callback has been set.
663
+ Total limit: 240min
664
+ Pre terminate time: 10min elapsed_time: 188.10366201400757s
665
+ [2025-07-01 09:13:56] Rank 29: Timer for terminate callback has been set.
666
+ Total limit: 240min
667
+ Pre terminate time: 10min elapsed_time: 189.62677335739136s
668
+ [2025-07-01 09:13:56] Rank 49: Timer for terminate callback has been set.
669
+ Total limit: 240min
670
+ Pre terminate time: 10min elapsed_time: 202.26963424682617s
671
+ length of dataloader: 28 14336
672
+ [GPU memory] before trainer 2.292407512664795
673
+ [2025-07-01 09:13:56] Rank 10: Timer for terminate callback has been set.
674
+ Total limit: 240min
675
+ Pre terminate time: 10min elapsed_time: 190.0586109161377s
676
+ [2025-07-01 09:13:56] Rank 37: Timer for terminate callback has been set.
677
+ Total limit: 240min
678
+ Pre terminate time: 10min elapsed_time: 190.2976393699646s
679
+ [2025-07-01 09:13:56] Rank 11: Timer for terminate callback has been set.
680
+ Total limit: 240min
681
+ Pre terminate time: 10min elapsed_time: 190.06773209571838s
682
+ [2025-07-01 09:13:56] Rank 39: Timer for terminate callback has been set.
683
+ Total limit: 240min
684
+ Pre terminate time: 10min elapsed_time: 190.31072449684143s
685
+ [2025-07-01 09:13:56] Rank 34: Timer for terminate callback has been set.
686
+ Total limit: 240min
687
+ Pre terminate time: 10min elapsed_time: 190.3135223388672s
688
+ length of dataloader: 28 14336
689
+ [GPU memory] before trainer 2.292407512664795
690
+ [2025-07-01 09:13:56] Rank 40: Timer for terminate callback has been set.
691
+ Total limit: 240min
692
+ Pre terminate time: 10min elapsed_time: 189.95435571670532s
693
+ length of dataloader: 28 14336
694
+ [GPU memory] before trainer 2.292407512664795
695
+ length of dataloader: 28 14336
696
+ [GPU memory] before trainer 2.292407512664795
697
+ length of dataloader: 28 14336
698
+ [GPU memory] before trainer 2.292407512664795
699
+ length of dataloader: 28 14336
700
+ [GPU memory] before trainer 2.292407512664795
701
+ [2025-07-01 09:13:56] Rank 44: Timer for terminate callback has been set.
702
+ Total limit: 240min
703
+ Pre terminate time: 10min elapsed_time: 189.97284388542175s
704
+ length of dataloader: 28 14336
705
+ [GPU memory] before trainer 2.292407512664795
706
+ length of dataloader: 28 14336
707
+ [GPU memory] before trainer 2.292407512664795
708
+ length of dataloader: 28 14336
709
+ [GPU memory] before trainer 2.292407512664795
710
+ length of dataloader: 28 14336
711
+ [GPU memory] before trainer 2.292407512664795
712
+ length of dataloader: 28 14336
713
+ [GPU memory] before trainer 2.292407512664795
714
+ length of dataloader: 28 14336
715
+ [GPU memory] before trainer 2.292407512664795
716
+ [2025-07-01 09:13:56] Rank 51: Timer for terminate callback has been set.
717
+ Total limit: 240min
718
+ Pre terminate time: 10min elapsed_time: 202.32772946357727s
719
+ [2025-07-01 09:13:56] Rank 2: Timer for terminate callback has been set.
720
+ Total limit: 240min
721
+ Pre terminate time: 10min elapsed_time: 188.1806402206421s
722
+ [2025-07-01 09:13:56] Rank 48: Timer for terminate callback has been set.
723
+ Total limit: 240min
724
+ Pre terminate time: 10min elapsed_time: 202.36938166618347s
725
+ length of dataloader: 28 14336
726
+ [GPU memory] before trainer 2.292407512664795
727
+ length of dataloader: 28 14336
728
+ [GPU memory] before trainer 2.292407512664795
729
+ length of dataloader: 28 14336
730
+ [GPU memory] before trainer 2.292407512664795
731
+ length of dataloader: 28 14336
732
+ [GPU memory] before trainer 2.292407512664795
733
+ [2025-07-01 09:13:56] Rank 63: Timer for terminate callback has been set.
734
+ Total limit: 240min
735
+ Pre terminate time: 10min elapsed_time: 202.3610863685608s
736
+ length of dataloader: 28 14336
737
+ length of dataloader: 28 14336
738
+ [GPU memory] before trainer 2.292407512664795
739
+ [GPU memory] before trainer 2.292407512664795
740
+ length of dataloader: 28 14336
741
+ [GPU memory] before trainer 2.292407512664795
742
+ [2025-07-01 09:13:56] Rank 32: Timer for terminate callback has been set.
743
+ Total limit: 240min
744
+ Pre terminate time: 10min elapsed_time: 190.39307260513306s
745
+ length of dataloader: 28 14336
746
+ [GPU memory] before trainer 2.292407512664795
747
+ length of dataloader: 28 14336
748
+ [GPU memory] before trainer 2.292407512664795
749
+ length of dataloader: 28 14336
750
+ [GPU memory] before trainer 2.292407512664795
751
+ [2025-07-01 09:13:56] Rank 52: Timer for terminate callback has been set.
752
+ Total limit: 240min
753
+ Pre terminate time: 10min elapsed_time: 203.1189968585968s
754
+ length of dataloader: 28 14336
755
+ [GPU memory] before trainer 2.292407512664795
756
+ length of dataloader: 28 14336
757
+ [GPU memory] before trainer 2.292407512664795
758
+ length of dataloader: 28 14336
759
+ [GPU memory] before trainer 2.292407512664795
760
+ length of dataloader: 28 14336
761
+ [GPU memory] before trainer 2.292407512664795
762
+ length of dataloader: 28 14336
763
+ [GPU memory] before trainer 2.292407512664795
764
+ length of dataloader: 28 14336
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038294.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038294.0.out ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038294
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
7
+ NODE_RANK = 7
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01504
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038294
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
21
+ NODE_RANK = 0
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-01504
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038294
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
35
+ NODE_RANK = 6
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-01504
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038294
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
49
+ NODE_RANK = 2
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-01504
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038294
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
63
+ NODE_RANK = 1
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-01504
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038294
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
77
+ NODE_RANK = 5
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01504
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038294
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
91
+ NODE_RANK = 4
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01504
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038294
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
105
+ NODE_RANK = 3
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01504
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 09:15:45,395] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 09:15:45,549] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 09:15:46,175] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 09:15:46,197] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 09:15:46,284] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 09:15:46,324] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 09:15:46,329] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 09:15:46,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 09:15:48,763] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
122
+ [2025-07-01 09:15:48,763] [INFO] [comm.py:594:init_distributed] cdb=None
123
+ [2025-07-01 09:15:48,885] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
124
+ [2025-07-01 09:15:48,885] [INFO] [comm.py:594:init_distributed] cdb=None
125
+ [2025-07-01 09:15:49,514] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
126
+ [2025-07-01 09:15:49,514] [INFO] [comm.py:594:init_distributed] cdb=None
127
+ [2025-07-01 09:15:49,515] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
128
+ [2025-07-01 09:15:49,515] [INFO] [comm.py:594:init_distributed] cdb=None
129
+ [2025-07-01 09:15:49,535] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
130
+ [2025-07-01 09:15:49,535] [INFO] [comm.py:594:init_distributed] cdb=None
131
+ [2025-07-01 09:15:50,031] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
132
+ [2025-07-01 09:15:50,031] [INFO] [comm.py:594:init_distributed] cdb=None
133
+ [2025-07-01 09:15:50,031] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
134
+ [2025-07-01 09:15:50,095] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
135
+ [2025-07-01 09:15:50,095] [INFO] [comm.py:594:init_distributed] cdb=None
136
+ [2025-07-01 09:15:50,099] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
137
+ [2025-07-01 09:15:50,099] [INFO] [comm.py:594:init_distributed] cdb=None
138
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 09:15:50,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 09:15:50,854] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 09:15:50,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 09:15:50,861] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 09:15:50,862] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 09:15:51,005] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 09:15:51,023] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 09:15:51,024] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 09:15:51,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 09:15:51,071] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
178
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
179
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
180
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
181
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
182
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
183
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
184
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
185
+ [2025-07-01 09:15:51,307] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
186
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
187
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
188
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
189
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
190
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
191
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
192
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
193
+ [2025-07-01 09:15:53,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
194
+ [2025-07-01 09:16:01,457] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
195
+ [2025-07-01 09:16:01,457] [INFO] [comm.py:594:init_distributed] cdb=None
196
+ [2025-07-01 09:16:01,466] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
197
+ [2025-07-01 09:16:01,466] [INFO] [comm.py:594:init_distributed] cdb=None
198
+ [2025-07-01 09:16:01,467] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
199
+ [2025-07-01 09:16:01,467] [INFO] [comm.py:594:init_distributed] cdb=None
200
+ [2025-07-01 09:16:01,468] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
201
+ [2025-07-01 09:16:01,468] [INFO] [comm.py:594:init_distributed] cdb=None
202
+ [2025-07-01 09:16:01,471] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
203
+ [2025-07-01 09:16:01,471] [INFO] [comm.py:594:init_distributed] cdb=None
204
+ [2025-07-01 09:16:01,475] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
205
+ [2025-07-01 09:16:01,475] [INFO] [comm.py:594:init_distributed] cdb=None
206
+ [2025-07-01 09:16:01,476] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
207
+ [2025-07-01 09:16:01,476] [INFO] [comm.py:594:init_distributed] cdb=None
208
+ [2025-07-01 09:16:01,482] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
209
+ [2025-07-01 09:16:01,482] [INFO] [comm.py:594:init_distributed] cdb=None
210
+ [2025-07-01 09:16:01,486] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
211
+ [2025-07-01 09:16:01,486] [INFO] [comm.py:594:init_distributed] cdb=None
212
+ [2025-07-01 09:16:01,490] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
213
+ [2025-07-01 09:16:01,490] [INFO] [comm.py:594:init_distributed] cdb=None
214
+ [2025-07-01 09:16:01,501] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
215
+ [2025-07-01 09:16:01,501] [INFO] [comm.py:594:init_distributed] cdb=None
216
+ [2025-07-01 09:16:01,502] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
217
+ [2025-07-01 09:16:01,502] [INFO] [comm.py:594:init_distributed] cdb=None
218
+ [2025-07-01 09:16:01,505] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
219
+ [2025-07-01 09:16:01,505] [INFO] [comm.py:594:init_distributed] cdb=None
220
+ [2025-07-01 09:16:01,510] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
221
+ [2025-07-01 09:16:01,510] [INFO] [comm.py:594:init_distributed] cdb=None
222
+ [2025-07-01 09:16:01,512] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
223
+ [2025-07-01 09:16:01,512] [INFO] [comm.py:594:init_distributed] cdb=None
224
+ [2025-07-01 09:16:01,516] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
225
+ [2025-07-01 09:16:01,516] [INFO] [comm.py:594:init_distributed] cdb=None
226
+ [2025-07-01 09:16:01,523] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
227
+ [2025-07-01 09:16:01,523] [INFO] [comm.py:594:init_distributed] cdb=None
228
+ [2025-07-01 09:16:01,568] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
229
+ [2025-07-01 09:16:01,569] [INFO] [comm.py:594:init_distributed] cdb=None
230
+ [2025-07-01 09:16:01,570] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
231
+ [2025-07-01 09:16:01,570] [INFO] [comm.py:594:init_distributed] cdb=None
232
+ [2025-07-01 09:16:01,577] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
233
+ [2025-07-01 09:16:01,577] [INFO] [comm.py:594:init_distributed] cdb=None
234
+ [2025-07-01 09:16:01,578] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
235
+ [2025-07-01 09:16:01,579] [INFO] [comm.py:594:init_distributed] cdb=None
236
+ [2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
237
+ [2025-07-01 09:16:01,580] [INFO] [comm.py:594:init_distributed] cdb=None
238
+ [2025-07-01 09:16:01,580] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
239
+ [2025-07-01 09:16:01,581] [INFO] [comm.py:594:init_distributed] cdb=None
240
+ [2025-07-01 09:16:01,591] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
241
+ [2025-07-01 09:16:01,591] [INFO] [comm.py:594:init_distributed] cdb=None
242
+ [2025-07-01 09:16:01,739] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
243
+ [2025-07-01 09:16:01,739] [INFO] [comm.py:594:init_distributed] cdb=None
244
+ [2025-07-01 09:16:01,779] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
245
+ [2025-07-01 09:16:01,779] [INFO] [comm.py:594:init_distributed] cdb=None
246
+ [2025-07-01 09:16:01,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
247
+ [2025-07-01 09:16:01,782] [INFO] [comm.py:594:init_distributed] cdb=None
248
+ [2025-07-01 09:16:01,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
249
+ [2025-07-01 09:16:01,787] [INFO] [comm.py:594:init_distributed] cdb=None
250
+ [2025-07-01 09:16:01,791] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
251
+ [2025-07-01 09:16:01,791] [INFO] [comm.py:594:init_distributed] cdb=None
252
+ [2025-07-01 09:16:01,792] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
253
+ [2025-07-01 09:16:01,792] [INFO] [comm.py:594:init_distributed] cdb=None
254
+ [2025-07-01 09:16:01,793] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
255
+ [2025-07-01 09:16:01,794] [INFO] [comm.py:594:init_distributed] cdb=None
256
+ [2025-07-01 09:16:01,796] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
257
+ [2025-07-01 09:16:01,796] [INFO] [comm.py:594:init_distributed] cdb=None
258
+ [2025-07-01 09:16:01,890] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
259
+ [2025-07-01 09:16:01,890] [INFO] [comm.py:594:init_distributed] cdb=None
260
+ [2025-07-01 09:16:01,891] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
261
+ [2025-07-01 09:16:01,891] [INFO] [comm.py:594:init_distributed] cdb=None
262
+ [2025-07-01 09:16:01,896] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
263
+ [2025-07-01 09:16:01,896] [INFO] [comm.py:594:init_distributed] cdb=None
264
+ [2025-07-01 09:16:01,897] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
265
+ [2025-07-01 09:16:01,897] [INFO] [comm.py:594:init_distributed] cdb=None
266
+ [2025-07-01 09:16:01,898] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
267
+ [2025-07-01 09:16:01,898] [INFO] [comm.py:594:init_distributed] cdb=None
268
+ [2025-07-01 09:16:01,901] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
269
+ [2025-07-01 09:16:01,901] [INFO] [comm.py:594:init_distributed] cdb=None
270
+ [2025-07-01 09:16:01,906] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
271
+ [2025-07-01 09:16:01,906] [INFO] [comm.py:594:init_distributed] cdb=None
272
+ [2025-07-01 09:16:01,907] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
273
+ [2025-07-01 09:16:01,907] [INFO] [comm.py:594:init_distributed] cdb=None
274
+ [2025-07-01 09:16:01,929] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
275
+ [2025-07-01 09:16:01,929] [INFO] [comm.py:594:init_distributed] cdb=None
276
+ [2025-07-01 09:16:01,930] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
277
+ [2025-07-01 09:16:01,930] [INFO] [comm.py:594:init_distributed] cdb=None
278
+ [2025-07-01 09:16:01,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
279
+ [2025-07-01 09:16:01,932] [INFO] [comm.py:594:init_distributed] cdb=None
280
+ [2025-07-01 09:16:01,936] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
281
+ [2025-07-01 09:16:01,936] [INFO] [comm.py:594:init_distributed] cdb=None
282
+ [2025-07-01 09:16:01,937] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
283
+ [2025-07-01 09:16:01,937] [INFO] [comm.py:594:init_distributed] cdb=None
284
+ [2025-07-01 09:16:01,939] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
285
+ [2025-07-01 09:16:01,939] [INFO] [comm.py:594:init_distributed] cdb=None
286
+ [2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
287
+ [2025-07-01 09:16:01,946] [INFO] [comm.py:594:init_distributed] cdb=None
288
+ [2025-07-01 09:16:01,946] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
289
+ [2025-07-01 09:16:01,947] [INFO] [comm.py:594:init_distributed] cdb=None
290
+ [2025-07-01 09:16:04,276] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
291
+ [2025-07-01 09:16:04,276] [INFO] [comm.py:594:init_distributed] cdb=None
292
+ [2025-07-01 09:16:04,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
293
+ [2025-07-01 09:16:04,282] [INFO] [comm.py:594:init_distributed] cdb=None
294
+ [2025-07-01 09:16:04,284] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
295
+ [2025-07-01 09:16:04,284] [INFO] [comm.py:594:init_distributed] cdb=None
296
+ [2025-07-01 09:16:04,324] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 09:16:04,324] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 09:16:04,378] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 09:16:04,378] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 09:16:04,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 09:16:04,386] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 09:16:04,392] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 09:16:04,392] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 09:16:04,395] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 09:16:04,395] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 09:16:20,231] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 09:16:29,303] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 09:16:29,934] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ [dist-0-of-64] LlavaLlamaModel(
317
+ (llm): Qwen2ForCausalLM(
318
+ (model): Qwen2Model(
319
+ (embed_tokens): Embedding(151648, 3584)
320
+ (layers): ModuleList(
321
+ (0-27): 28 x Qwen2DecoderLayer(
322
+ (self_attn): Qwen2FlashAttention2(
323
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
324
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
325
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
326
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
327
+ (rotary_emb): Qwen2RotaryEmbedding()
328
+ )
329
+ (mlp): Qwen2MLP(
330
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
331
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
332
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
333
+ (act_fn): SiLU()
334
+ )
335
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
336
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
337
+ )
338
+ )
339
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
340
+ (rotary_emb): Qwen2RotaryEmbedding()
341
+ )
342
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
343
+ )
344
+ (vision_tower): SiglipVisionTower(
345
+ (vision_tower): SiglipVisionModel(
346
+ (vision_model): SiglipVisionTransformer(
347
+ (embeddings): SiglipVisionEmbeddings(
348
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
349
+ (position_embedding): Embedding(1024, 1152)
350
+ )
351
+ (encoder): SiglipEncoder(
352
+ (layers): ModuleList(
353
+ (0-26): 27 x SiglipEncoderLayer(
354
+ (self_attn): SiglipFlashAttention2(
355
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
356
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
357
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
358
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
359
+ )
360
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
361
+ (mlp): SiglipMLP(
362
+ (activation_fn): PytorchGELUTanh()
363
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
364
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
365
+ )
366
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
367
+ )
368
+ )
369
+ )
370
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
371
+ )
372
+ )
373
+ )
374
+ (mm_projector): MultimodalProjector(
375
+ (layers): Sequential(
376
+ (0): DownSample3x3BlockFix()
377
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
378
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
379
+ (3): GELU(approximate='none')
380
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
381
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
382
+ (6): GELU(approximate='none')
383
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
384
+ )
385
+ )
386
+ )
387
+ [dist-0-of-64] Tunable parameters:
388
+ language model True
389
+ [dist-0-of-64] vision tower True
390
+ [dist-0-of-64] mm projector True
391
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
392
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
393
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
394
+
395
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
396
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
397
+
398
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
399
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
400
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
401
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
402
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
403
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
404
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
405
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
406
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
407
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
408
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
409
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
410
+
411
+
412
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
413
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
414
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
415
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
416
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
417
+
418
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
419
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
420
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
421
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
422
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
423
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
424
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
425
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
426
+
427
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
428
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
429
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
430
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
431
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
432
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
433
+
434
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
435
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
436
+
437
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
438
+
439
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
440
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
441
+
442
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
443
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
444
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
445
+
446
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 09:19:03] Rank 31: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 182.53949546813965s
451
+ [2025-07-01 09:19:03] Rank 41: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 182.08894276618958s
454
+ [2025-07-01 09:19:03] Rank 18: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 182.17969298362732s
457
+ [2025-07-01 09:19:03] Rank 58: Timer for terminate callback has been set.
458
+ Total limit: 240min
459
+ Pre terminate time: 10min elapsed_time: 182.5725381374359s
460
+ [2025-07-01 09:19:04] Rank 34: Timer for terminate callback has been set.
461
+ Total limit: 240min
462
+ Pre terminate time: 10min elapsed_time: 182.6094913482666s
463
+ [2025-07-01 09:19:04] Rank 55: Timer for terminate callback has been set.
464
+ Total limit: 240min
465
+ Pre terminate time: 10min elapsed_time: 179.82354164123535s
466
+ [2025-07-01 09:19:04] Rank 36: Timer for terminate callback has been set.
467
+ Total limit: 240min
468
+ Pre terminate time: 10min elapsed_time: 182.67147135734558s
469
+ length of dataloader: 28 14336
470
+ [GPU memory] before trainer 2.292407512664795
471
+ length of dataloader: 28 14336
472
+ [GPU memory] before trainer 2.292407512664795
473
+ [2025-07-01 09:19:04] Rank 20: Timer for terminate callback has been set.
474
+ Total limit: 240min
475
+ Pre terminate time: 10min elapsed_time: 182.38758826255798s
476
+ length of dataloader: 28 14336
477
+ [GPU memory] before trainer 2.292407512664795
478
+ length of dataloader: 28 14336
479
+ [GPU memory] before trainer 2.292407512664795
480
+ length of dataloader: 28 14336
481
+ [GPU memory] before trainer 2.292407512664795
482
+ length of dataloader: 28 14336
483
+ [GPU memory] before trainer 2.292407512664795
484
+ length of dataloader: 28 14336
485
+ [GPU memory] before trainer 2.292407512664795
486
+ [2025-07-01 09:19:04] Rank 47: Timer for terminate callback has been set.
487
+ Total limit: 240min
488
+ Pre terminate time: 10min elapsed_time: 182.3753411769867s
489
+ [2025-07-01 09:19:04] Rank 62: Timer for terminate callback has been set.
490
+ Total limit: 240min
491
+ Pre terminate time: 10min elapsed_time: 182.7982451915741s
492
+ [2025-07-01 09:19:04] Rank 11: Timer for terminate callback has been set.
493
+ Total limit: 240min
494
+ Pre terminate time: 10min elapsed_time: 182.58949255943298s
495
+ [2025-07-01 09:19:04] Rank 51: Timer for terminate callback has been set.
496
+ Total limit: 240min
497
+ Pre terminate time: 10min elapsed_time: 180.0623378753662s
498
+ length of dataloader: 28 14336
499
+ [GPU memory] before trainer 2.292407512664795
500
+ [2025-07-01 09:19:04] Rank 13: Timer for terminate callback has been set.
501
+ Total limit: 240min
502
+ Pre terminate time: 10min elapsed_time: 182.6344199180603s
503
+ length of dataloader: 28 14336
504
+ [GPU memory] before trainer 2.292407512664795
505
+ length of dataloader: 28 14336
506
+ [GPU memory] before trainer 2.292407512664795
507
+ [2025-07-01 09:19:04] Rank 8: Timer for terminate callback has been set.
508
+ Total limit: 240min
509
+ Pre terminate time: 10min elapsed_time: 182.6651632785797s
510
+ [2025-07-01 09:19:04] Rank 27: Timer for terminate callback has been set.
511
+ Total limit: 240min
512
+ Pre terminate time: 10min elapsed_time: 182.92465996742249s
513
+ [2025-07-01 09:19:04] Rank 9: Timer for terminate callback has been set.
514
+ Total limit: 240min
515
+ Pre terminate time: 10min elapsed_time: 182.67580819129944s
516
+ [2025-07-01 09:19:04] Rank 37: Timer for terminate callback has been set.
517
+ Total limit: 240min
518
+ Pre terminate time: 10min elapsed_time: 182.93791437149048s
519
+ length of dataloader: 28 14336
520
+ [GPU memory] before trainer 2.292407512664795
521
+ [2025-07-01 09:19:04] Rank 15: Timer for terminate callback has been set.
522
+ Total limit: 240min
523
+ Pre terminate time: 10min elapsed_time: 182.69433569908142s
524
+ [2025-07-01 09:19:04] Rank 23: Timer for terminate callback has been set.
525
+ Total limit: 240min
526
+ Pre terminate time: 10min elapsed_time: 182.5678791999817s
527
+ [2025-07-01 09:19:04] Rank 48: Timer for terminate callback has been set.
528
+ Total limit: 240min
529
+ Pre terminate time: 10min elapsed_time: 180.17630982398987s
530
+ [2025-07-01 09:19:04] Rank 54: Timer for terminate callback has been set.
531
+ Total limit: 240min
532
+ Pre terminate time: 10min elapsed_time: 180.1825351715088s
533
+ [2025-07-01 09:19:04] Rank 53: Timer for terminate callback has been set.
534
+ Total limit: 240min
535
+ Pre terminate time: 10min elapsed_time: 180.1913890838623s
536
+ [2025-07-01 09:19:04] Rank 21: Timer for terminate callback has been set.
537
+ Total limit: 240min
538
+ Pre terminate time: 10min elapsed_time: 182.60063314437866s
539
+ [2025-07-01 09:19:04] Rank 57: Timer for terminate callback has been set.
540
+ Total limit: 240min
541
+ Pre terminate time: 10min elapsed_time: 182.97364211082458s
542
+ [2025-07-01 09:19:04] Rank 59: Timer for terminate callback has been set.
543
+ Total limit: 240min
544
+ Pre terminate time: 10min elapsed_time: 182.97400450706482s
545
+ length of dataloader: 28 14336
546
+ [GPU memory] before trainer 2.292407512664795
547
+ [2025-07-01 09:19:04] Rank 40: Timer for terminate callback has been set.
548
+ Total limit: 240min
549
+ Pre terminate time: 10min elapsed_time: 182.56714510917664s
550
+ [2025-07-01 09:19:04] Rank 52: Timer for terminate callback has been set.
551
+ Total limit: 240min
552
+ Pre terminate time: 10min elapsed_time: 180.22575664520264s
553
+ [2025-07-01 09:19:04] Rank 29: Timer for terminate callback has been set.
554
+ Total limit: 240min
555
+ Pre terminate time: 10min elapsed_time: 183.02315592765808s
556
+ [2025-07-01 09:19:04] Rank 24: Timer for terminate callback has been set.
557
+ Total limit: 240min
558
+ Pre terminate time: 10min elapsed_time: 183.02467370033264s
559
+ length of dataloader: 28 14336
560
+ [GPU memory] before trainer 2.292407512664795
561
+ [2025-07-01 09:19:04] Rank 12: Timer for terminate callback has been set.
562
+ Total limit: 240min
563
+ Pre terminate time: 10min elapsed_time: 182.78038883209229s
564
+ [2025-07-01 09:19:04] Rank 10: Timer for terminate callback has been set.
565
+ Total limit: 240min
566
+ Pre terminate time: 10min elapsed_time: 182.78413200378418s
567
+ [2025-07-01 09:19:04] Rank 32: Timer for terminate callback has been set.
568
+ Total limit: 240min
569
+ Pre terminate time: 10min elapsed_time: 183.04120206832886s
570
+ [2025-07-01 09:19:04] Rank 25: Timer for terminate callback has been set.
571
+ Total limit: 240min
572
+ Pre terminate time: 10min elapsed_time: 183.0450234413147s
573
+ [2025-07-01 09:19:04] Rank 43: Timer for terminate callback has been set.
574
+ Total limit: 240min
575
+ Pre terminate time: 10min elapsed_time: 182.59447646141052s
576
+ length of dataloader: 28 14336
577
+ [GPU memory] before trainer 2.292407512664795
578
+ [2025-07-01 09:19:04] Rank 44: Timer for terminate callback has been set.
579
+ Total limit: 240min
580
+ Pre terminate time: 10min elapsed_time: 182.60314464569092s
581
+ length of dataloader: 28 14336
582
+ [GPU memory] before trainer 2.292407512664795
583
+ length of dataloader: 28 14336
584
+ [GPU memory] before trainer 2.292407512664795
585
+ [2025-07-01 09:19:04] Rank 56: Timer for terminate callback has been set.
586
+ Total limit: 240min
587
+ Pre terminate time: 10min elapsed_time: 183.03522372245789s
588
+ [2025-07-01 09:19:04] Rank 63: Timer for terminate callback has been set.
589
+ Total limit: 240min
590
+ Pre terminate time: 10min elapsed_time: 183.0367488861084s
591
+ [2025-07-01 09:19:04] Rank 35: Timer for terminate callback has been set.
592
+ Total limit: 240min
593
+ Pre terminate time: 10min elapsed_time: 183.07129096984863s
594
+ length of dataloader: 28 14336
595
+ [GPU memory] before trainer 2.292407512664795
596
+ length of dataloader: 28 14336
597
+ length of dataloader: 28 14336
598
+ length of dataloader: 28 14336
599
+ [GPU memory] before trainer 2.292407512664795[GPU memory] before trainer
600
+ 2.292407512664795
601
+ [GPU memory] before trainer length of dataloader:2.292407512664795
602
+ 28 14336
603
+ [GPU memory] before trainer 2.292407512664795
604
+ length of dataloader: 28 14336
605
+ [GPU memory] before trainer 2.292407512664795
606
+ [2025-07-01 09:19:04] Rank 49: Timer for terminate callback has been set.
607
+ Total limit: 240min
608
+ Pre terminate time: 10min elapsed_time: 180.28140139579773s
609
+ [2025-07-01 09:19:04] Rank 39: Timer for terminate callback has been set.
610
+ Total limit: 240min
611
+ Pre terminate time: 10min elapsed_time: 183.07610249519348s
612
+ length of dataloader: 28 14336
613
+ [GPU memory] before trainer 2.292407512664795
614
+ [2025-07-01 09:19:04] Rank 28: Timer for terminate callback has been set.
615
+ Total limit: 240min
616
+ Pre terminate time: 10min elapsed_time: 183.08507561683655s
617
+ [2025-07-01 09:19:04] Rank 6: Timer for terminate callback has been set.
618
+ Total limit: 240min
619
+ Pre terminate time: 10min elapsed_time: 195.64789366722107s
620
+ length of dataloader: 28 14336
621
+ length of dataloader: 28 14336
622
+ [GPU memory] before trainer 2.292407512664795
623
+ [GPU memory] before trainer 2.292407512664795
624
+ [2025-07-01 09:19:04] Rank 1: Timer for terminate callback has been set.
625
+ Total limit: 240min
626
+ Pre terminate time: 10min elapsed_time: 194.98654437065125s
627
+ [2025-07-01 09:19:04] Rank 46: Timer for terminate callback has been set.
628
+ Total limit: 240min
629
+ Pre terminate time: 10min elapsed_time: 182.64663672447205s
630
+ [2025-07-01 09:19:04] Rank 5: Timer for terminate callback has been set.
631
+ Total limit: 240min
632
+ Pre terminate time: 10min elapsed_time: 195.77389311790466s
633
+ length of dataloader: 28 14336
634
+ [GPU memory] before trainer 2.292407512664795
635
+ [2025-07-01 09:19:04] Rank 2: Timer for terminate callback has been set.
636
+ Total limit: 240min
637
+ Pre terminate time: 10min elapsed_time: 194.8103952407837s
638
+ length of dataloader: 28 14336
639
+ [2025-07-01 09:19:04] Rank 0: Timer for terminate callback has been set.
640
+ Total limit: 240min
641
+ Pre terminate time: 10min elapsed_time: 194.84082126617432s
642
+ [GPU memory] before trainer 2.292407512664795
643
+ [2025-07-01 09:19:04] Rank 33: Timer for terminate callback has been set.
644
+ Total limit: 240min
645
+ Pre terminate time: 10min elapsed_time: 183.11176013946533s
646
+ [2025-07-01 09:19:04] Rank 4: Timer for terminate callback has been set.
647
+ Total limit: 240min
648
+ Pre terminate time: 10min elapsed_time: 194.82239317893982s
649
+ length of dataloader: 28 14336
650
+ [GPU memory] before trainer 2.292407512664795
651
+ length of dataloader: 28 14336
652
+ [GPU memory] before trainer 2.292407512664795
653
+ [2025-07-01 09:19:04] Rank 3: Timer for terminate callback has been set.
654
+ Total limit: 240min
655
+ Pre terminate time: 10min elapsed_time: 195.04775762557983s
656
+ [2025-07-01 09:19:04] Rank 7: Timer for terminate callback has been set.
657
+ Total limit: 240min
658
+ Pre terminate time: 10min elapsed_time: 195.0560109615326s
659
+ length of dataloader: 28 14336
660
+ [GPU memory] before trainer 2.292407512664795
661
+ [2025-07-01 09:19:04] Rank 38: Timer for terminate callback has been set.
662
+ Total limit: 240min
663
+ Pre terminate time: 10min elapsed_time: 183.1278281211853s
664
+ length of dataloader: 28 14336
665
+ [GPU memory] before trainer 2.292407512664795
666
+ length of dataloader: 28 14336
667
+ [GPU memory] before trainer 2.292407512664795
668
+ [2025-07-01 09:19:04] Rank 30: Timer for terminate callback has been set.
669
+ Total limit: 240min
670
+ Pre terminate time: 10min elapsed_time: 183.13366270065308s
671
+ length of dataloader: 28 14336
672
+ [GPU memory] before trainer 2.292407512664795
673
+ length of dataloader: 28 14336
674
+ [GPU memory] before trainer 2.292407512664795
675
+ length of dataloader: 28 14336
676
+ [GPU memory] before trainer 2.292407512664795
677
+ [2025-07-01 09:19:04] Rank 26: Timer for terminate callback has been set.
678
+ Total limit: 240min
679
+ Pre terminate time: 10min elapsed_time: 183.14656853675842s
680
+ length of dataloader: 28 14336
681
+ length of dataloader: 28 14336
682
+ [GPU memory] before trainer 2.292407512664795
683
+ [GPU memory] before trainer 2.292407512664795
684
+ [2025-07-01 09:19:04] Rank 22: Timer for terminate callback has been set.
685
+ Total limit: 240min
686
+ Pre terminate time: 10min elapsed_time: 182.76397037506104s
687
+ length of dataloader: 28 14336
688
+ [GPU memory] before trainer 2.292407512664795
689
+ length of dataloader: 28 14336
690
+ [GPU memory] before trainer 2.292407512664795
691
+ length of dataloader: 28 14336
692
+ [GPU memory] before trainer 2.292407512664795
693
+ [2025-07-01 09:19:04] Rank 19: Timer for terminate callback has been set.
694
+ Total limit: 240min
695
+ Pre terminate time: 10min elapsed_time: 182.77731561660767s
696
+ [2025-07-01 09:19:04] Rank 45: Timer for terminate callback has been set.
697
+ Total limit: 240min
698
+ Pre terminate time: 10min elapsed_time: 182.71926474571228s
699
+ length of dataloader: 28 14336
700
+ [GPU memory] before trainer 2.292407512664795
701
+ [2025-07-01 09:19:04] Rank 17: Timer for terminate callback has been set.
702
+ Total limit: 240min
703
+ Pre terminate time: 10min elapsed_time: 182.7852520942688s
704
+ [2025-07-01 09:19:04] Rank 61: Timer for terminate callback has been set.
705
+ Total limit: 240min
706
+ Pre terminate time: 10min elapsed_time: 183.14857816696167s
707
+ [2025-07-01 09:19:04] Rank 14: Timer for terminate callback has been set.
708
+ Total limit: 240min
709
+ Pre terminate time: 10min elapsed_time: 182.92455291748047s
710
+ length of dataloader: 28 14336
711
+ [GPU memory] before trainer 2.292407512664795
712
+ length of dataloader: 28 14336
713
+ [GPU memory] before trainer 2.292407512664795
714
+ length of dataloader: 28 14336
715
+ [GPU memory] before trainer 2.292407512664795
716
+ length of dataloader: 28 14336
717
+ [GPU memory] before trainer 2.292407512664795
718
+ length of dataloader: 28 14336
719
+ [GPU memory] before trainer 2.292407512664795
720
+ length of dataloader: 28 14336
721
+ [GPU memory] before trainer 2.292407512664795
722
+ length of dataloader: 28 14336
723
+ [GPU memory] before trainer 2.292407512664795
724
+ length of dataloader: 28 14336
725
+ [GPU memory] before trainer 2.292407512664795
726
+ [2025-07-01 09:19:04] Rank 16: Timer for terminate callback has been set.
727
+ Total limit: 240min
728
+ Pre terminate time: 10min elapsed_time: 182.81509160995483s
729
+ length of dataloader: 28 14336
730
+ [GPU memory] before trainer 2.292407512664795
731
+ length of dataloader: 28 14336
732
+ [GPU memory] before trainer 2.292407512664795
733
+ length of dataloader: 28 14336
734
+ [GPU memory] before trainer 2.292407512664795
735
+ length of dataloader: 28 14336
736
+ [GPU memory] before trainer 2.292407512664795
737
+ [2025-07-01 09:19:04] Rank 50: Timer for terminate callback has been set.
738
+ Total limit: 240min
739
+ Pre terminate time: 10min elapsed_time: 180.4306833744049s
740
+ length of dataloader: 28 14336
741
+ [GPU memory] before trainer 2.292407512664795
742
+ [2025-07-01 09:19:04] Rank 42: Timer for terminate callback has been set.
743
+ Total limit: 240min
744
+ Pre terminate time: 10min elapsed_time: 182.78269171714783s
745
+ length of dataloader: 28 14336
746
+ [GPU memory] before trainer 2.292407512664795
747
+ [2025-07-01 09:19:04] Rank 60: Timer for terminate callback has been set.
748
+ Total limit: 240min
749
+ Pre terminate time: 10min elapsed_time: 183.22880291938782s
750
+ length of dataloader: 28 14336
751
+ [GPU memory] before trainer 2.292407512664795
752
+ length of dataloader: 28 14336
753
+ [GPU memory] before trainer 2.292407512664795
754
+ length of dataloader: 28 14336
755
+ [GPU memory] before trainer 2.292407512664795
756
+ length of dataloader: 28 14336
757
+ length of dataloader: 28 14336
758
+ [GPU memory] before trainer 2.292407512664795
759
+ [GPU memory] before trainer 2.292407512664795
760
+ length of dataloader: 28 14336
761
+ [GPU memory] before trainer 2.292407512664795
762
+ length of dataloader: 28 14336
763
+ [GPU memory] before trainer 2.292407512664795
764
+ length of dataloader: 28 14336
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 28 14336
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
slurm/1038301.0.err ADDED
File without changes
slurm/1038301.0.out ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038301
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
7
+ NODE_RANK = 1
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-02107
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038301
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
21
+ NODE_RANK = 5
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-02107
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038301
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
35
+ NODE_RANK = 0
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-02107
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038301
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
49
+ NODE_RANK = 2
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-02107
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038301
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
63
+ NODE_RANK = 7
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-02107
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038301
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
77
+ NODE_RANK = 6
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-02107
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038301
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
91
+ NODE_RANK = 3
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-02107
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038301
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-02107 pool0-02117 pool0-02099 pool0-02196 pool0-02404 pool0-02496 pool0-02566 pool0-02669
105
+ NODE_RANK = 4
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-02107
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 2048
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 8
112
+ DEFAULT_LEARNING_RATE: 2e-5
slurm/1038303.0.err ADDED
The diff for this file is too large to render. See raw diff
 
slurm/1038303.0.out ADDED
@@ -0,0 +1,792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SLURM_JOB_ID = 1038303
2
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
3
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
4
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
5
+ NNODES = 8
6
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
7
+ NODE_RANK = 4
8
+ GPUS_PER_NODE = 8
9
+ MASTER_ADDR = pool0-01504
10
+ MASTER_PORT = 25001
11
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
12
+ GRADIENT_ACCUMULATION_STEPS = 4
13
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
14
+ DEFAULT_LEARNING_RATE: 2e-5
15
+ SLURM_JOB_ID = 1038303
16
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
17
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
18
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
19
+ NNODES = 8
20
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
21
+ NODE_RANK = 5
22
+ GPUS_PER_NODE = 8
23
+ MASTER_ADDR = pool0-01504
24
+ MASTER_PORT = 25001
25
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
26
+ GRADIENT_ACCUMULATION_STEPS = 4
27
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
28
+ DEFAULT_LEARNING_RATE: 2e-5
29
+ SLURM_JOB_ID = 1038303
30
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
31
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
32
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
33
+ NNODES = 8
34
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
35
+ NODE_RANK = 7
36
+ GPUS_PER_NODE = 8
37
+ MASTER_ADDR = pool0-01504
38
+ MASTER_PORT = 25001
39
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
40
+ GRADIENT_ACCUMULATION_STEPS = 4
41
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
42
+ DEFAULT_LEARNING_RATE: 2e-5
43
+ SLURM_JOB_ID = 1038303
44
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
45
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
46
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
47
+ NNODES = 8
48
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
49
+ NODE_RANK = 6
50
+ GPUS_PER_NODE = 8
51
+ MASTER_ADDR = pool0-01504
52
+ MASTER_PORT = 25001
53
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
54
+ GRADIENT_ACCUMULATION_STEPS = 4
55
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
56
+ DEFAULT_LEARNING_RATE: 2e-5
57
+ SLURM_JOB_ID = 1038303
58
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
59
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
60
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
61
+ NNODES = 8
62
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
63
+ NODE_RANK = 1
64
+ GPUS_PER_NODE = 8
65
+ MASTER_ADDR = pool0-01504
66
+ MASTER_PORT = 25001
67
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
68
+ GRADIENT_ACCUMULATION_STEPS = 4
69
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
70
+ DEFAULT_LEARNING_RATE: 2e-5
71
+ SLURM_JOB_ID = 1038303
72
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
73
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
74
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
75
+ NNODES = 8
76
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
77
+ NODE_RANK = 3
78
+ GPUS_PER_NODE = 8
79
+ MASTER_ADDR = pool0-01504
80
+ MASTER_PORT = 25001
81
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
82
+ GRADIENT_ACCUMULATION_STEPS = 4
83
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
84
+ DEFAULT_LEARNING_RATE: 2e-5
85
+ SLURM_JOB_ID = 1038303
86
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
87
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
88
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
89
+ NNODES = 8
90
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
91
+ NODE_RANK = 0
92
+ GPUS_PER_NODE = 8
93
+ MASTER_ADDR = pool0-01504
94
+ MASTER_PORT = 25001
95
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
96
+ GRADIENT_ACCUMULATION_STEPS = 4
97
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
98
+ DEFAULT_LEARNING_RATE: 2e-5
99
+ SLURM_JOB_ID = 1038303
100
+ SLURM_JOB_NAME = nvr_elm_llm:train/NVILA-Lite-8B-quantumn-qa-train
101
+ RUN_NAME = NVILA-Lite-8B-quantumn-qa-train
102
+ OUTPUT_DIR = runs/train/NVILA-Lite-8B-quantumn-qa-train
103
+ NNODES = 8
104
+ NODES = pool0-01504 pool0-01683 pool0-01722 pool0-01867 pool0-01881 pool0-01893 pool0-01919 pool0-01939
105
+ NODE_RANK = 2
106
+ GPUS_PER_NODE = 8
107
+ MASTER_ADDR = pool0-01504
108
+ MASTER_PORT = 25001
109
+ GLOBAL_TRAIN_BATCH_SIZE = 1024
110
+ GRADIENT_ACCUMULATION_STEPS = 4
111
+ PER_DEVICE_TRAIN_BATCH_SIZE = 4
112
+ DEFAULT_LEARNING_RATE: 2e-5
113
+ [2025-07-01 09:21:22,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
114
+ [2025-07-01 09:21:22,972] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
115
+ [2025-07-01 09:21:23,111] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
116
+ [2025-07-01 09:21:23,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
117
+ [2025-07-01 09:21:23,735] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
118
+ [2025-07-01 09:21:23,799] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
119
+ [2025-07-01 09:21:23,799] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
120
+ [2025-07-01 09:21:23,803] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
121
+ [2025-07-01 09:21:23,847] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
122
+ [2025-07-01 09:21:23,855] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
123
+ [2025-07-01 09:21:23,856] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
124
+ [2025-07-01 09:21:23,911] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
125
+ [2025-07-01 09:21:24,002] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
126
+ [2025-07-01 09:21:24,003] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
127
+ [2025-07-01 09:21:24,019] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
128
+ [2025-07-01 09:21:24,021] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
129
+ [2025-07-01 09:21:24,028] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
130
+ [2025-07-01 09:21:24,030] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
131
+ [2025-07-01 09:21:24,113] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
132
+ [2025-07-01 09:21:24,232] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
133
+ [2025-07-01 09:21:24,243] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
134
+ [2025-07-01 09:21:24,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
135
+ [2025-07-01 09:21:24,251] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
136
+ [2025-07-01 09:21:24,253] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
137
+ [2025-07-01 09:21:24,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
138
+ [2025-07-01 09:21:24,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
139
+ [2025-07-01 09:21:24,328] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
140
+ [2025-07-01 09:21:24,330] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
141
+ [2025-07-01 09:21:24,332] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
142
+ [2025-07-01 09:21:24,369] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
143
+ [2025-07-01 09:21:24,369] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
144
+ [2025-07-01 09:21:24,370] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
145
+ [2025-07-01 09:21:24,371] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
146
+ [2025-07-01 09:21:24,372] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
147
+ [2025-07-01 09:21:24,376] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
148
+ [2025-07-01 09:21:24,424] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
149
+ [2025-07-01 09:21:24,429] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
150
+ [2025-07-01 09:21:24,442] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
151
+ [2025-07-01 09:21:24,446] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
152
+ [2025-07-01 09:21:24,452] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
153
+ [2025-07-01 09:21:24,465] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
154
+ [2025-07-01 09:21:24,518] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
155
+ [2025-07-01 09:21:24,542] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
156
+ [2025-07-01 09:21:24,543] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
157
+ [2025-07-01 09:21:24,563] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
158
+ [2025-07-01 09:21:24,564] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
159
+ [2025-07-01 09:21:24,579] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
160
+ [2025-07-01 09:21:24,587] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
161
+ [2025-07-01 09:21:25,700] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
162
+ [2025-07-01 09:21:25,700] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
163
+ [2025-07-01 09:21:25,775] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
164
+ [2025-07-01 09:21:25,790] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
165
+ [2025-07-01 09:21:25,791] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
166
+ [2025-07-01 09:21:25,792] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
167
+ [2025-07-01 09:21:25,864] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
168
+ [2025-07-01 09:21:25,901] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
169
+ [2025-07-01 09:21:25,906] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
170
+ [2025-07-01 09:21:26,124] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
171
+ [2025-07-01 09:21:26,215] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
172
+ [2025-07-01 09:21:26,241] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
173
+ [2025-07-01 09:21:26,247] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
174
+ [2025-07-01 09:21:26,306] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
175
+ [2025-07-01 09:21:26,306] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
176
+ [2025-07-01 09:21:26,313] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
177
+ [2025-07-01 09:21:26,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
178
+ [2025-07-01 09:21:26,339] [INFO] [comm.py:594:init_distributed] cdb=None
179
+ [2025-07-01 09:21:26,459] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
180
+ [2025-07-01 09:21:26,459] [INFO] [comm.py:594:init_distributed] cdb=None
181
+ [2025-07-01 09:21:26,560] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
182
+ [2025-07-01 09:21:26,561] [INFO] [comm.py:594:init_distributed] cdb=None
183
+ [2025-07-01 09:21:26,944] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
184
+ [2025-07-01 09:21:26,944] [INFO] [comm.py:594:init_distributed] cdb=None
185
+ [2025-07-01 09:21:27,281] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
186
+ [2025-07-01 09:21:27,281] [INFO] [comm.py:594:init_distributed] cdb=None
187
+ [2025-07-01 09:21:27,322] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
188
+ [2025-07-01 09:21:27,322] [INFO] [comm.py:594:init_distributed] cdb=None
189
+ [2025-07-01 09:21:27,339] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
190
+ [2025-07-01 09:21:27,339] [INFO] [comm.py:594:init_distributed] cdb=None
191
+ [2025-07-01 09:21:27,459] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
192
+ [2025-07-01 09:21:27,459] [INFO] [comm.py:594:init_distributed] cdb=None
193
+ [2025-07-01 09:21:27,513] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
194
+ [2025-07-01 09:21:27,513] [INFO] [comm.py:594:init_distributed] cdb=None
195
+ [2025-07-01 09:21:27,516] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
196
+ [2025-07-01 09:21:27,516] [INFO] [comm.py:594:init_distributed] cdb=None
197
+ [2025-07-01 09:21:27,549] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
198
+ [2025-07-01 09:21:27,549] [INFO] [comm.py:594:init_distributed] cdb=None
199
+ [2025-07-01 09:21:27,571] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
200
+ [2025-07-01 09:21:27,571] [INFO] [comm.py:594:init_distributed] cdb=None
201
+ [2025-07-01 09:21:27,582] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
202
+ [2025-07-01 09:21:27,583] [INFO] [comm.py:594:init_distributed] cdb=None
203
+ [2025-07-01 09:21:27,583] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
204
+ [2025-07-01 09:21:27,616] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
205
+ [2025-07-01 09:21:27,617] [INFO] [comm.py:594:init_distributed] cdb=None
206
+ [2025-07-01 09:21:27,621] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
207
+ [2025-07-01 09:21:27,621] [INFO] [comm.py:594:init_distributed] cdb=None
208
+ [2025-07-01 09:21:27,623] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
209
+ [2025-07-01 09:21:27,624] [INFO] [comm.py:594:init_distributed] cdb=None
210
+ [2025-07-01 09:21:27,636] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
211
+ [2025-07-01 09:21:27,636] [INFO] [comm.py:594:init_distributed] cdb=None
212
+ [2025-07-01 09:21:27,637] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
213
+ [2025-07-01 09:21:27,637] [INFO] [comm.py:594:init_distributed] cdb=None
214
+ [2025-07-01 09:21:27,643] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
215
+ [2025-07-01 09:21:27,643] [INFO] [comm.py:594:init_distributed] cdb=None
216
+ [2025-07-01 09:21:27,661] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
217
+ [2025-07-01 09:21:27,661] [INFO] [comm.py:594:init_distributed] cdb=None
218
+ [2025-07-01 09:21:27,662] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
219
+ [2025-07-01 09:21:27,662] [INFO] [comm.py:594:init_distributed] cdb=None
220
+ [2025-07-01 09:21:27,679] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
221
+ [2025-07-01 09:21:27,680] [INFO] [comm.py:594:init_distributed] cdb=None
222
+ [2025-07-01 09:21:27,798] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
223
+ [2025-07-01 09:21:27,799] [INFO] [comm.py:594:init_distributed] cdb=None
224
+ [2025-07-01 09:21:27,828] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
225
+ [2025-07-01 09:21:27,828] [INFO] [comm.py:594:init_distributed] cdb=None
226
+ [2025-07-01 09:21:27,854] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
227
+ [2025-07-01 09:21:27,854] [INFO] [comm.py:594:init_distributed] cdb=None
228
+ [2025-07-01 09:21:27,856] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
229
+ [2025-07-01 09:21:27,856] [INFO] [comm.py:594:init_distributed] cdb=None
230
+ [2025-07-01 09:21:27,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
231
+ [2025-07-01 09:21:27,862] [INFO] [comm.py:594:init_distributed] cdb=None
232
+ [2025-07-01 09:21:27,865] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
233
+ [2025-07-01 09:21:27,865] [INFO] [comm.py:594:init_distributed] cdb=None
234
+ [2025-07-01 09:21:27,890] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
235
+ [2025-07-01 09:21:27,890] [INFO] [comm.py:594:init_distributed] cdb=None
236
+ [2025-07-01 09:21:27,892] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
237
+ [2025-07-01 09:21:27,892] [INFO] [comm.py:594:init_distributed] cdb=None
238
+ [2025-07-01 09:21:27,930] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
239
+ [2025-07-01 09:21:27,931] [INFO] [comm.py:594:init_distributed] cdb=None
240
+ [2025-07-01 09:21:27,932] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
241
+ [2025-07-01 09:21:27,932] [INFO] [comm.py:594:init_distributed] cdb=None
242
+ [2025-07-01 09:21:27,975] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
243
+ [2025-07-01 09:21:27,975] [INFO] [comm.py:594:init_distributed] cdb=None
244
+ [2025-07-01 09:21:27,984] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
245
+ [2025-07-01 09:21:27,984] [INFO] [comm.py:594:init_distributed] cdb=None
246
+ [2025-07-01 09:21:27,986] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
247
+ [2025-07-01 09:21:27,986] [INFO] [comm.py:594:init_distributed] cdb=None
248
+ [2025-07-01 09:21:27,988] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
249
+ [2025-07-01 09:21:27,988] [INFO] [comm.py:594:init_distributed] cdb=None
250
+ [2025-07-01 09:21:27,990] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
251
+ [2025-07-01 09:21:27,990] [INFO] [comm.py:594:init_distributed] cdb=None
252
+ [2025-07-01 09:21:28,007] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
253
+ [2025-07-01 09:21:28,007] [INFO] [comm.py:594:init_distributed] cdb=None
254
+ [2025-07-01 09:21:28,034] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
255
+ [2025-07-01 09:21:28,034] [INFO] [comm.py:594:init_distributed] cdb=None
256
+ [2025-07-01 09:21:28,054] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
257
+ [2025-07-01 09:21:28,054] [INFO] [comm.py:594:init_distributed] cdb=None
258
+ [2025-07-01 09:21:28,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
259
+ [2025-07-01 09:21:28,056] [INFO] [comm.py:594:init_distributed] cdb=None
260
+ [2025-07-01 09:21:28,114] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
261
+ [2025-07-01 09:21:28,114] [INFO] [comm.py:594:init_distributed] cdb=None
262
+ [2025-07-01 09:21:28,186] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
263
+ [2025-07-01 09:21:28,186] [INFO] [comm.py:594:init_distributed] cdb=None
264
+ [2025-07-01 09:21:28,277] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
265
+ [2025-07-01 09:21:28,277] [INFO] [comm.py:594:init_distributed] cdb=None
266
+ [2025-07-01 09:21:28,330] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
267
+ [2025-07-01 09:21:28,330] [INFO] [comm.py:594:init_distributed] cdb=None
268
+ [2025-07-01 09:21:28,361] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
269
+ [2025-07-01 09:21:28,361] [INFO] [comm.py:594:init_distributed] cdb=None
270
+ [2025-07-01 09:21:28,409] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
271
+ [2025-07-01 09:21:28,410] [INFO] [comm.py:594:init_distributed] cdb=None
272
+ [2025-07-01 09:21:28,424] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
273
+ [2025-07-01 09:21:28,424] [INFO] [comm.py:594:init_distributed] cdb=None
274
+ [2025-07-01 09:21:29,112] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
275
+ [2025-07-01 09:21:29,112] [INFO] [comm.py:594:init_distributed] cdb=None
276
+ [2025-07-01 09:21:29,152] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
277
+ [2025-07-01 09:21:29,153] [INFO] [comm.py:594:init_distributed] cdb=None
278
+ [2025-07-01 09:21:29,158] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
279
+ [2025-07-01 09:21:29,158] [INFO] [comm.py:594:init_distributed] cdb=None
280
+ [2025-07-01 09:21:29,181] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
281
+ [2025-07-01 09:21:29,181] [INFO] [comm.py:594:init_distributed] cdb=None
282
+ [2025-07-01 09:21:29,378] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
283
+ [2025-07-01 09:21:29,379] [INFO] [comm.py:594:init_distributed] cdb=None
284
+ [2025-07-01 09:21:29,436] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
285
+ [2025-07-01 09:21:29,436] [INFO] [comm.py:594:init_distributed] cdb=None
286
+ [2025-07-01 09:21:29,474] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
287
+ [2025-07-01 09:21:29,474] [INFO] [comm.py:594:init_distributed] cdb=None
288
+ [2025-07-01 09:21:29,528] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
289
+ [2025-07-01 09:21:29,528] [INFO] [comm.py:594:init_distributed] cdb=None
290
+ [2025-07-01 09:21:29,605] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
291
+ [2025-07-01 09:21:29,605] [INFO] [comm.py:594:init_distributed] cdb=None
292
+ [2025-07-01 09:21:29,635] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
293
+ [2025-07-01 09:21:29,635] [INFO] [comm.py:594:init_distributed] cdb=None
294
+ [2025-07-01 09:21:29,709] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
295
+ [2025-07-01 09:21:29,709] [INFO] [comm.py:594:init_distributed] cdb=None
296
+ [2025-07-01 09:21:29,775] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
297
+ [2025-07-01 09:21:29,775] [INFO] [comm.py:594:init_distributed] cdb=None
298
+ [2025-07-01 09:21:29,782] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
299
+ [2025-07-01 09:21:29,782] [INFO] [comm.py:594:init_distributed] cdb=None
300
+ [2025-07-01 09:21:29,784] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
301
+ [2025-07-01 09:21:29,784] [INFO] [comm.py:594:init_distributed] cdb=None
302
+ [2025-07-01 09:21:29,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
303
+ [2025-07-01 09:21:29,786] [INFO] [comm.py:594:init_distributed] cdb=None
304
+ [2025-07-01 09:21:29,786] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
305
+ [2025-07-01 09:21:29,786] [INFO] [comm.py:594:init_distributed] cdb=None
306
+ [2025-07-01 09:21:43,807] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 7.61B parameters
307
+ [2025-07-01 09:21:52,746] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.03B parameters
308
+ [2025-07-01 09:21:53,403] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 8.09B parameters
309
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
310
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
311
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
312
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
313
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
314
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
315
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
316
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
317
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
318
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
319
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
320
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
321
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
322
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
323
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
324
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
325
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
326
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
327
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
328
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
329
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
330
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
331
+
332
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
333
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
334
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
335
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
336
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
337
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
338
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
339
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
340
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
341
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
342
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
343
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
344
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
345
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
346
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
347
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
348
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
349
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
350
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
351
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
352
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
353
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
354
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
355
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
356
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
357
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
358
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
359
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
360
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
361
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
362
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
363
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
364
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
365
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
366
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
367
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
368
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
369
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
370
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
371
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
372
+ [dist-0-of-64] LlavaLlamaModel(
373
+ (llm): Qwen2ForCausalLM(
374
+ (model): Qwen2Model(
375
+ (embed_tokens): Embedding(151648, 3584)
376
+ (layers): ModuleList(
377
+ (0-27): 28 x Qwen2DecoderLayer(
378
+ (self_attn): Qwen2FlashAttention2(
379
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
380
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
381
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
382
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
383
+ (rotary_emb): Qwen2RotaryEmbedding()
384
+ )
385
+ (mlp): Qwen2MLP(
386
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
387
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
388
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
389
+ (act_fn): SiLU()
390
+ )
391
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
392
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
393
+ )
394
+ )
395
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
396
+ (rotary_emb): Qwen2RotaryEmbedding()
397
+ )
398
+ (lm_head): Linear(in_features=3584, out_features=151648, bias=False)
399
+ )
400
+ (vision_tower): SiglipVisionTower(
401
+ (vision_tower): SiglipVisionModel(
402
+ (vision_model): SiglipVisionTransformer(
403
+ (embeddings): SiglipVisionEmbeddings(
404
+ (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
405
+ (position_embedding): Embedding(1024, 1152)
406
+ )
407
+ (encoder): SiglipEncoder(
408
+ (layers): ModuleList(
409
+ (0-26): 27 x SiglipEncoderLayer(
410
+ (self_attn): SiglipFlashAttention2(
411
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
412
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
413
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
414
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
415
+ )
416
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
417
+ (mlp): SiglipMLP(
418
+ (activation_fn): PytorchGELUTanh()
419
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
420
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
421
+ )
422
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
423
+ )
424
+ )
425
+ )
426
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
427
+ )
428
+ )
429
+ )
430
+ (mm_projector): MultimodalProjector(
431
+ (layers): Sequential(
432
+ (0): DownSample3x3BlockFix()
433
+ (1): LayerNorm((10368,), eps=1e-05, elementwise_affine=True)
434
+ (2): Linear(in_features=10368, out_features=3456, bias=True)
435
+ (3): GELU(approximate='none')
436
+ (4): LayerNorm((3456,), eps=1e-05, elementwise_affine=True)
437
+ (5): Linear(in_features=3456, out_features=3584, bias=True)
438
+ (6): GELU(approximate='none')
439
+ (7): Linear(in_features=3584, out_features=3584, bias=True)
440
+ )
441
+ )
442
+ )
443
+ [dist-0-of-64] Tunable parameters:
444
+ language model True
445
+ [dist-0-of-64] vision tower True
446
+ [dist-0-of-64] mm projector True
447
+ trainable params: 8,087,063,152 || all params: 8,087,063,152 || trainable%: 100.0000
448
+ [2025-07-01 09:24:26] Rank 51: Timer for terminate callback has been set.
449
+ Total limit: 240min
450
+ Pre terminate time: 10min elapsed_time: 178.73019170761108s
451
+ [2025-07-01 09:24:26] Rank 15: Timer for terminate callback has been set.
452
+ Total limit: 240min
453
+ Pre terminate time: 10min elapsed_time: 178.88804817199707s
454
+ [2025-07-01 09:24:26] Rank 61: Timer for terminate callback has been set.
455
+ Total limit: 240min
456
+ Pre terminate time: 10min elapsed_time: 178.51190781593323s
457
+ [2025-07-01 09:24:26] Rank 18: Timer for terminate callback has been set.
458
+ Total limit: 240min
459
+ Pre terminate time: 10min elapsed_time: 179.18334817886353s
460
+ length of dataloader: 52 13312
461
+ [GPU memory] before trainer 2.292407512664795
462
+ length of dataloader: 52 13312
463
+ [GPU memory] before trainer 2.292407512664795
464
+ [2025-07-01 09:24:26] Rank 35: Timer for terminate callback has been set.
465
+ Total limit: 240min
466
+ Pre terminate time: 10min elapsed_time: 177.17961239814758s
467
+ [2025-07-01 09:24:26] Rank 30: Timer for terminate callback has been set.
468
+ Total limit: 240min
469
+ Pre terminate time: 10min elapsed_time: 177.50609588623047s
470
+ length of dataloader: 52 13312
471
+ [GPU memory] before trainer 2.292407512664795
472
+ [2025-07-01 09:24:26] Rank 42: Timer for terminate callback has been set.
473
+ Total limit: 240min
474
+ Pre terminate time: 10min elapsed_time: 179.07143235206604s
475
+ length of dataloader: 52 13312
476
+ [GPU memory] before trainer 2.292407512664795
477
+ length of dataloader: 52 13312
478
+ [GPU memory] before trainer 2.292407512664795
479
+ length of dataloader: 52 13312
480
+ [GPU memory] before trainer 2.292407512664795
481
+ length of dataloader: 52 13312
482
+ [GPU memory] before trainer 2.292407512664795
483
+ [2025-07-01 09:24:26] Rank 50: Timer for terminate callback has been set.
484
+ Total limit: 240min
485
+ Pre terminate time: 10min elapsed_time: 178.9429168701172s
486
+ [2025-07-01 09:24:26] Rank 16: Timer for terminate callback has been set.
487
+ Total limit: 240min
488
+ Pre terminate time: 10min elapsed_time: 179.6384243965149s
489
+ [2025-07-01 09:24:26] Rank 0: Timer for terminate callback has been set.
490
+ Total limit: 240min
491
+ Pre terminate time: 10min elapsed_time: 179.61671090126038s
492
+ length of dataloader: 52 13312
493
+ [GPU memory] before trainer 2.292407512664795
494
+ [2025-07-01 09:24:26] Rank 11: Timer for terminate callback has been set.
495
+ Total limit: 240min
496
+ Pre terminate time: 10min elapsed_time: 179.24142980575562s
497
+ [2025-07-01 09:24:26] Rank 3: Timer for terminate callback has been set.
498
+ Total limit: 240min
499
+ Pre terminate time: 10min elapsed_time: 179.68241500854492s
500
+ length of dataloader: 52 13312
501
+ [GPU memory] before trainer 2.292407512664795
502
+ length of dataloader: 52 13312
503
+ [GPU memory] before trainer 2.292407512664795
504
+ length of dataloader: 52 13312
505
+ [GPU memory] before trainer 2.292407512664795
506
+ [2025-07-01 09:24:27] Rank 59: Timer for terminate callback has been set.
507
+ Total limit: 240min
508
+ Pre terminate time: 10min elapsed_time: 178.93752551078796s
509
+ [2025-07-01 09:24:27] Rank 36: Timer for terminate callback has been set.
510
+ Total limit: 240min
511
+ Pre terminate time: 10min elapsed_time: 177.47626900672913s
512
+ length of dataloader: 52 13312
513
+ [GPU memory] before trainer 2.292407512664795
514
+ [2025-07-01 09:24:27] Rank 43: Timer for terminate callback has been set.
515
+ Total limit: 240min
516
+ Pre terminate time: 10min elapsed_time: 179.4129433631897s
517
+ [2025-07-01 09:24:27] Rank 39: Timer for terminate callback has been set.
518
+ Total limit: 240min
519
+ Pre terminate time: 10min elapsed_time: 178.0158293247223s
520
+ [2025-07-01 09:24:27] Rank 38: Timer for terminate callback has been set.
521
+ Total limit: 240min
522
+ Pre terminate time: 10min elapsed_time: 177.80202722549438s
523
+ length of dataloader: 52 13312
524
+ [GPU memory] before trainer 2.292407512664795
525
+ length of dataloader: 52 13312
526
+ [GPU memory] before trainer 2.292407512664795
527
+ [2025-07-01 09:24:27] Rank 10: Timer for terminate callback has been set.
528
+ Total limit: 240min
529
+ Pre terminate time: 10min elapsed_time: 179.43888783454895s
530
+ length of dataloader: 52 13312
531
+ [GPU memory] before trainer 2.292407512664795
532
+ [2025-07-01 09:24:27] Rank 60: Timer for terminate callback has been set.
533
+ Total limit: 240min
534
+ Pre terminate time: 10min elapsed_time: 179.1183044910431s
535
+ [2025-07-01 09:24:27] Rank 49: Timer for terminate callback has been set.
536
+ Total limit: 240min
537
+ Pre terminate time: 10min elapsed_time: 179.2389600276947s
538
+ length of dataloader: 52 13312
539
+ [GPU memory] before trainer 2.292407512664795
540
+ [2025-07-01 09:24:27] Rank 57: Timer for terminate callback has been set.
541
+ Total limit: 240min
542
+ Pre terminate time: 10min elapsed_time: 179.06473851203918s
543
+ [2025-07-01 09:24:27] Rank 63: Timer for terminate callback has been set.
544
+ Total limit: 240min
545
+ Pre terminate time: 10min elapsed_time: 179.11095666885376s
546
+ [2025-07-01 09:24:27] Rank 41: Timer for terminate callback has been set.
547
+ Total limit: 240min
548
+ Pre terminate time: 10min elapsed_time: 179.51858401298523s
549
+ [2025-07-01 09:24:27] Rank 52: Timer for terminate callback has been set.
550
+ Total limit: 240min
551
+ Pre terminate time: 10min elapsed_time: 179.30554270744324s
552
+ length of dataloader: 52 13312
553
+ [GPU memory] before trainer 2.292407512664795
554
+ [2025-07-01 09:24:27] Rank 40: Timer for terminate callback has been set.
555
+ Total limit: 240min
556
+ Pre terminate time: 10min elapsed_time: 179.5245840549469s
557
+ [2025-07-01 09:24:27] Rank 21: Timer for terminate callback has been set.
558
+ Total limit: 240min
559
+ Pre terminate time: 10min elapsed_time: 180.82316184043884s
560
+ [2025-07-01 09:24:27] Rank 37: Timer for terminate callback has been set.
561
+ Total limit: 240min
562
+ Pre terminate time: 10min elapsed_time: 177.645690202713s
563
+ [2025-07-01 09:24:27] Rank 56: Timer for terminate callback has been set.
564
+ Total limit: 240min
565
+ Pre terminate time: 10min elapsed_time: 179.1373429298401s
566
+ [2025-07-01 09:24:27] Rank 7: Timer for terminate callback has been set.
567
+ Total limit: 240min
568
+ Pre terminate time: 10min elapsed_time: 179.96211647987366s
569
+ length of dataloader: 52 13312
570
+ [GPU memory] before trainer 2.292407512664795
571
+ [2025-07-01 09:24:27] Rank 9: Timer for terminate callback has been set.
572
+ Total limit: 240min
573
+ Pre terminate time: 10min elapsed_time: 179.6913924217224s
574
+ [2025-07-01 09:24:27] Rank 32: Timer for terminate callback has been set.
575
+ Total limit: 240min
576
+ Pre terminate time: 10min elapsed_time: 177.7236123085022s
577
+ length of dataloader: 52 13312
578
+ [GPU memory] before trainer 2.292407512664795
579
+ [2025-07-01 09:24:27] Rank 26: Timer for terminate callback has been set.
580
+ Total limit: 240min
581
+ Pre terminate time: 10min elapsed_time: 178.0112874507904s
582
+ length of dataloader: 52 13312
583
+ [GPU memory] before trainer 2.292407512664795
584
+ [2025-07-01 09:24:27] Rank 1: Timer for terminate callback has been set.
585
+ Total limit: 240min
586
+ Pre terminate time: 10min elapsed_time: 180.98196148872375s
587
+ [2025-07-01 09:24:27] Rank 12: Timer for terminate callback has been set.
588
+ Total limit: 240min
589
+ Pre terminate time: 10min elapsed_time: 180.38428473472595s
590
+ length of dataloader: 52 13312
591
+ [GPU memory] before trainer 2.292407512664795
592
+ length of dataloader: 52 13312
593
+ [GPU memory] before trainer 2.292407512664795
594
+ [2025-07-01 09:24:27] Rank 23: Timer for terminate callback has been set.
595
+ Total limit: 240min
596
+ Pre terminate time: 10min elapsed_time: 179.83022689819336s
597
+ [2025-07-01 09:24:27] Rank 62: Timer for terminate callback has been set.
598
+ Total limit: 240min
599
+ Pre terminate time: 10min elapsed_time: 179.28459119796753s
600
+ [2025-07-01 09:24:27] Rank 33: Timer for terminate callback has been set.
601
+ Total limit: 240min
602
+ Pre terminate time: 10min elapsed_time: 177.73978686332703s
603
+ length of dataloader: 52 13312
604
+ [GPU memory] before trainer 2.292407512664795
605
+ [2025-07-01 09:24:27] Rank 53: Timer for terminate callback has been set.
606
+ Total limit: 240min
607
+ Pre terminate time: 10min elapsed_time: 179.41727900505066s
608
+ [2025-07-01 09:24:27] Rank 34: Timer for terminate callback has been set.
609
+ Total limit: 240min
610
+ Pre terminate time: 10min elapsed_time: 177.72356247901917s
611
+ length of dataloader: 52 13312
612
+ [GPU memory] before trainer 2.292407512664795
613
+ length of dataloader: 52 13312
614
+ [GPU memory] before trainer 2.292407512664795
615
+ length of dataloader: 52 13312
616
+ [GPU memory] before trainer 2.292407512664795
617
+ length of dataloader: 52 13312
618
+ [GPU memory] before trainer 2.292407512664795
619
+ length of dataloader: 52 13312
620
+ [GPU memory] before trainer 2.292407512664795
621
+ length of dataloader: 52 13312
622
+ [GPU memory] before trainer 2.292407512664795
623
+ [2025-07-01 09:24:27] Rank 44: Timer for terminate callback has been set.
624
+ Total limit: 240min
625
+ Pre terminate time: 10min elapsed_time: 180.81957077980042s
626
+ [2025-07-01 09:24:27] Rank 13: Timer for terminate callback has been set.
627
+ Total limit: 240min
628
+ Pre terminate time: 10min elapsed_time: 179.6111810207367s
629
+ [2025-07-01 09:24:27] Rank 20: Timer for terminate callback has been set.
630
+ Total limit: 240min
631
+ Pre terminate time: 10min elapsed_time: 179.8662896156311s
632
+ [2025-07-01 09:24:27] Rank 58: Timer for terminate callback has been set.
633
+ Total limit: 240min
634
+ Pre terminate time: 10min elapsed_time: 179.25026488304138s
635
+ [2025-07-01 09:24:27] Rank 25: Timer for terminate callback has been set.
636
+ Total limit: 240min
637
+ Pre terminate time: 10min elapsed_time: 178.25618314743042s
638
+ length of dataloader: 52 13312
639
+ [GPU memory] before trainer 2.292407512664795
640
+ length of dataloader: 52 13312
641
+ [GPU memory] before trainer 2.292407512664795
642
+ [2025-07-01 09:24:27] Rank 4: Timer for terminate callback has been set.
643
+ Total limit: 240min
644
+ Pre terminate time: 10min elapsed_time: 180.23809123039246s
645
+ length of dataloader: 52 13312
646
+ [GPU memory] before trainer 2.292407512664795
647
+ length of dataloader: 52 13312
648
+ [GPU memory] before trainer 2.292407512664795
649
+ [2025-07-01 09:24:27] Rank 27: Timer for terminate callback has been set.
650
+ Total limit: 240min
651
+ Pre terminate time: 10min elapsed_time: 178.11056566238403s
652
+ length of dataloader: 52 13312
653
+ [GPU memory] before trainer 2.292407512664795
654
+ [2025-07-01 09:24:27] Rank 46: Timer for terminate callback has been set.
655
+ Total limit: 240min
656
+ Pre terminate time: 10min elapsed_time: 179.69872188568115s
657
+ length of dataloader: 52 13312
658
+ [GPU memory] before trainer 2.292407512664795
659
+ length of dataloader: 52 13312
660
+ [GPU memory] before trainer 2.292407512664795
661
+ [2025-07-01 09:24:27] Rank 14: Timer for terminate callback has been set.
662
+ Total limit: 240min
663
+ Pre terminate time: 10min elapsed_time: 179.87972497940063s
664
+ [2025-07-01 09:24:27] Rank 8: Timer for terminate callback has been set.
665
+ Total limit: 240min
666
+ Pre terminate time: 10min elapsed_time: 179.7657687664032s
667
+ length of dataloader: 52 13312
668
+ [GPU memory] before trainer 2.292407512664795
669
+ length of dataloader: 52 13312
670
+ [GPU memory] before trainer 2.292407512664795
671
+ [2025-07-01 09:24:27] Rank 5: Timer for terminate callback has been set.
672
+ Total limit: 240min
673
+ Pre terminate time: 10min elapsed_time: 180.11463713645935s
674
+ [2025-07-01 09:24:27] Rank 2: Timer for terminate callback has been set.
675
+ Total limit: 240min
676
+ Pre terminate time: 10min elapsed_time: 180.15511107444763s
677
+ [2025-07-01 09:24:27] Rank 22: Timer for terminate callback has been set.
678
+ Total limit: 240min
679
+ Pre terminate time: 10min elapsed_time: 179.94539523124695s
680
+ length of dataloader: 52 13312
681
+ [GPU memory] before trainer 2.292407512664795
682
+ [2025-07-01 09:24:27] Rank 48: Timer for terminate callback has been set.
683
+ Total limit: 240min
684
+ Pre terminate time: 10min elapsed_time: 179.58666896820068s
685
+ [2025-07-01 09:24:27] Rank 17: Timer for terminate callback has been set.
686
+ Total limit: 240min
687
+ Pre terminate time: 10min elapsed_time: 179.94371223449707s
688
+ [2025-07-01 09:24:27] Rank 54: Timer for terminate callback has been set.
689
+ Total limit: 240min
690
+ Pre terminate time: 10min elapsed_time: 179.51171231269836s
691
+ [2025-07-01 09:24:27] Rank 6: Timer for terminate callback has been set.
692
+ Total limit: 240min
693
+ Pre terminate time: 10min elapsed_time: 180.14709281921387s
694
+ length of dataloader: 52 13312
695
+ [GPU memory] before trainer 2.292407512664795
696
+ length of dataloader: 52 13312
697
+ [GPU memory] before trainer 2.292407512664795
698
+ [2025-07-01 09:24:27] Rank 19: Timer for terminate callback has been set.
699
+ Total limit: 240min
700
+ Pre terminate time: 10min elapsed_time: 179.9675374031067s
701
+ length of dataloader: 52 13312
702
+ [GPU memory] before trainer 2.292407512664795
703
+ length of dataloader: 52 13312
704
+ [GPU memory] before trainer 2.292407512664795
705
+ length of dataloader: 52 13312
706
+ [GPU memory] before trainer 2.292407512664795
707
+ length of dataloader: 52 13312
708
+ [GPU memory] before trainer 2.292407512664795
709
+ length of dataloader: 52 13312
710
+ [GPU memory] before trainer 2.292407512664795
711
+ [2025-07-01 09:24:27] Rank 28: Timer for terminate callback has been set.
712
+ Total limit: 240min
713
+ Pre terminate time: 10min elapsed_time: 178.3642818927765s
714
+ [2025-07-01 09:24:27] Rank 31: Timer for terminate callback has been set.
715
+ Total limit: 240min
716
+ Pre terminate time: 10min elapsed_time: 178.2817325592041s
717
+ length of dataloader: 52 13312
718
+ [GPU memory] before trainer 2.292407512664795
719
+ [2025-07-01 09:24:27] Rank 47: Timer for terminate callback has been set.
720
+ Total limit: 240min
721
+ Pre terminate time: 10min elapsed_time: 179.9938542842865s
722
+ [2025-07-01 09:24:27] Rank 24: Timer for terminate callback has been set.
723
+ Total limit: 240min
724
+ Pre terminate time: 10min elapsed_time: 178.28741884231567s
725
+ [2025-07-01 09:24:27] Rank 29: Timer for terminate callback has been set.
726
+ Total limit: 240min
727
+ Pre terminate time: 10min elapsed_time: 178.41006183624268s
728
+ length of dataloader: 52 13312
729
+ [GPU memory] before trainer 2.292407512664795
730
+ length of dataloader: 52 13312
731
+ [GPU memory] before trainer 2.292407512664795
732
+ [2025-07-01 09:24:27] Rank 55: Timer for terminate callback has been set.
733
+ Total limit: 240min
734
+ Pre terminate time: 10min elapsed_time: 179.64781546592712s
735
+ [2025-07-01 09:24:27] Rank 45: Timer for terminate callback has been set.
736
+ Total limit: 240min
737
+ Pre terminate time: 10min elapsed_time: 179.78166794776917s
738
+ length of dataloader: 52 13312
739
+ [GPU memory] before trainer 2.292407512664795
740
+ length of dataloader: 52 13312
741
+ [GPU memory] before trainer 2.292407512664795
742
+ length of dataloader: 52 13312
743
+ [GPU memory] before trainer 2.292407512664795
744
+ length of dataloader: 52 13312
745
+ [GPU memory] before trainer 2.292407512664795
746
+ length of dataloader: 52 13312
747
+ [GPU memory] before trainer 2.292407512664795
748
+ length of dataloader: 52 13312
749
+ [GPU memory] before trainer 2.292407512664795
750
+ length of dataloader: 52 13312
751
+ [GPU memory] before trainer 2.292407512664795
752
+ length of dataloader: 52 13312
753
+ [GPU memory] before trainer 2.292407512664795
754
+ length of dataloader: 52 13312
755
+ [GPU memory] before trainer 2.292407512664795
756
+ length of dataloader: 52 13312
757
+ [GPU memory] before trainer 2.292407512664795
758
+ length of dataloader: 52 13312
759
+ length of dataloader: 52 13312
760
+ [GPU memory] before trainer 2.292407512664795
761
+ [GPU memory] before trainer 2.292407512664795
762
+ length of dataloader: 52 13312
763
+ [GPU memory] before trainer 2.292407512664795
764
+ length of dataloader: 52 13312
765
+ [GPU memory] before trainer 2.292407512664795
766
+ length of dataloader: 52 13312
767
+ [GPU memory] before trainer 2.292407512664795
768
+ Parameter Offload: Total persistent parameters: 771184 in 421 params
769
+ {'loss': 1.2648, 'grad_norm': 130.75577214745246, 'learning_rate': 2e-05, 'epoch': 0.08}
770
+ {'loss': 1.2283, 'grad_norm': 126.67616596972118, 'learning_rate': 1.9659258262890683e-05, 'epoch': 0.15}
771
+ [2025-07-01 09:25:57,641] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
772
+ {'loss': 0.4188, 'grad_norm': 56.84219933724937, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.23}
773
+ {'loss': 2.4789, 'grad_norm': 89.42666319016989, 'learning_rate': 1.7071067811865477e-05, 'epoch': 0.31}
774
+ {'loss': 0.7853, 'grad_norm': 72.41844398977439, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.38}
775
+ {'loss': 8.2197, 'grad_norm': 799.0148731599335, 'learning_rate': 1.2588190451025209e-05, 'epoch': 0.46}
776
+ {'loss': 0.3008, 'grad_norm': 34.525610403243014, 'learning_rate': 1e-05, 'epoch': 0.54}
777
+ {'loss': 0.3999, 'grad_norm': 64.98250527603693, 'learning_rate': 7.411809548974792e-06, 'epoch': 0.62}
778
+ {'loss': 0.2575, 'grad_norm': 11.46902235575636, 'learning_rate': 5.000000000000003e-06, 'epoch': 0.69}
779
+ [2025-07-01 09:28:31,764] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
780
+ {'loss': 0.3174, 'grad_norm': 42.63293180170212, 'learning_rate': 2.9289321881345257e-06, 'epoch': 0.77}
781
+ {'loss': 0.3054, 'grad_norm': 40.64988981794197, 'learning_rate': 1.339745962155613e-06, 'epoch': 0.85}
782
+ {'loss': 0.2827, 'grad_norm': 27.588182133457394, 'learning_rate': 3.4074173710931804e-07, 'epoch': 0.92}
783
+ {'loss': 0.2751, 'grad_norm': 17.48926557604337, 'learning_rate': 0.0, 'epoch': 1.0}
784
+ saving llm to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/llm
785
+ saving vision_tower to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/vision_tower
786
+ saving mm_projector to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/tmp-checkpoint-13/mm_projector
787
+ {'train_runtime': 323.0635, 'train_samples_per_second': 41.206, 'train_steps_per_second': 0.04, 'train_loss': 1.2718768601234143, 'epoch': 1.0}
788
+ saving llm to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/llm
789
+ saving vision_tower to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/vision_tower
790
+ saving mm_projector to runs/train/NVILA-Lite-8B-quantumn-qa-train/model/mm_projector
791
+ wandb:
792
+ wandb: 🚀 View run NVILA-Lite-8B-quantumn-qa-train at: https://wandb.ai/ligeng-zhu/vila/runs/NVILA-Lite-8B-quantumn-qa-train