diff --git "a/training_log_20250116_222124.txt" "b/training_log_20250116_222124.txt" new file mode 100644--- /dev/null +++ "b/training_log_20250116_222124.txt" @@ -0,0 +1,1168 @@ +[2025-01-16 22:21:31,220] torch.distributed.run: [WARNING] +[2025-01-16 22:21:31,220] torch.distributed.run: [WARNING] ***************************************** +[2025-01-16 22:21:31,220] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +[2025-01-16 22:21:31,220] torch.distributed.run: [WARNING] ***************************************** +The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`. + +0it [00:00, ?it/s] +0it [00:00, ?it/s] +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-16 22:23:58,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +df: df: /root/.triton/autotunedf: /root/.triton/autotune/root/.triton/autotune: 没有那个文件或目录: 没有那个文件或目录: 没有那个文件或目录 + + +df: /root/.triton/autotune: 没有那个文件或目录 +df: /root/.triton/autotune: 没有那个文件或目录 +df: /root/.triton/autotune: 没有那个文件或目录 + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible + [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1 + [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible +[2025-01-16 22:24:14,158] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,158] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,158] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-01-16 22:24:14,158] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,158] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,158] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,159] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,159] [INFO] [comm.py:637:init_distributed] cdb=None +[2025-01-16 22:24:14,159] [INFO] [comm.py:637:init_distributed] cdb=None +01/16/2025 22:24:14 - WARNING - llava.train.train - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +01/16/2025 22:24:14 - INFO - llava.train.train - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +bits=16, +cache_dir=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./scripts/zero3.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=False, +double_quant=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +freeze_mm_mlp_adapter=False, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=2, +gradient_checkpointing=True, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=False, +group_by_modality_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt/runs/Jan16_22-24-14_dlc1abaccnl2nzws-master-0, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lora_alpha=16, +lora_bias=none, +lora_dropout=0.05, +lora_enable=False, +lora_r=64, +lora_weight_path=, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mm_projector_lr=None, +mm_vision_tower_lr=2e-06, +model_max_length=32768, +mp_parameters=, +mpt_attn_impl=triton, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=4, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +quant_type=nf4, +ray_scope=last, +remove_unused_columns=False, +report_to=['wandb'], +resume_from_checkpoint=None, +run_name=llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=10000, +save_strategy=steps, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=True, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.0, +) +01/16/2025 22:24:14 - INFO - llava.train.train - Training/evaluation parameters DataArguments(data_path=None, meta_path='playground/meta_json/llavanext_sample/llava_next_notext_inf37kpolishmd_de35k_know40k_knins40k_creation10kfixed_chart11kmerge_tqa8k_info28k_gpt.json', lazy_preprocess=True, is_multimodal=False, image_folder=None, image_aspect_ratio='anyres', image_grid_pinpoints='[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]', image_crop_resolution=None, image_split_resolution=None, use_data_resampling=False) +[INFO|configuration_utils.py:727] 2025-01-16 22:24:14,176 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json +[INFO|configuration_utils.py:727] 2025-01-16 22:24:14,202 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json +[INFO|configuration_utils.py:792] 2025-01-16 22:24:14,202 >> Model config InternLM2Config { + "_name_or_path": "models/internlm/internlm2_5-7b-chat", + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "eager", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "internlm2", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 2, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.37.2", + "use_cache": true, + "vocab_size": 92544 +} + +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:14,209 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +[INFO|modeling_utils.py:3473] 2025-01-16 22:24:14,216 >> loading weights file models/internlm/internlm2_5-7b-chat/model.safetensors.index.json +[INFO|modeling_utils.py:1426] 2025-01-16 22:24:14,219 >> Instantiating LlavaInternlm2ForCausalLM model under default dtype torch.bfloat16. +[INFO|modeling_utils.py:3582] 2025-01-16 22:24:14,219 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model +[INFO|configuration_utils.py:826] 2025-01-16 22:24:14,224 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 6, device: cuda:6, n_gpu: 1distributed training: True, 16-bits training: False +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 7, device: cuda:7, n_gpu: 1distributed training: True, 16-bits training: False +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,496 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 3, device: cuda:3, n_gpu: 1distributed training: True, 16-bits training: False +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 5, device: cuda:5, n_gpu: 1distributed training: True, 16-bits training: False +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,505 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 4, device: cuda:4, n_gpu: 1distributed training: True, 16-bits training: False +01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,521 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,521 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,521 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,533 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +[WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,536 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:82:82 [0] NCCL INFO cudaDriverVersion 12010 +NCCL version 2.18.6+cuda12.1 +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:84:84 [2] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:89:89 [7] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:87:87 [5] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:88:88 [6] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:85:85 [3] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:83:83 [1] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO cudaDriverVersion 12010 +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO Bootstrap : Using eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO Plugin name set by env to libnccl-net-none.so +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory +dlc1abaccnl2nzws-master-0:86:86 [4] NCCL INFO NET/Plugin : No plugin found, using internal implementation +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO NCCL_IB_HCA set to mlx5 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.30.79<0> +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Using network IB +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO comm 0x9a19ed70 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO comm 0x9a3f5910 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO comm 0x9b487020 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO comm 0x9bc9e470 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO comm 0x9bbbfcd0 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO comm 0x9a729d00 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO comm 0x9b1e3b20 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO comm 0x99f3b9a0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0x13a7e6351c9956c9 - Init START +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,ffffffff,ffffffff +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO NVLS multicast support is not available on dev 2 +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO NVLS multicast support is not available on dev 7 +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,ffffffff,ffffffff +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO NVLS multicast support is not available on dev 1 +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO NVLS multicast support is not available on dev 5 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO NVLS multicast support is not available on dev 4 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,ffffffff,ffffffff +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO NVLS multicast support is not available on dev 0 +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,ffffffff,ffffffff +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO NVLS multicast support is not available on dev 3 +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO NVLS multicast support is not available on dev 6 +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] 4/-1/-1->3->2 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/12/-1->4->-1 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->12 [7] 5/-1/-1->4->3 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] -1/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] -1/-1/-1->5->4 +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/14/-1->6->-1 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->14 +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/-1/-1->7->6 +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/10/-1->2->-1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->10 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] -1/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 01/08 : 0 3 10 15 14 13 12 9 8 11 2 7 6 5 4 1 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 02/08 : 0 7 6 5 12 11 10 9 8 15 14 13 4 3 2 1 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 03/08 : 0 5 4 7 14 11 10 9 8 13 12 15 6 3 2 1 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/08 : 0 7 6 5 4 3 2 1 8 15 14 13 12 11 10 9 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 05/08 : 0 3 10 15 14 13 12 9 8 11 2 7 6 5 4 1 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 06/08 : 0 7 6 5 12 11 10 9 8 15 14 13 4 3 2 1 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 07/08 : 0 5 4 7 14 11 10 9 8 13 12 15 6 3 2 1 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] 1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->8 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO P2P Chunksize set to 131072 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 01/0 : 2[2] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 03/0 : 4[4] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 03/0 : 15[7] -> 6[6] [receive] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 07/0 : 15[7] -> 6[6] [receive] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 00/0 : 1[1] -> 8[0] [send] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 04/0 : 1[1] -> 8[0] [send] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 05/0 : 2[2] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 07/0 : 4[4] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 03/0 : 0[0] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 07/0 : 0[0] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 02/0 : 13[5] -> 4[4] [receive] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 06/0 : 13[5] -> 4[4] [receive] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 01/0 : 3[3] -> 10[2] [send] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 05/0 : 3[3] -> 10[2] [send] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 01/0 : 11[3] -> 2[2] [receive] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/0 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 05/0 : 11[3] -> 2[2] [receive] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/0 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 02/0 : 5[5] -> 12[4] [send] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 01/0 : 4[4] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 06/0 : 5[5] -> 12[4] [send] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 05/0 : 4[4] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 03/0 : 7[7] -> 14[6] [send] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 07/0 : 7[7] -> 14[6] [send] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 03/0 : 6[6] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 07/0 : 6[6] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:374 [3] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:83:370 [1] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:87:367 [5] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:85:374 [3] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:83:370 [1] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:87:367 [5] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:89:368 [7] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:85:374 [3] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:85:374 [3] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:85:374 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:82:372 [0] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:87:367 [5] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:87:367 [5] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:82:372 [0] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8. +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:87:367 [5] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:89:368 [7] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3. +dlc1abaccnl2nzws-master-0:82:372 [0] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:82:372 [0] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:82:372 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:83:370 [1] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:83:370 [1] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:83:370 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:89:368 [7] NCCL INFO NCCL_IB_TC set by environment to 136. +dlc1abaccnl2nzws-master-0:89:368 [7] NCCL INFO NCCL_IB_SL set by environment to 5. +dlc1abaccnl2nzws-master-0:89:368 [7] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22. +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Connected all rings +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/3/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/2/GDRDMA +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/1/GDRDMA +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/IPC/read +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO Connected all trees +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer +dlc1abaccnl2nzws-master-0:89:329 [7] NCCL INFO comm 0x9a729d00 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:83:328 [1] NCCL INFO comm 0x9bc9e470 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:87:330 [5] NCCL INFO comm 0x9b1e3b20 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:85:331 [3] NCCL INFO comm 0x9a3f5910 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:88:332 [6] NCCL INFO comm 0x9bbbfcd0 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:86:333 [4] NCCL INFO comm 0x99f3b9a0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:84:327 [2] NCCL INFO comm 0x9b487020 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0x13a7e6351c9956c9 - Init COMPLETE +dlc1abaccnl2nzws-master-0:82:326 [0] NCCL INFO comm 0x9a19ed70 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0x13a7e6351c9956c9 - Init COMPLETE +[2025-01-16 22:24:18,658] [INFO] [partition_parameters.py:345:__exit__] finished initializing model - num_params = 454, num_elems = 15.48B + +Loading checkpoint shards: 0%| | 0/8 [00:00> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s] +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.05it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,332 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s] +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.05it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,333 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s] +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.06it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,333 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s] +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.06it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,334 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None + +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.05it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,334 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/fs-computility/mllm1/shared/hub/ +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s] +Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.05it/s] +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,334 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation templat/fs-computility/mllm1/shared/hub/tem\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(sys/fs-computility/mllm1/shared/hub/ful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) + +Loading checkpoint shards: 75%|███████▌ | 6/8 [00:07<00:02, 1.28s/it] +Loading checkpoint shards: 88%|████████▊ | 7/8 [00:09<00:01, 1.52s/it]/fs-computility/mllm1/shared/hub/ +Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00, 1.53s/it] +Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00, 1.44s/it] +[INFO|modeling_utils.py:4350] 2025-01-16 22:24:30,239 >> All model checkpoint weights were used when initializing LlavaInternlm2ForCausalLM. + +[WARNING|modeling_utils.py:4352] 2025-01-16 22:24:30,240 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +[INFO|configuration_utils.py:779] 2025-01-16 22:24:30,244 >> loading configuration file models/internlm/internlm2_5-7b-chat/generation_config.json +[INFO|configuration_utils.py:826] 2025-01-16 22:24:30,245 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": [ + 2, + 92542 + ], + "pad_token_id": 2 +} + +Using tokenizer from models/internlm/internlm2_5-7b-chat +using cache dir None +[INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:30,256 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:30,256 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:30,256 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:30,256 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:30,256 >> loading file tokenizer.json +01/16/2025 22:24:30 - INFO - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False) +[INFO|image_processing_utils.py:373] 2025-01-16 22:24:30,406 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/preprocessor_config.json +[INFO|image_processing_utils.py:738] 2025-01-16 22:24:30,406 >> size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'shortest_edge', 'longest_edge'}, {'longest_edge'}), got 336. Converted to {'shortest_edge': 336}. +[INFO|image_processing_utils.py:738] 2025-01-16 22:24:30,406 >> crop_size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'shortest_edge', 'longest_edge'}, {'longest_edge'}), got 336. Converted to {'height': 336, 'width': 336}. +[INFO|image_processing_utils.py:425] 2025-01-16 22:24:30,406 >> Image processor CLIPImageProcessor { + "crop_size": { + "height": 336, + "width": 336 + }, + "do_center_crop": true, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "CLIPImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "shortest_edge": 336 + } +} + +[INFO|configuration_utils.py:727] 2025-01-16 22:24:30,413 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/config.json +[INFO|configuration_utils.py:792] 2025-01-16 22:24:30,413 >> Model config CLIPVisionConfig { + "attention_dropout": 0.0, + "dropout": 0.0, + "hidden_act": "quick_gelu", + "hidden_size": 1024, + "image_size": 336, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "transformers_version": "4.37.2" +} + +[INFO|modeling_utils.py:3473] 2025-01-16 22:24:30,414 >> loading weights file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin +[INFO|modeling_utils.py:3582] 2025-01-16 22:24:30,856 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model +[2025-01-16 22:24:30,996] [INFO] [partition_parameters.py:345:__exit__] finished initializing model - num_params = 845, num_elems = 15.78B +[INFO|modeling_utils.py:4340] 2025-01-16 22:24:31,912 >> Some weights of the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1 were not used when initializing CLIPVisionModel: ['logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.position_ids', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_projection.weight', 'visual_projection.weight'] +- This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +[INFO|modeling_utils.py:4358] 2025-01-16 22:24:31,912 >> All the weights of CLIPVisionModel were initialized from the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1. +If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training. +Rank 0: Using mm_tunable_parts: mm_vision_tower,mm_mlp_adapter,mm_language_model +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:24:51 - INFO - llava.train.train - Add dataset: llava-next-sft-notext with length: 738601, data type: normal, seed: 0 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:24:54 - INFO - llava.train.train - Add dataset: knowledge_gqa9k_art1500_cc3m30k with length: 40813, data type: know, seed: 1 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:24:58 - INFO - llava.train.train - Add dataset: Inferencial_flickr7k_cc3m30k_polished_md with length: 37117, data type: inf_polishmd, seed: 2 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:01 - INFO - llava.train.train - Add dataset: Detail_flickr7k_cc3m28k with length: 35313, data type: detail, seed: 3 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:06 - INFO - llava.train.train - Add dataset: Knowledge_instruct40k with length: 40218, data type: know_ins, seed: 4 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:09 - INFO - llava.train.train - Add dataset: Creation10k_fixed with length: 9698, data type: creation, seed: 5 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:13 - INFO - llava.train.train - Add dataset: Chartqa_generate_11k_gpt_qwen_merge with length: 11160, data type: chart, seed: 6 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:16 - INFO - llava.train.train - Add dataset: Tqa_detail_qwengenerate_multi8k_gpt with length: 8391, data type: tqa, seed: 7 +Rank 0: Formatting inputs...Skip in lazy mode +01/16/2025 22:25:20 - INFO - llava.train.train - Add dataset: Infovqa_single_gpt with length: 23068, data type: info, seed: 8 +Rank 0: Trainable parameters: ['model.image_newline', 'model.tok_embeddings.weight', 'model.layers.0.attention.wqkv.weight', 'model.layers.0.attention.wo.weight', 'model.layers.0.feed_forward.w1.weight', 'model.layers.0.feed_forward.w3.weight', 'model.layers.0.feed_forward.w2.weight', 'model.layers.0.attention_norm.weight', 'model.layers.0.ffn_norm.weight', 'model.layers.1.attention.wqkv.weight', 'model.layers.1.attention.wo.weight', 'model.layers.1.feed_forward.w1.weight', 'model.layers.1.feed_forward.w3.weight', 'model.layers.1.feed_forward.w2.weight', 'model.layers.1.attention_norm.weight', 'model.layers.1.ffn_norm.weight', 'model.layers.2.attention.wqkv.weight', 'model.layers.2.attention.wo.weight', 'model.layers.2.feed_forward.w1.weight', 'model.layers.2.feed_forward.w3.weight', 'model.layers.2.feed_forward.w2.weight', 'model.layers.2.attention_norm.weight', 'model.layers.2.ffn_norm.weight', 'model.layers.3.attention.wqkv.weight', 'model.layers.3.attention.wo.weight', 'model.layers.3.feed_forward.w1.weight', 'model.layers.3.feed_forward.w3.weight', 'model.layers.3.feed_forward.w2.weight', 'model.layers.3.attention_norm.weight', 'model.layers.3.ffn_norm.weight', 'model.layers.4.attention.wqkv.weight', 'model.layers.4.attention.wo.weight', 'model.layers.4.feed_forward.w1.weight', 'model.layers.4.feed_forward.w3.weight', 'model.layers.4.feed_forward.w2.weight', 'model.layers.4.attention_norm.weight', 'model.layers.4.ffn_norm.weight', 'model.layers.5.attention.wqkv.weight', 'model.layers.5.attention.wo.weight', 'model.layers.5.feed_forward.w1.weight', 'model.layers.5.feed_forward.w3.weight', 'model.layers.5.feed_forward.w2.weight', 'model.layers.5.attention_norm.weight', 'model.layers.5.ffn_norm.weight', 'model.layers.6.attention.wqkv.weight', 'model.layers.6.attention.wo.weight', 'model.layers.6.feed_forward.w1.weight', 'model.layers.6.feed_forward.w3.weight', 'model.layers.6.feed_forward.w2.weight', 'model.layers.6.attention_norm.weight', 'model.layers.6.ffn_norm.weight', 'model.layers.7.attention.wqkv.weight', 'model.layers.7.attention.wo.weight', 'model.layers.7.feed_forward.w1.weight', 'model.layers.7.feed_forward.w3.weight', 'model.layers.7.feed_forward.w2.weight', 'model.layers.7.attention_norm.weight', 'model.layers.7.ffn_norm.weight', 'model.layers.8.attention.wqkv.weight', 'model.layers.8.attention.wo.weight', 'model.layers.8.feed_forward.w1.weight', 'model.layers.8.feed_forward.w3.weight', 'model.layers.8.feed_forward.w2.weight', 'model.layers.8.attention_norm.weight', 'model.layers.8.ffn_norm.weight', 'model.layers.9.attention.wqkv.weight', 'model.layers.9.attention.wo.weight', 'model.layers.9.feed_forward.w1.weight', 'model.layers.9.feed_forward.w3.weight', 'model.layers.9.feed_forward.w2.weight', 'model.layers.9.attention_norm.weight', 'model.layers.9.ffn_norm.weight', 'model.layers.10.attention.wqkv.weight', 'model.layers.10.attention.wo.weight', 'model.layers.10.feed_forward.w1.weight', 'model.layers.10.feed_forward.w3.weight', 'model.layers.10.feed_forward.w2.weight', 'model.layers.10.attention_norm.weight', 'model.layers.10.ffn_norm.weight', 'model.layers.11.attention.wqkv.weight', 'model.layers.11.attention.wo.weight', 'model.layers.11.feed_forward.w1.weight', 'model.layers.11.feed_forward.w3.weight', 'model.layers.11.feed_forward.w2.weight', 'model.layers.11.attention_norm.weight', 'model.layers.11.ffn_norm.weight', 'model.layers.12.attention.wqkv.weight', 'model.layers.12.attention.wo.weight', 'model.layers.12.feed_forward.w1.weight', 'model.layers.12.feed_forward.w3.weight', 'model.layers.12.feed_forward.w2.weight', 'model.layers.12.attention_norm.weight', 'model.layers.12.ffn_norm.weight', 'model.layers.13.attention.wqkv.weight', 'model.layers.13.attention.wo.weight', 'model.layers.13.feed_forward.w1.weight', 'model.layers.13.feed_forward.w3.weight', 'model.layers.13.feed_forward.w2.weight', 'model.layers.13.attention_norm.weight', 'model.layers.13.ffn_norm.weight', 'model.layers.14.attention.wqkv.weight', 'model.layers.14.attention.wo.weight', 'model.layers.14.feed_forward.w1.weight', 'model.layers.14.feed_forward.w3.weight', 'model.layers.14.feed_forward.w2.weight', 'model.layers.14.attention_norm.weight', 'model.layers.14.ffn_norm.weight', 'model.layers.15.attention.wqkv.weight', 'model.layers.15.attention.wo.weight', 'model.layers.15.feed_forward.w1.weight', 'model.layers.15.feed_forward.w3.weight', 'model.layers.15.feed_forward.w2.weight', 'model.layers.15.attention_norm.weight', 'model.layers.15.ffn_norm.weight', 'model.layers.16.attention.wqkv.weight', 'model.layers.16.attention.wo.weight', 'model.layers.16.feed_forward.w1.weight', 'model.layers.16.feed_forward.w3.weight', 'model.layers.16.feed_forward.w2.weight', 'model.layers.16.attention_norm.weight', 'model.layers.16.ffn_norm.weight', 'model.layers.17.attention.wqkv.weight', 'model.layers.17.attention.wo.weight', 'model.layers.17.feed_forward.w1.weight', 'model.layers.17.feed_forward.w3.weight', 'model.layers.17.feed_forward.w2.weight', 'model.layers.17.attention_norm.weight', 'model.layers.17.ffn_norm.weight', 'model.layers.18.attention.wqkv.weight', 'model.layers.18.attention.wo.weight', 'model.layers.18.feed_forward.w1.weight', 'model.layers.18.feed_forward.w3.weight', 'model.layers.18.feed_forward.w2.weight', 'model.layers.18.attention_norm.weight', 'model.layers.18.ffn_norm.weight', 'model.layers.19.attention.wqkv.weight', 'model.layers.19.attention.wo.weight', 'model.layers.19.feed_forward.w1.weight', 'model.layers.19.feed_forward.w3.weight', 'model.layers.19.feed_forward.w2.weight', 'model.layers.19.attention_norm.weight', 'model.layers.19.ffn_norm.weight', 'model.layers.20.attention.wqkv.weight', 'model.layers.20.attention.wo.weight', 'model.layers.20.feed_forward.w1.weight', 'model.layers.20.feed_forward.w3.weight', 'model.layers.20.feed_forward.w2.weight', 'model.layers.20.attention_norm.weight', 'model.layers.20.ffn_norm.weight', 'model.layers.21.attention.wqkv.weight', 'model.layers.21.attention.wo.weight', 'model.layers.21.feed_forward.w1.weight', 'model.layers.21.feed_forward.w3.weight', 'model.layers.21.feed_forward.w2.weight', 'model.layers.21.attention_norm.weight', 'model.layers.21.ffn_norm.weight', 'model.layers.22.attention.wqkv.weight', 'model.layers.22.attention.wo.weight', 'model.layers.22.feed_forward.w1.weight', 'model.layers.22.feed_forward.w3.weight', 'model.layers.22.feed_forward.w2.weight', 'model.layers.22.attention_norm.weight', 'model.layers.22.ffn_norm.weight', 'model.layers.23.attention.wqkv.weight', 'model.layers.23.attention.wo.weight', 'model.layers.23.feed_forward.w1.weight', 'model.layers.23.feed_forward.w3.weight', 'model.layers.23.feed_forward.w2.weight', 'model.layers.23.attention_norm.weight', 'model.layers.23.ffn_norm.weight', 'model.layers.24.attention.wqkv.weight', 'model.layers.24.attention.wo.weight', 'model.layers.24.feed_forward.w1.weight', 'model.layers.24.feed_forward.w3.weight', 'model.layers.24.feed_forward.w2.weight', 'model.layers.24.attention_norm.weight', 'model.layers.24.ffn_norm.weight', 'model.layers.25.attention.wqkv.weight', 'model.layers.25.attention.wo.weight', 'model.layers.25.feed_forward.w1.weight', 'model.layers.25.feed_forward.w3.weight', 'model.layers.25.feed_forward.w2.weight', 'model.layers.25.attention_norm.weight', 'model.layers.25.ffn_norm.weight', 'model.layers.26.attention.wqkv.weight', 'model.layers.26.attention.wo.weight', 'model.layers.26.feed_forward.w1.weight', 'model.layers.26.feed_forward.w3.weight', 'model.layers.26.feed_forward.w2.weight', 'model.layers.26.attention_norm.weight', 'model.layers.26.ffn_norm.weight', 'model.layers.27.attention.wqkv.weight', 'model.layers.27.attention.wo.weight', 'model.layers.27.feed_forward.w1.weight', 'model.layers.27.feed_forward.w3.weight', 'model.layers.27.feed_forward.w2.weight', 'model.layers.27.attention_norm.weight', 'model.layers.27.ffn_norm.weight', 'model.layers.28.attention.wqkv.weight', 'model.layers.28.attention.wo.weight', 'model.layers.28.feed_forward.w1.weight', 'model.layers.28.feed_forward.w3.weight', 'model.layers.28.feed_forward.w2.weight', 'model.layers.28.attention_norm.weight', 'model.layers.28.ffn_norm.weight', 'model.layers.29.attention.wqkv.weight', 'model.layers.29.attention.wo.weight', 'model.layers.29.feed_forward.w1.weight', 'model.layers.29.feed_forward.w3.weight', 'model.layers.29.feed_forward.w2.weight', 'model.layers.29.attention_norm.weight', 'model.layers.29.ffn_norm.weight', 'model.layers.30.attention.wqkv.weight', 'model.layers.30.attention.wo.weight', 'model.layers.30.feed_forward.w1.weight', 'model.layers.30.feed_forward.w3.weight', 'model.layers.30.feed_forward.w2.weight', 'model.layers.30.attention_norm.weight', 'model.layers.30.ffn_norm.weight', 'model.layers.31.attention.wqkv.weight', 'model.layers.31.attention.wo.weight', 'model.layers.31.feed_forward.w1.weight', 'model.layers.31.feed_forward.w3.weight', 'model.layers.31.feed_forward.w2.weight', 'model.layers.31.attention_norm.weight', 'model.layers.31.ffn_norm.weight', 'model.norm.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.pre_layrnorm.weight', 'model.vision_tower.vision_tower.vision_model.pre_layrnorm.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.post_layernorm.weight', 'model.vision_tower.vision_tower.vision_model.post_layernorm.bias', 'model.mm_projector.0.weight', 'model.mm_projector.0.bias', 'model.mm_projector.2.weight', 'model.mm_projector.2.bias', 'output.weight', 'lm_head.weight'] +[INFO|trainer.py:571] 2025-01-16 22:25:20,368 >> Using auto half precision backend +[2025-01-16 22:25:32,104] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.4, git-hash=unknown, git-branch=unknown +[2025-01-16 22:25:32,134] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-16 22:25:32,136] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2025-01-16 22:25:32,136] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-01-16 22:25:32,164] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2025-01-16 22:25:32,164] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type= +[2025-01-16 22:25:32,164] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2025-01-16 22:25:32,164] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +[2025-01-16 22:25:35,124] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning +[2025-01-16 22:25:35,125] [INFO] [utils.py:782:see_memory_usage] MA 1.51 GB Max_MA 3.78 GB CA 4.29 GB Max_CA 13 GB +[2025-01-16 22:25:35,125] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.58 GB, percent = 23.2% +[2025-01-16 22:25:35,128] [INFO] [stage3.py:130:__init__] Reduce bucket size 16777216 +[2025-01-16 22:25:35,128] [INFO] [stage3.py:131:__init__] Prefetch bucket size 15099494 +[2025-01-16 22:25:38,012] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2025-01-16 22:25:38,012] [INFO] [utils.py:782:see_memory_usage] MA 1.51 GB Max_MA 1.51 GB CA 4.29 GB Max_CA 4 GB +[2025-01-16 22:25:38,013] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.58 GB, percent = 23.2% +Parameter Offload: Total persistent parameters: 603136 in 313 params +[2025-01-16 22:25:41,003] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2025-01-16 22:25:41,003] [INFO] [utils.py:782:see_memory_usage] MA 1.51 GB Max_MA 1.55 GB CA 4.29 GB Max_CA 4 GB +[2025-01-16 22:25:41,004] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:25:43,852] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions +[2025-01-16 22:25:43,853] [INFO] [utils.py:782:see_memory_usage] MA 1.51 GB Max_MA 1.51 GB CA 4.29 GB Max_CA 4 GB +[2025-01-16 22:25:43,853] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:25:50,442] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 4 +[2025-01-16 22:25:50,443] [INFO] [utils.py:782:see_memory_usage] MA 1.49 GB Max_MA 1.51 GB CA 4.94 GB Max_CA 5 GB +[2025-01-16 22:25:50,443] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:25:53,256] [INFO] [utils.py:781:see_memory_usage] Before creating fp32 partitions +[2025-01-16 22:25:53,256] [INFO] [utils.py:782:see_memory_usage] MA 1.49 GB Max_MA 1.49 GB CA 4.94 GB Max_CA 5 GB +[2025-01-16 22:25:53,256] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:25:56,197] [INFO] [utils.py:781:see_memory_usage] After creating fp32 partitions +[2025-01-16 22:25:56,197] [INFO] [utils.py:782:see_memory_usage] MA 3.45 GB Max_MA 4.33 GB CA 7.79 GB Max_CA 8 GB +[2025-01-16 22:25:56,197] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:25:59,088] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2025-01-16 22:25:59,089] [INFO] [utils.py:782:see_memory_usage] MA 3.45 GB Max_MA 3.45 GB CA 7.79 GB Max_CA 8 GB +[2025-01-16 22:25:59,089] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:26:02,005] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2025-01-16 22:26:02,006] [INFO] [utils.py:782:see_memory_usage] MA 3.45 GB Max_MA 5.35 GB CA 9.69 GB Max_CA 10 GB +[2025-01-16 22:26:02,006] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:26:02,006] [INFO] [stage3.py:486:_setup_for_real_optimizer] optimizer state initialized +[2025-01-16 22:26:05,277] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2025-01-16 22:26:05,279] [INFO] [utils.py:782:see_memory_usage] MA 4.46 GB Max_MA 5.88 GB CA 11.1 GB Max_CA 11 GB +[2025-01-16 22:26:05,280] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 69.57 GB, percent = 23.2% +[2025-01-16 22:26:05,280] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer_Stage3 +[2025-01-16 22:26:05,280] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2025-01-16 22:26:05,280] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2025-01-16 22:26:05,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +[2025-01-16 22:26:05,281] [INFO] [config.py:997:print] DeepSpeedEngine configuration: +[2025-01-16 22:26:05,281] [INFO] [config.py:1001:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-16 22:26:05,281] [INFO] [config.py:1001:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] amp_enabled .................. False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] amp_params ................... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] bfloat16_enabled ............. True +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] bfloat16_immediate_grad_update False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] checkpoint_parallel_write_pipeline False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] checkpoint_tag_validation_enabled True +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] checkpoint_tag_validation_fail False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] comms_config ................. +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] communication_data_type ...... None +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] curriculum_enabled_legacy .... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] curriculum_params_legacy ..... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] data_efficiency_enabled ...... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] dataloader_drop_last ......... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] disable_allgather ............ False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] dump_state ................... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] dynamic_loss_scale_args ...... None +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_enabled ........... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_layer_num ......... 0 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_max_iter .......... 100 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_stability ......... 1e-06 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_tol ............... 0.01 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] eigenvalue_verbose ........... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] elasticity_enabled ........... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] fp16_auto_cast ............... None +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] fp16_enabled ................. False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] fp16_master_weights_and_gradients False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] global_rank .................. 0 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] grad_accum_dtype ............. None +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] gradient_accumulation_steps .. 2 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] gradient_clipping ............ 0.0 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] gradient_predivide_factor .... 1.0 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] graph_harvesting ............. False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] initial_dynamic_scale ........ 1 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] load_universal_checkpoint .... False +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] loss_scale ................... 1.0 +[2025-01-16 22:26:05,282] [INFO] [config.py:1001:print] memory_breakdown ............. False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] mics_hierarchial_params_gather False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] mics_shard_size .............. -1 +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] optimizer_legacy_fusion ...... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] optimizer_name ............... None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] optimizer_params ............. None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] pld_enabled .................. False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] pld_params ................... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] prescale_gradients ........... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] scheduler_name ............... None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] scheduler_params ............. None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] seq_parallel_communication_data_type torch.float32 +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] sparse_attention ............. None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] sparse_gradients_enabled ..... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] steps_per_print .............. inf +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] timers_config ................ enabled=True synchronized=True +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] train_batch_size ............. 128 +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] train_micro_batch_size_per_gpu 4 +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] use_data_before_expert_parallel_ False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] use_node_local_storage ....... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] wall_clock_breakdown ......... False +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] weight_quantization_config ... None +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] world_size ................... 16 +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] zero_allow_untested_optimizer True +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] zero_enabled ................. True +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] zero_force_ds_cpu_optimizer .. True +[2025-01-16 22:26:05,283] [INFO] [config.py:1001:print] zero_optimization_stage ...... 3 +[2025-01-16 22:26:05,283] [INFO] [config.py:987:print_user_config] json = { + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": 4, + "train_batch_size": 128, + "gradient_accumulation_steps": 2, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1.000000e+09, + "reduce_bucket_size": 1.677722e+07, + "stage3_prefetch_bucket_size": 1.509949e+07, + "stage3_param_persistence_threshold": 4.096000e+04, + "stage3_max_live_parameters": 1.000000e+09, + "stage3_max_reuse_distance": 1.000000e+09, + "stage3_gather_16bit_weights_on_model_save": true + }, + "steps_per_print": inf, + "zero_allow_untested_optimizer": true +} +[INFO|trainer.py:1721] 2025-01-16 22:26:05,283 >> ***** Running training ***** +[INFO|trainer.py:1722] 2025-01-16 22:26:05,283 >> Num examples = 944,379 +[INFO|trainer.py:1723] 2025-01-16 22:26:05,283 >> Num Epochs = 1 +[INFO|trainer.py:1724] 2025-01-16 22:26:05,283 >> Instantaneous batch size per device = 4 +[INFO|trainer.py:1727] 2025-01-16 22:26:05,284 >> Total train batch size (w. parallel, distributed & accumulation) = 128 +[INFO|trainer.py:1728] 2025-01-16 22:26:05,284 >> Gradient Accumulation steps = 2 +[INFO|trainer.py:1729] 2025-01-16 22:26:05,284 >> Total optimization steps = 7,378 +[INFO|trainer.py:1730] 2025-01-16 22:26:05,285 >> Number of trainable parameters = 8,441,260,032 +[INFO|integration_utils.py:722] 2025-01-16 22:26:05,287 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: z2855064151 (openmmlab_zxy). Use `wandb login --relogin` to force relogin +wandb: - Waiting for wandb.init()... +wandb: \ Waiting for wandb.init()... +wandb: Tracking run with wandb version 0.18.5 +wandb: Run data is saved locally in /cpfs02/user/zhaoxiangyu/code_new/LLaVA/wandb/run-20250116_222609-ai9xrssy +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt +wandb: ⭐️ View project at https://wandb.ai/openmmlab_zxy/huggingface +wandb: 🚀 View run at https://wandb.ai/openmmlab_zxy/huggingface/runs/ai9xrssy + + 0%| | 0/7378 [00:00 + train(attn_implementation="flash_attention_2") + File "/cpfs02/user/zhaoxiangyu/code_new/LLaVA/llava/train/train.py", line 904, in train + trainer.train() + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train + return inner_training_loop( + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/transformers/trainer.py", line 1836, in _inner_training_loop + for step, inputs in enumerate(epoch_iterator): + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/accelerate/data_loader.py", line 384, in __iter__ + current_batch = next(dataloader_iter) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 630, in __next__ + data = self._next_data() + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1345, in _next_data + return self._process_data(data) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1371, in _process_data + data.reraise() + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/_utils.py", line 694, in reraise + raise exception +KeyError: Caught KeyError in DataLoader worker process 0. +Original Traceback (most recent call last): + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop + data = fetcher.fetch(index) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in + data = [self.dataset[idx] for idx in possibly_batched_index] + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 302, in __getitem__ + return self.datasets[dataset_idx][sample_idx] + File "/cpfs02/user/zhaoxiangyu/code_new/LLaVA/llava/train/train.py", line 547, in __getitem__ + data_dict = preprocess(sources, self.tokenizer, has_image=('image' in self.list_data_dict[i])) + File "/cpfs02/user/zhaoxiangyu/code_new/LLaVA/llava/train/train.py", line 380, in preprocess + return preprocess_internlm(sources, tokenizer, has_image=has_image) + File "/cpfs02/user/zhaoxiangyu/code_new/LLaVA/llava/train/preprocess.py", line 425, in preprocess_internlm + role = roles[sentence['from']] +KeyError: 'internvl2_5' + +wandb: 🚀 View run llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt at: https://wandb.ai/openmmlab_zxy/huggingface/runs/ai9xrssy +wandb: Find logs at: wandb/run-20250116_222609-ai9xrssy/logs +dlc1abaccnl2nzws-master-0:88:369 [6] NCCL INFO [Service thread] Connection closed by localRank 0 +dlc1abaccnl2nzws-master-0:86:373 [4] NCCL INFO [Service thread] Connection closed by localRank 0 +dlc1abaccnl2nzws-master-0:84:371 [2] NCCL INFO [Service thread] Connection closed by localRank 0 +[2025-01-16 22:26:34,522] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 83 closing signal SIGTERM +[2025-01-16 22:26:34,522] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 84 closing signal SIGTERM +[2025-01-16 22:26:34,522] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 85 closing signal SIGTERM +[2025-01-16 22:26:34,522] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 86 closing signal SIGTERM +[2025-01-16 22:26:34,523] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 87 closing signal SIGTERM +[2025-01-16 22:26:34,523] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 88 closing signal SIGTERM +[2025-01-16 22:26:34,523] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 89 closing signal SIGTERM +[2025-01-16 22:26:34,737] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 82) of binary: /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/bin/python +Traceback (most recent call last): + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/bin/torchrun", line 8, in + sys.exit(main()) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper + return f(*args, **kwargs) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main + run(args) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run + elastic_launch( + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +llava/train/train_mem.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-01-16_22:26:34 + host : dlc1abaccnl2nzws-master-0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 82) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================