PhoenixZ committed on
Commit
b5591e4
·
verified ·
1 Parent(s): 862f8eb

Add files using upload-large-folder tool

Browse files
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/internlm/internlm2_5-7b-chat",
3
+ "architectures": ["LlavaInternlm2ForCausalLM"],
4
+ "attn_implementation": "flash_attention_2",
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_internlm2.InternLM2Config",
7
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
8
+ "AutoModelForCausalLM": "llava_internlm2.LlavaInternlm2ForCausalLM"
9
+ },
10
+ "bias": false,
11
+ "bos_token_id": 1,
12
+ "datatype_loss": false,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "image_aspect_ratio": "anyres",
17
+ "image_crop_resolution": null,
18
+ "image_grid_pinpoints": "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]",
19
+ "image_split_resolution": null,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 14336,
22
+ "max_position_embeddings": 32768,
23
+ "mm_hidden_size": 1024,
24
+ "mm_patch_merge_type": "spatial_unpad",
25
+ "mm_projector_lr": null,
26
+ "mm_projector_type": "mlp2x_gelu",
27
+ "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
28
+ "mm_use_im_patch_token": false,
29
+ "mm_use_im_start_end": false,
30
+ "mm_vision_select_feature": "patch",
31
+ "mm_vision_select_layer": -2,
32
+ "mm_vision_tower": "/fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1",
33
+ "mm_vision_tower_lr": 2e-6,
34
+ "model_type": "internlm2",
35
+ "num_attention_heads": 32,
36
+ "num_hidden_layers": 32,
37
+ "num_key_value_heads": 8,
38
+ "pad_token_id": 2,
39
+ "pretraining_tp": 1,
40
+ "rms_norm_eps": 1e-5,
41
+ "rope_scaling": {
42
+ "factor": 2.0,
43
+ "type": "dynamic"
44
+ },
45
+ "rope_theta": 1000000,
46
+ "tie_word_embeddings": false,
47
+ "tokenizer_model_max_length": 32768,
48
+ "tokenizer_padding_side": "right",
49
+ "torch_dtype": "bfloat16",
50
+ "transformers_version": "4.37.2",
51
+ "use_cache": true,
52
+ "use_mm_proj": true,
53
+ "vocab_size": 92544
54
+ }
configuration_internlm2.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """ InternLM2 model configuration"""
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
# Module-level logger obtained through the transformers logging helper.
logger = logging.get_logger(__name__)

# Archive map for InternLM2 configurations; left empty in this file.
INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
25
+
26
+
27
+ # Modified from transformers.models.llama.configuration_llama.LlamaConfig
28
class InternLM2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 103168):
            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`InternLM2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768
            tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
            to understand more about it. This value is necessary to ensure exact reproducibility
            of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        bias (`bool`, *optional*, defaults to `True`):
            Stored on the configuration as `bias`. Presumably controls whether the model's linear projections
            carry a bias term -- confirm against the modeling code.
        rope_theta (`float`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a number (int or float) >= 1. The expected
            format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on
            how these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is
            an experimental feature, subject to breaking API changes in future versions.
        attn_implementation (`str`, *optional*):
            Name of the attention implementation, stored as `attn_implementation`. Falls back to `"eager"` when
            `None` is given.
    """

    _auto_class = "AutoConfig"
    model_type = "internlm2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(  # pylint: disable=W0102
        self,
        vocab_size=103168,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        bias=True,
        rope_theta=10000,
        rope_scaling=None,
        attn_implementation=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.bias = bias

        # Without an explicit KV-head count, fall back to one KV head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Fail early on a malformed `rope_scaling` before the base class is initialised.
        self._rope_scaling_validation()

        self.attn_implementation = attn_implementation
        if self.attn_implementation is None:
            self.attn_implementation = "eager"

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: If `rope_scaling` is not a two-entry dict, if its `type` is not `"linear"` or
                `"dynamic"`, or if its `factor` is not an int/float >= 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if (
            rope_scaling_factor is None
            or not isinstance(rope_scaling_factor, (float, int))
            or rope_scaling_factor < 1.0
        ):
            raise ValueError(
                f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
                f"of type {type(rope_scaling_factor)}"
            )
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": [
4
+ 2,
5
+ 92542
6
+ ],
7
+ "pad_token_id": 2,
8
+ "transformers_version": "4.37.2"
9
+ }
llava_internlm2.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM
23
+ from llava.model.language_model.internlm2.modeling_internlm2 import InternLM2ForCausalLM, InternLM2Model
24
+ from llava.model.language_model.internlm2.configuration_internlm2 import InternLM2Config
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
30
+ from llava.utils import rank0_print
31
+ # from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
32
+
33
+
34
class LlavaInternlm2Config(InternLM2Config):
    """InternLM2 configuration that only overrides ``model_type`` for the LLaVA variant."""

    model_type = "llava_internlm2"
36
+
37
+
38
class LlavaInternlm2Model(LlavaMetaModel, InternLM2Model):
    """InternLM2 backbone combined with the ``LlavaMetaModel`` mixin."""

    config_class = LlavaInternlm2Config

    def __init__(self, config: InternLM2Config):
        # Cooperative init: walks the MRO starting at LlavaMetaModel.
        super().__init__(config)
43
+
44
+
45
class LlavaInternlm2ForCausalLM(InternLM2ForCausalLM, LlavaMetaForCausalLM):
    """Causal-LM wrapper that feeds multimodal (image + text) inputs to an InternLM2 decoder.

    Image handling goes through ``prepare_inputs_labels_for_multimodal`` (presumably provided by
    ``LlavaMetaForCausalLM`` -- confirm). When ``config.datatype_loss`` is truthy, ``forward`` computes
    the loss itself and additionally reports a per-sample loss.
    """

    config_class = LlavaInternlm2Config

    def __init__(self, config):
        # Initialise the InternLM2 parent explicitly rather than via cooperative super().
        InternLM2ForCausalLM.__init__(self, config)
        # Swap in the LLaVA-aware backbone.
        self.model = LlavaInternlm2Model(config)
        self.vocab_size = config.vocab_size
        # Optional flag; configs without it behave as if it were False.
        self.datatype_loss = getattr(config, "datatype_loss", False)
        if self.datatype_loss:
            rank0_print("Logging per datatype loss")
        # NOTE(review): `lm_head` is created here but the code in this class projects hidden states
        # with `self.output`, which is not assigned in this class (presumably by
        # InternLM2ForCausalLM.__init__ -- confirm). Kept so existing checkpoints keep loading.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the underlying ``LlavaInternlm2Model`` backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        modalities: Optional[List[str]] = ["image"],
        data_type: Optional[str] = "normal",
        return_dict: Optional[bool] = None,
        dpo_forward: Optional[bool] = False,
        cache_position=None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the multimodal forward pass.

        If ``inputs_embeds`` is not supplied, the text/image inputs are first merged by
        ``prepare_inputs_labels_for_multimodal``. ``modalities``, ``data_type`` and ``cache_position``
        are accepted but not used by this method.

        Returns:
            * ``datatype_loss`` falsy and ``dpo_forward`` truthy: the tuple ``(logits, labels)``.
            * ``datatype_loss`` falsy otherwise: whatever the parent ``forward`` returns.
            * ``datatype_loss`` truthy: a tuple when ``return_dict`` resolves to False, else a
              ``CausalLMOutputWithPast`` that also carries ``per_sample_losses``.
        """
        if inputs_embeds is None:
            # `image_sizes` is passed by keyword (as `generate` does) so it cannot be bound to a
            # different positional parameter of the helper.
            (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = (
                self.prepare_inputs_labels_for_multimodal(
                    input_ids,
                    position_ids,
                    attention_mask,
                    past_key_values,
                    labels,
                    images,
                    image_sizes=image_sizes,
                )
            )

        if not self.datatype_loss:
            if dpo_forward:
                # DPO path: hand back raw logits together with the (possibly rewritten) labels.
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=inputs_embeds,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                )
                hidden_states = outputs[0]
                logits = self.output(hidden_states)
                return logits, labels

            return super().forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                labels=labels,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        # ----- datatype_loss path: compute the loss here so it can be split per sample -----
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.output(hidden_states)
        logits = logits.float()

        loss = None
        per_sample_losses = None

        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)

            # Token-level loss: reduction="none" keeps one value per flattened position.
            loss_fct = CrossEntropyLoss(reduction="none")
            token_losses = loss_fct(shift_logits, shift_labels)

            # Reshape token losses to [batch_size, seq_len - 1]
            token_losses = token_losses.view(-1, shift_logits.size(0) // inputs_embeds.size(0))

            # Mask out positions whose label is -100
            active_tokens = (shift_labels != -100).view(-1, token_losses.size(1))
            token_losses *= active_tokens

            # Per-sample mean over active tokens; clamp avoids dividing by zero for fully masked rows.
            per_sample_losses = token_losses.sum(dim=1) / active_tokens.sum(dim=1).clamp(min=1)

            # Overall loss is the mean of the per-sample losses
            loss = per_sample_losses.mean()

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        device = input_ids.device if input_ids is not None else inputs_embeds.device
        output = CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        output['logits'] = output['logits'].to(device)

        output["per_sample_losses"] = per_sample_losses  # Include per-sample losses in the output

        return output

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on images.

        The prompt is always converted to ``inputs_embeds`` before delegating to the parent
        ``generate``: via ``prepare_inputs_labels_for_multimodal`` when ``images`` is given, otherwise
        via the backbone's input embedding layer.

        Raises:
            NotImplementedError: If the caller passes ``inputs_embeds`` in ``kwargs``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (inputs, position_ids, attention_mask, _, inputs_embeds, _) = (
                self.prepare_inputs_labels_for_multimodal(
                    inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes
                )
            )
        else:
            inputs_embeds = self.get_model().get_input_embeddings()(inputs)

        return super().generate(
            position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Build per-step generation inputs, forwarding ``images`` / ``image_sizes`` when present."""
        # Pop the multimodal extras so the parent implementation never sees them.
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            inputs['images'] = images
        if image_sizes is not None:
            inputs['image_sizes'] = image_sizes
        return inputs
239
+
240
+
241
# Register the LLaVA-InternLM2 config and model with the transformers Auto* factories
# at import time.
AutoConfig.register("llava_internlm2", LlavaInternlm2Config)
AutoModelForCausalLM.register(LlavaInternlm2Config, LlavaInternlm2ForCausalLM)
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b11d8af859153e85107f12e24577c1b426e949a83e3ab8cbf0cf2e09ec937b0
3
+ size 4885478896
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f52b47bed4892a966b9afe7ae6c276f12d3e4d26d7b1d06d08baa858ff373dc
3
+ size 4915913424
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f86d746c312b9b04137d777ae0a226b961f51ac2191365b26c8d2ba7a12438
3
+ size 4998115008
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8f338cf107da0432a32c50e616ddc2ddab620bfca46d75e841c2a14b1c0699b
3
+ size 2083100280
model.safetensors.index.json ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16882520064
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.image_newline": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.attention.wo.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.attention.wqkv.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.attention_norm.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.ffn_norm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.1.attention.wo.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.1.attention.wqkv.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.1.attention_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.ffn_norm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.10.attention.wo.weight": "model-00002-of-00004.safetensors",
23
+ "model.layers.10.attention.wqkv.weight": "model-00002-of-00004.safetensors",
24
+ "model.layers.10.attention_norm.weight": "model-00002-of-00004.safetensors",
25
+ "model.layers.10.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
26
+ "model.layers.10.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.10.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.ffn_norm.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.11.attention.wo.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.11.attention.wqkv.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.11.attention_norm.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.11.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.11.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.11.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.11.ffn_norm.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.12.attention.wo.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.12.attention.wqkv.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.12.attention_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.12.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.12.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.12.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.12.ffn_norm.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.13.attention.wo.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.13.attention.wqkv.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.13.attention_norm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.13.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.13.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.13.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.13.ffn_norm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.14.attention.wo.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.14.attention.wqkv.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.14.attention_norm.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.14.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.14.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.14.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.14.ffn_norm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.15.attention.wo.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.15.attention.wqkv.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.15.attention_norm.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.15.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.15.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.15.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.15.ffn_norm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.16.attention.wo.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.16.attention.wqkv.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.16.attention_norm.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.16.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.16.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.16.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.16.ffn_norm.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.17.attention.wo.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.17.attention.wqkv.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.17.attention_norm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.17.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.17.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.17.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.17.ffn_norm.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.18.attention.wo.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.18.attention.wqkv.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.18.attention_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.18.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.18.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.18.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.18.ffn_norm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.19.attention.wo.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.19.attention.wqkv.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.19.attention_norm.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.19.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.19.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.19.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.19.ffn_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.2.attention.wo.weight": "model-00001-of-00004.safetensors",
93
+ "model.layers.2.attention.wqkv.weight": "model-00001-of-00004.safetensors",
94
+ "model.layers.2.attention_norm.weight": "model-00001-of-00004.safetensors",
95
+ "model.layers.2.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
96
+ "model.layers.2.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
97
+ "model.layers.2.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
98
+ "model.layers.2.ffn_norm.weight": "model-00001-of-00004.safetensors",
99
+ "model.layers.20.attention.wo.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.20.attention.wqkv.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.20.attention_norm.weight": "model-00003-of-00004.safetensors",
102
+ "model.layers.20.feed_forward.w1.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.20.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
104
+ "model.layers.20.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.20.ffn_norm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.21.attention.wo.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.21.attention.wqkv.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.21.attention_norm.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.21.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.21.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
111
+ "model.layers.21.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.21.ffn_norm.weight": "model-00003-of-00004.safetensors",
113
+ "model.layers.22.attention.wo.weight": "model-00003-of-00004.safetensors",
114
+ "model.layers.22.attention.wqkv.weight": "model-00003-of-00004.safetensors",
115
+ "model.layers.22.attention_norm.weight": "model-00003-of-00004.safetensors",
116
+ "model.layers.22.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
117
+ "model.layers.22.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.22.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.22.ffn_norm.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.23.attention.wo.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.23.attention.wqkv.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.23.attention_norm.weight": "model-00003-of-00004.safetensors",
123
+ "model.layers.23.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.23.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.23.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.23.ffn_norm.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.24.attention.wo.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.24.attention.wqkv.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.24.attention_norm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.24.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.24.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.24.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.24.ffn_norm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.25.attention.wo.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.25.attention.wqkv.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.25.attention_norm.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.25.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.25.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.25.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.25.ffn_norm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.26.attention.wo.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.26.attention.wqkv.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.26.attention_norm.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.26.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.26.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.26.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.26.ffn_norm.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.27.attention.wo.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.27.attention.wqkv.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.27.attention_norm.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.27.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.27.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.27.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.27.ffn_norm.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.28.attention.wo.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.28.attention.wqkv.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.28.attention_norm.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.28.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.28.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.28.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.28.ffn_norm.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.29.attention.wo.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.29.attention.wqkv.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.29.attention_norm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.29.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.29.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.29.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.29.ffn_norm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.3.attention.wo.weight": "model-00001-of-00004.safetensors",
170
+ "model.layers.3.attention.wqkv.weight": "model-00001-of-00004.safetensors",
171
+ "model.layers.3.attention_norm.weight": "model-00001-of-00004.safetensors",
172
+ "model.layers.3.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
173
+ "model.layers.3.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
174
+ "model.layers.3.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
175
+ "model.layers.3.ffn_norm.weight": "model-00001-of-00004.safetensors",
176
+ "model.layers.30.attention.wo.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.30.attention.wqkv.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.30.attention_norm.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.30.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.30.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.30.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.30.ffn_norm.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.31.attention.wo.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.31.attention.wqkv.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.31.attention_norm.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.31.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.31.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.31.feed_forward.w3.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.31.ffn_norm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.4.attention.wo.weight": "model-00001-of-00004.safetensors",
191
+ "model.layers.4.attention.wqkv.weight": "model-00001-of-00004.safetensors",
192
+ "model.layers.4.attention_norm.weight": "model-00001-of-00004.safetensors",
193
+ "model.layers.4.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
194
+ "model.layers.4.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
195
+ "model.layers.4.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
196
+ "model.layers.4.ffn_norm.weight": "model-00001-of-00004.safetensors",
197
+ "model.layers.5.attention.wo.weight": "model-00001-of-00004.safetensors",
198
+ "model.layers.5.attention.wqkv.weight": "model-00001-of-00004.safetensors",
199
+ "model.layers.5.attention_norm.weight": "model-00001-of-00004.safetensors",
200
+ "model.layers.5.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
201
+ "model.layers.5.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
202
+ "model.layers.5.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
203
+ "model.layers.5.ffn_norm.weight": "model-00001-of-00004.safetensors",
204
+ "model.layers.6.attention.wo.weight": "model-00001-of-00004.safetensors",
205
+ "model.layers.6.attention.wqkv.weight": "model-00001-of-00004.safetensors",
206
+ "model.layers.6.attention_norm.weight": "model-00001-of-00004.safetensors",
207
+ "model.layers.6.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
208
+ "model.layers.6.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
209
+ "model.layers.6.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
210
+ "model.layers.6.ffn_norm.weight": "model-00001-of-00004.safetensors",
211
+ "model.layers.7.attention.wo.weight": "model-00001-of-00004.safetensors",
212
+ "model.layers.7.attention.wqkv.weight": "model-00001-of-00004.safetensors",
213
+ "model.layers.7.attention_norm.weight": "model-00001-of-00004.safetensors",
214
+ "model.layers.7.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
215
+ "model.layers.7.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.7.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.7.ffn_norm.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.8.attention.wo.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.8.attention.wqkv.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.8.attention_norm.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.8.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.8.feed_forward.w2.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.8.feed_forward.w3.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.8.ffn_norm.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.9.attention.wo.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.9.attention.wqkv.weight": "model-00001-of-00004.safetensors",
227
+ "model.layers.9.attention_norm.weight": "model-00002-of-00004.safetensors",
228
+ "model.layers.9.feed_forward.w1.weight": "model-00001-of-00004.safetensors",
229
+ "model.layers.9.feed_forward.w2.weight": "model-00002-of-00004.safetensors",
230
+ "model.layers.9.feed_forward.w3.weight": "model-00002-of-00004.safetensors",
231
+ "model.layers.9.ffn_norm.weight": "model-00002-of-00004.safetensors",
232
+ "model.mm_projector.0.bias": "model-00004-of-00004.safetensors",
233
+ "model.mm_projector.0.weight": "model-00004-of-00004.safetensors",
234
+ "model.mm_projector.2.bias": "model-00004-of-00004.safetensors",
235
+ "model.mm_projector.2.weight": "model-00004-of-00004.safetensors",
236
+ "model.norm.weight": "model-00003-of-00004.safetensors",
237
+ "model.tok_embeddings.weight": "model-00001-of-00004.safetensors",
238
+ "model.vision_tower.vision_tower.vision_model.embeddings.class_embedding": "model-00003-of-00004.safetensors",
239
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
240
+ "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00004.safetensors",
241
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
242
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
243
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
244
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
245
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
246
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
247
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
248
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
249
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
250
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
252
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
256
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
258
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
259
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
260
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
261
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
262
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
263
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
264
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
265
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
266
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
267
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
268
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
270
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
271
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
272
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
273
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
274
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
275
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
276
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
277
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
278
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
279
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
280
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
281
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
282
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
283
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
284
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
285
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
286
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
287
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
288
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
289
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
290
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
291
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
292
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
293
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
294
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
295
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
296
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
297
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
298
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
299
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
300
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
301
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
302
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
303
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
304
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
305
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
306
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
307
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
308
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
309
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
310
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
311
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
312
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
313
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
314
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
315
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
316
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
317
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
318
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
319
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
320
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
321
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
322
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
323
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
324
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
325
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
326
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
327
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
328
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
329
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
330
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
332
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
333
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
334
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
335
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
336
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
337
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
338
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
339
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
340
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
341
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
342
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
343
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
344
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
345
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
346
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
347
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
348
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
349
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
350
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
351
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
352
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
353
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
354
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
355
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
356
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
357
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
358
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
359
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
360
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
361
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
362
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
363
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
364
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
365
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
366
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
367
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
368
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
369
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
370
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
371
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
372
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
373
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
374
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
375
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
376
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
377
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
378
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
379
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
380
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
381
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
382
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
383
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
384
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
385
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
386
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
387
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
388
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
389
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
390
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
391
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
392
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
393
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
394
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
395
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
396
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
397
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
398
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
399
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
400
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
401
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
402
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
403
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
404
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
405
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
406
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
407
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
408
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
409
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
410
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
411
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
412
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
413
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
414
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
415
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
416
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
417
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
418
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
419
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
420
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
421
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
422
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
423
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
424
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
425
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
426
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
427
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
428
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
429
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
430
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
431
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
432
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
433
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
434
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
435
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
436
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
437
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
438
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
439
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
440
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
441
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
442
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
443
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
444
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
445
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
446
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
447
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
448
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
449
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
450
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
451
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
452
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
453
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
454
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
455
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
456
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
457
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
458
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
459
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
460
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
461
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
462
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
463
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
464
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
465
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
466
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
467
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
468
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
469
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
470
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
471
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
472
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
473
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
474
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
475
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
476
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
477
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
478
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
479
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
480
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
481
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
482
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
483
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
484
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
485
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
486
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
487
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
488
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
489
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
490
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
491
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
492
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
493
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
494
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
495
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
496
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
497
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
498
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
499
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
500
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
501
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
502
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
503
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
504
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
505
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
506
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
507
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
508
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
509
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
510
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
511
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
512
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
513
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00004-of-00004.safetensors",
514
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00004-of-00004.safetensors",
515
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00004-of-00004.safetensors",
516
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00004-of-00004.safetensors",
517
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00004-of-00004.safetensors",
518
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
519
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00004-of-00004.safetensors",
520
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00004-of-00004.safetensors",
521
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
522
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
523
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
524
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
525
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
526
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
527
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
528
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
529
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00004-of-00004.safetensors",
530
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00004-of-00004.safetensors",
531
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00004-of-00004.safetensors",
532
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00004-of-00004.safetensors",
533
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00004-of-00004.safetensors",
534
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00004-of-00004.safetensors",
535
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00004-of-00004.safetensors",
536
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00004-of-00004.safetensors",
537
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
538
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
539
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
540
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
541
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
542
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
543
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
544
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
545
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00004-of-00004.safetensors",
546
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00004-of-00004.safetensors",
547
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
548
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
549
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
550
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
551
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
552
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
553
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
554
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
555
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
556
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
557
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
558
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
559
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
560
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
561
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
562
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
563
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
564
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
565
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
566
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
567
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
568
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
569
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
570
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
571
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
572
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
573
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
574
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
575
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
576
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
577
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
578
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
579
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
580
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
581
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
582
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
583
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
584
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
585
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
586
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
587
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
588
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
589
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
590
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
591
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
592
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
593
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
594
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
595
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
596
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
597
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
598
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
599
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
600
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
601
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
602
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
603
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
604
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
605
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
606
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
607
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
608
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
609
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
610
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
611
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
612
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
613
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
614
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
615
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
616
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
617
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
618
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
619
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
620
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
621
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
622
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
623
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
624
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
625
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
626
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors",
627
+ "model.vision_tower.vision_tower.vision_model.pre_layrnorm.bias": "model-00003-of-00004.safetensors",
628
+ "model.vision_tower.vision_tower.vision_model.pre_layrnorm.weight": "model-00003-of-00004.safetensors",
629
+ "output.weight": "model-00004-of-00004.safetensors"
630
+ }
631
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|action_start|>",
6
+ "<|action_end|>",
7
+ "<|interpreter|>",
8
+ "<|plugin|>"
9
+ ],
10
+ "bos_token": {
11
+ "content": "<s>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ "eos_token": {
18
+ "content": "</s>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "</s>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "unk_token": {
32
+ "content": "<unk>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ }
tokenization_internlm2.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Tokenization classes for InternLM."""
19
+ import os
20
+ from shutil import copyfile
21
+ from typing import Any, Dict, List, Optional, Tuple
22
+
23
+ import sentencepiece as spm
24
+ from transformers.tokenization_utils import PreTrainedTokenizer
25
+ from transformers.utils import logging
26
+
27
+ logger = logging.get_logger(__name__)  # module-level logger obtained from transformers.utils.logging
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}  # exposed on the tokenizer class as `vocab_files_names`
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {}  # empty; exposed on the tokenizer class as `pretrained_vocab_files_map`
32
+
33
+
34
+ # Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
35
+ class InternLM2Tokenizer(PreTrainedTokenizer):
36
+ """
37
+ Construct an InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
+
39
+ Args:
40
+ vocab_file (`str`):
41
+ Path to the vocabulary file.
42
+ """
43
+
44
+ vocab_files_names = VOCAB_FILES_NAMES
45
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
46
+ model_input_names = ["input_ids", "attention_mask"]
47
+ _auto_class = "AutoTokenizer"
48
+
49
+ def __init__(
50
+ self,
51
+ vocab_file,
52
+ unk_token="<unk>",
53
+ bos_token="<s>",
54
+ eos_token="</s>",
55
+ pad_token="</s>",
56
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
57
+ add_bos_token=True,
58
+ add_eos_token=False,
59
+ decode_with_prefix_space=False,
60
+ clean_up_tokenization_spaces=False,
61
+ **kwargs,
62
+ ):
63
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
64
+ self.vocab_file = vocab_file
65
+ self.add_bos_token = add_bos_token
66
+ self.add_eos_token = add_eos_token
67
+ self.decode_with_prefix_space = decode_with_prefix_space
68
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
+ self.sp_model.Load(vocab_file)
70
+ self._no_prefix_space_tokens = None
71
+ super().__init__(
72
+ bos_token=bos_token,
73
+ eos_token=eos_token,
74
+ unk_token=unk_token,
75
+ pad_token=pad_token,
76
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
77
+ **kwargs,
78
+ )
79
+
80
+ @property
81
+ def no_prefix_space_tokens(self):
82
+ if self._no_prefix_space_tokens is None:
83
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
85
+ return self._no_prefix_space_tokens
86
+
87
+ @property
88
+ def vocab_size(self):
89
+ """Returns vocab size"""
90
+ return self.sp_model.get_piece_size()
91
+
92
+ @property
93
+ def bos_token_id(self) -> Optional[int]:
94
+ return self.sp_model.bos_id()
95
+
96
+ @property
97
+ def eos_token_id(self) -> Optional[int]:
98
+ return self.sp_model.eos_id()
99
+
100
+ def get_vocab(self):
101
+ """Returns vocab as a dict"""
102
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
103
+ vocab.update(self.added_tokens_encoder)
104
+ return vocab
105
+
106
+ def _tokenize(self, text):
107
+ """Returns a tokenized string."""
108
+ return self.sp_model.encode(text, out_type=str)
109
+
110
+ def _convert_token_to_id(self, token):
111
+ """Converts a token (str) in an id using the vocab."""
112
+ return self.sp_model.piece_to_id(token)
113
+
114
+ def _convert_id_to_token(self, index):
115
+ """Converts an index (integer) in a token (str) using the vocab."""
116
+ token = self.sp_model.IdToPiece(index)
117
+ return token
118
+
119
+ def _maybe_add_prefix_space(self, tokens, decoded):
120
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
121
+ return " " + decoded
122
+ else:
123
+ return decoded
124
+
125
+ def convert_tokens_to_string(self, tokens):
126
+ """Converts a sequence of tokens (string) in a single string."""
127
+ current_sub_tokens = []
128
+ out_string = ""
129
+ prev_is_special = False
130
+ for token in tokens:
131
+ # make sure that special tokens are not decoded using sentencepiece model
132
+ if token in self.all_special_tokens:
133
+ if not prev_is_special:
134
+ out_string += " "
135
+ out_string += self.sp_model.decode(current_sub_tokens) + token
136
+ prev_is_special = True
137
+ current_sub_tokens = []
138
+ else:
139
+ current_sub_tokens.append(token)
140
+ prev_is_special = False
141
+ out_string += self.sp_model.decode(current_sub_tokens)
142
+ out_string = self.clean_up_tokenization(out_string)
143
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
144
+ return out_string[1:]
145
+
146
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
+ """
148
+ Save the vocabulary and special tokens file to a directory.
149
+
150
+ Args:
151
+ save_directory (`str`):
152
+ The directory in which to save the vocabulary.
153
+
154
+ Returns:
155
+ `Tuple(str)`: Paths to the files saved.
156
+ """
157
+ if not os.path.isdir(save_directory):
158
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
+ return
160
+ out_vocab_file = os.path.join(
161
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
+ )
163
+
164
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
165
+ copyfile(self.vocab_file, out_vocab_file)
166
+ elif not os.path.isfile(self.vocab_file):
167
+ with open(out_vocab_file, "wb") as fi:
168
+ content_spiece_model = self.sp_model.serialized_model_proto()
169
+ fi.write(content_spiece_model)
170
+
171
+ return (out_vocab_file,)
172
+
173
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
174
+ if self.add_bos_token:
175
+ bos_token_ids = [self.bos_token_id]
176
+ else:
177
+ bos_token_ids = []
178
+
179
+ output = bos_token_ids + token_ids_0
180
+
181
+ if token_ids_1 is not None:
182
+ output = output + token_ids_1
183
+
184
+ if self.add_eos_token:
185
+ output = output + [self.eos_token_id]
186
+
187
+ return output
188
+
189
+ def get_special_tokens_mask(
190
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
191
+ ) -> List[int]:
192
+ """
193
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
194
+ special tokens using the tokenizer `prepare_for_model` method.
195
+
196
+ Args:
197
+ token_ids_0 (`List[int]`):
198
+ List of IDs.
199
+ token_ids_1 (`List[int]`, *optional*):
200
+ Optional second list of IDs for sequence pairs.
201
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
202
+ Whether or not the token list is already formatted with special tokens for the model.
203
+
204
+ Returns:
205
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
206
+ """
207
+ if already_has_special_tokens:
208
+ return super().get_special_tokens_mask(
209
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
210
+ )
211
+
212
+ if token_ids_1 is None:
213
+ return [1] + ([0] * len(token_ids_0)) + [1]
214
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
215
+
216
+ def create_token_type_ids_from_sequences(
217
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
218
+ ) -> List[int]:
219
+ """
220
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
221
+ use of token type ids, therefore a list of zeros is returned.
222
+
223
+ Args:
224
+ token_ids_0 (`List[int]`):
225
+ List of IDs.
226
+ token_ids_1 (`List[int]`, *optional*):
227
+ Optional second list of IDs for sequence pairs.
228
+
229
+ Returns:
230
+ `List[int]`: List of zeros.
231
+ """
232
+ eos = [self.eos_token_id]
233
+
234
+ if token_ids_1 is None:
235
+ return len(token_ids_0 + eos) * [0]
236
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
3
+ size 1477754
tokenizer_config.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "92538": {
28
+ "content": "<|plugin|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "92539": {
36
+ "content": "<|interpreter|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "92540": {
44
+ "content": "<|action_end|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "92541": {
52
+ "content": "<|action_start|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "92542": {
60
+ "content": "<|im_end|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "92543": {
68
+ "content": "<|im_start|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ }
75
+ },
76
+ "additional_special_tokens": [
77
+ "<|im_start|>",
78
+ "<|im_end|>",
79
+ "<|action_start|>",
80
+ "<|action_end|>",
81
+ "<|interpreter|>",
82
+ "<|plugin|>"
83
+ ],
84
+ "auto_map": {
85
+ "AutoTokenizer": [
86
+ "tokenization_internlm2.InternLM2Tokenizer",
87
+ null
88
+ ]
89
+ },
90
+ "bos_token": "<s>",
91
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
92
+ "clean_up_tokenization_spaces": false,
93
+ "eos_token": "</s>",
94
+ "model_max_length": 32768,
95
+ "pad_token": "</s>",
96
+ "padding_side": "right",
97
+ "tokenizer_class": "InternLM2Tokenizer",
98
+ "unk_token": "<unk>"
99
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69649ba3f3a80b494a682545ea472ba821288c2aff4b418db2314ea18d4c607c
3
+ size 7160
training_log_20250116_215534.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_log_20250116_215545.txt ADDED
@@ -0,0 +1,856 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-01-16 21:55:52,608] torch.distributed.run: [WARNING]
2
+ [2025-01-16 21:55:52,608] torch.distributed.run: [WARNING] *****************************************
3
+ [2025-01-16 21:55:52,608] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ [2025-01-16 21:55:52,608] torch.distributed.run: [WARNING] *****************************************
5
+ The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
6
+
7
+ 0it [00:00, ?it/s]
8
+ 0it [00:00, ?it/s]
9
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
10
+ warnings.warn(
11
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
12
+ warnings.warn(
13
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
14
+ warnings.warn(
15
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
16
+ warnings.warn(
17
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
18
+ warnings.warn(
19
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
20
+ warnings.warn(
21
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
22
+ warnings.warn(
23
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
24
+ warnings.warn(
25
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
27
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
28
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
29
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
30
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
31
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
32
+ [2025-01-16 21:56:07,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
33
+ df: df: /root/.triton/autotune/root/.triton/autotunedf: /root/.triton/autotune: 没有那个文件或目录: 没有那个文件或目录
34
+ : 没有那个文件或目录
35
+
36
+ df: /root/.triton/autotunedf: /root/.triton/autotune: 没有那个文件或目录
37
+ : 没有那个文件或目录
38
+ df: /root/.triton/autotune: 没有那个文件或目录
39
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
40
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
41
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
42
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
43
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
44
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
45
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
46
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
47
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
48
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
49
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
50
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
51
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
52
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
53
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
54
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
55
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
56
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
57
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
58
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
59
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
60
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
61
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
62
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
63
+ [2025-01-16 21:56:24,924] [INFO] [comm.py:637:init_distributed] cdb=None
64
+ [2025-01-16 21:56:24,924] [INFO] [comm.py:637:init_distributed] cdb=None
65
+ [2025-01-16 21:56:24,925] [INFO] [comm.py:637:init_distributed] cdb=None
66
+ [2025-01-16 21:56:24,925] [INFO] [comm.py:637:init_distributed] cdb=None
67
+ [2025-01-16 21:56:24,925] [INFO] [comm.py:637:init_distributed] cdb=None
68
+ [2025-01-16 21:56:24,925] [INFO] [comm.py:637:init_distributed] cdb=None
69
+ [2025-01-16 21:56:24,925] [INFO] [comm.py:637:init_distributed] cdb=None
70
+ [2025-01-16 21:56:24,926] [INFO] [comm.py:637:init_distributed] cdb=None
71
+ 01/16/2025 21:56:24 - WARNING - llava.train.train - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False
72
+ 01/16/2025 21:56:24 - INFO - llava.train.train - Training/evaluation parameters TrainingArguments(
73
+ _n_gpu=1,
74
+ adafactor=False,
75
+ adam_beta1=0.9,
76
+ adam_beta2=0.999,
77
+ adam_epsilon=1e-08,
78
+ auto_find_batch_size=False,
79
+ bf16=True,
80
+ bf16_full_eval=False,
81
+ bits=16,
82
+ cache_dir=None,
83
+ data_seed=None,
84
+ dataloader_drop_last=False,
85
+ dataloader_num_workers=4,
86
+ dataloader_persistent_workers=False,
87
+ dataloader_pin_memory=True,
88
+ ddp_backend=None,
89
+ ddp_broadcast_buffers=None,
90
+ ddp_bucket_cap_mb=None,
91
+ ddp_find_unused_parameters=None,
92
+ ddp_timeout=1800,
93
+ debug=[],
94
+ deepspeed=./scripts/zero3.json,
95
+ disable_tqdm=False,
96
+ dispatch_batches=None,
97
+ do_eval=False,
98
+ do_predict=False,
99
+ do_train=False,
100
+ double_quant=True,
101
+ eval_accumulation_steps=None,
102
+ eval_delay=0,
103
+ eval_steps=None,
104
+ evaluation_strategy=no,
105
+ fp16=False,
106
+ fp16_backend=auto,
107
+ fp16_full_eval=False,
108
+ fp16_opt_level=O1,
109
+ freeze_mm_mlp_adapter=False,
110
+ fsdp=[],
111
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
112
+ fsdp_min_num_params=0,
113
+ fsdp_transformer_layer_cls_to_wrap=None,
114
+ full_determinism=False,
115
+ gradient_accumulation_steps=2,
116
+ gradient_checkpointing=True,
117
+ gradient_checkpointing_kwargs=None,
118
+ greater_is_better=None,
119
+ group_by_length=False,
120
+ group_by_modality_length=True,
121
+ half_precision_backend=auto,
122
+ hub_always_push=False,
123
+ hub_model_id=None,
124
+ hub_private_repo=False,
125
+ hub_strategy=every_save,
126
+ hub_token=<HUB_TOKEN>,
127
+ ignore_data_skip=False,
128
+ include_inputs_for_metrics=False,
129
+ include_num_input_tokens_seen=False,
130
+ include_tokens_per_second=False,
131
+ jit_mode_eval=False,
132
+ label_names=None,
133
+ label_smoothing_factor=0.0,
134
+ learning_rate=2e-05,
135
+ length_column_name=length,
136
+ load_best_model_at_end=False,
137
+ local_rank=0,
138
+ log_level=passive,
139
+ log_level_replica=warning,
140
+ log_on_each_node=True,
141
+ logging_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt/runs/Jan16_21-56-24_dlc1w85u6fy2xvlb-worker-0,
142
+ logging_first_step=False,
143
+ logging_nan_inf_filter=True,
144
+ logging_steps=1.0,
145
+ logging_strategy=steps,
146
+ lora_alpha=16,
147
+ lora_bias=none,
148
+ lora_dropout=0.05,
149
+ lora_enable=False,
150
+ lora_r=64,
151
+ lora_weight_path=,
152
+ lr_scheduler_kwargs={},
153
+ lr_scheduler_type=cosine,
154
+ max_grad_norm=1.0,
155
+ max_steps=-1,
156
+ metric_for_best_model=None,
157
+ mm_projector_lr=None,
158
+ mm_vision_tower_lr=2e-06,
159
+ model_max_length=32768,
160
+ mp_parameters=,
161
+ mpt_attn_impl=triton,
162
+ neftune_noise_alpha=None,
163
+ no_cuda=False,
164
+ num_train_epochs=1.0,
165
+ optim=adamw_torch,
166
+ optim_args=None,
167
+ output_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt,
168
+ overwrite_output_dir=False,
169
+ past_index=-1,
170
+ per_device_eval_batch_size=4,
171
+ per_device_train_batch_size=4,
172
+ prediction_loss_only=False,
173
+ push_to_hub=False,
174
+ push_to_hub_model_id=None,
175
+ push_to_hub_organization=None,
176
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
177
+ quant_type=nf4,
178
+ ray_scope=last,
179
+ remove_unused_columns=False,
180
+ report_to=['wandb'],
181
+ resume_from_checkpoint=None,
182
+ run_name=llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt,
183
+ save_on_each_node=False,
184
+ save_only_model=False,
185
+ save_safetensors=True,
186
+ save_steps=10000,
187
+ save_strategy=steps,
188
+ save_total_limit=1,
189
+ seed=42,
190
+ skip_memory_metrics=True,
191
+ split_batches=False,
192
+ tf32=True,
193
+ torch_compile=False,
194
+ torch_compile_backend=None,
195
+ torch_compile_mode=None,
196
+ torchdynamo=None,
197
+ tpu_metrics_debug=False,
198
+ tpu_num_cores=None,
199
+ use_cpu=False,
200
+ use_ipex=False,
201
+ use_legacy_prediction_loop=False,
202
+ use_mps_device=False,
203
+ warmup_ratio=0.03,
204
+ warmup_steps=0,
205
+ weight_decay=0.0,
206
+ )
207
+ 01/16/2025 21:56:24 - INFO - llava.train.train - Training/evaluation parameters DataArguments(data_path=None, meta_path='playground/meta_json/llavanext_sample/llava_next_notext_inf37kpolishmd_de35k_know40k_knins40k_creation10kfixed_chart11kmerge_tqa8k_info28k_gpt.json', lazy_preprocess=True, is_multimodal=False, image_folder=None, image_aspect_ratio='anyres', image_grid_pinpoints='[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]', image_crop_resolution=None, image_split_resolution=None, use_data_resampling=False)
208
+ [INFO|configuration_utils.py:727] 2025-01-16 21:56:24,953 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json
209
+ [INFO|configuration_utils.py:727] 2025-01-16 21:56:24,975 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json
210
+ [INFO|configuration_utils.py:792] 2025-01-16 21:56:24,976 >> Model config InternLM2Config {
211
+ "_name_or_path": "models/internlm/internlm2_5-7b-chat",
212
+ "architectures": [
213
+ "InternLM2ForCausalLM"
214
+ ],
215
+ "attn_implementation": "eager",
216
+ "auto_map": {
217
+ "AutoConfig": "configuration_internlm2.InternLM2Config",
218
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
219
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
220
+ },
221
+ "bias": false,
222
+ "bos_token_id": 1,
223
+ "eos_token_id": 2,
224
+ "hidden_act": "silu",
225
+ "hidden_size": 4096,
226
+ "initializer_range": 0.02,
227
+ "intermediate_size": 14336,
228
+ "max_position_embeddings": 32768,
229
+ "model_type": "internlm2",
230
+ "num_attention_heads": 32,
231
+ "num_hidden_layers": 32,
232
+ "num_key_value_heads": 8,
233
+ "pad_token_id": 2,
234
+ "pretraining_tp": 1,
235
+ "rms_norm_eps": 1e-05,
236
+ "rope_scaling": {
237
+ "factor": 2.0,
238
+ "type": "dynamic"
239
+ },
240
+ "rope_theta": 1000000,
241
+ "tie_word_embeddings": false,
242
+ "torch_dtype": "bfloat16",
243
+ "transformers_version": "4.37.2",
244
+ "use_cache": true,
245
+ "vocab_size": 92544
246
+ }
247
+
248
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:24,979 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
249
+ [INFO|modeling_utils.py:3473] 2025-01-16 21:56:25,020 >> loading weights file models/internlm/internlm2_5-7b-chat/model.safetensors.index.json
250
+ [INFO|modeling_utils.py:1426] 2025-01-16 21:56:25,023 >> Instantiating LlavaInternlm2ForCausalLM model under default dtype torch.bfloat16.
251
+ [INFO|modeling_utils.py:3582] 2025-01-16 21:56:25,023 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
252
+ [INFO|configuration_utils.py:826] 2025-01-16 21:56:25,030 >> Generate config GenerationConfig {
253
+ "bos_token_id": 1,
254
+ "eos_token_id": 2,
255
+ "pad_token_id": 2
256
+ }
257
+
258
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False
259
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 3, device: cuda:3, n_gpu: 1distributed training: True, 16-bits training: False
260
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 7, device: cuda:7, n_gpu: 1distributed training: True, 16-bits training: False
261
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,414 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
262
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,415 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
263
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,426 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
264
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False
265
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,443 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
266
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 4, device: cuda:4, n_gpu: 1distributed training: True, 16-bits training: False
267
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 6, device: cuda:6, n_gpu: 1distributed training: True, 16-bits training: False
268
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,472 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
269
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,479 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
270
+ 01/16/2025 21:56:27 - WARNING - llava.train.train - Process rank: 5, device: cuda:5, n_gpu: 1distributed training: True, 16-bits training: False
271
+ [WARNING|modeling_utils.py:2918] 2025-01-16 21:56:27,787 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
272
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO cudaDriverVersion 12010
273
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO cudaDriverVersion 12010
274
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO cudaDriverVersion 12010
275
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO cudaDriverVersion 12010
276
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO cudaDriverVersion 12010
277
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO cudaDriverVersion 12010
278
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO cudaDriverVersion 12010
279
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO cudaDriverVersion 12010
280
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
281
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
282
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
283
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
284
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
285
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
286
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
287
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
288
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
289
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
290
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
291
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO Plugin name set by env to libnccl-net-none.so
292
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO Plugin name set by env to libnccl-net-none.so
293
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO Plugin name set by env to libnccl-net-none.so
294
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
295
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
296
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
297
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
298
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
299
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
300
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO Plugin name set by env to libnccl-net-none.so
301
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
302
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO Bootstrap : Using eth0:22.8.45.198<0>
303
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
304
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
305
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
306
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
307
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
308
+ dlc1w85u6fy2xvlb-worker-0:74:74 [1] NCCL INFO NET/Plugin : No plugin found, using internal implementation
309
+ dlc1w85u6fy2xvlb-worker-0:78:78 [5] NCCL INFO NET/Plugin : No plugin found, using internal implementation
310
+ dlc1w85u6fy2xvlb-worker-0:79:79 [6] NCCL INFO NET/Plugin : No plugin found, using internal implementation
311
+ dlc1w85u6fy2xvlb-worker-0:75:75 [2] NCCL INFO NET/Plugin : No plugin found, using internal implementation
312
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
313
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
314
+ dlc1w85u6fy2xvlb-worker-0:80:80 [7] NCCL INFO NET/Plugin : No plugin found, using internal implementation
315
+ dlc1w85u6fy2xvlb-worker-0:77:77 [4] NCCL INFO NET/Plugin : No plugin found, using internal implementation
316
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
317
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
318
+ dlc1w85u6fy2xvlb-worker-0:73:73 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
319
+ dlc1w85u6fy2xvlb-worker-0:76:76 [3] NCCL INFO NET/Plugin : No plugin found, using internal implementation
320
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
321
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
322
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
323
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
324
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
325
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
326
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO NCCL_IB_HCA set to mlx5
327
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO NCCL_IB_HCA set to mlx5
328
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO NCCL_IB_HCA set to mlx5
329
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO NCCL_IB_HCA set to mlx5
330
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO NCCL_IB_HCA set to mlx5
331
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO NCCL_IB_HCA set to mlx5
332
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
333
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO NCCL_IB_HCA set to mlx5
334
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
335
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO NCCL_IB_HCA set to mlx5
336
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
337
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Using network IB
338
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
339
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Using network IB
340
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
341
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Using network IB
342
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
343
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Using network IB
344
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
345
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Using network IB
346
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
347
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Using network IB
348
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
349
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Using network IB
350
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.45.198<0>
351
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Using network IB
352
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO comm 0x9b7bdc70 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0xe730164ea45002c1 - Init START
353
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO comm 0x9aa4c060 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0xe730164ea45002c1 - Init START
354
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO comm 0x9b39f9e0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0xe730164ea45002c1 - Init START
355
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO comm 0x9b9e2970 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0xe730164ea45002c1 - Init START
356
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO comm 0x9bccd080 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0xe730164ea45002c1 - Init START
357
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO comm 0x9aabc5f0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0xe730164ea45002c1 - Init START
358
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO comm 0x9bcdb060 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0xe730164ea45002c1 - Init START
359
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO comm 0x9a893920 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0xe730164ea45002c1 - Init START
360
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO NVLS multicast support is not available on dev 6
361
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,ffffffff,ffffffff
362
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO NVLS multicast support is not available on dev 3
363
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO NVLS multicast support is not available on dev 7
364
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,ffffffff,ffffffff
365
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO NVLS multicast support is not available on dev 1
366
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO NVLS multicast support is not available on dev 5
367
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,ffffffff,ffffffff
368
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO NVLS multicast support is not available on dev 0
369
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO NVLS multicast support is not available on dev 4
370
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,ffffffff,ffffffff
371
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO NVLS multicast support is not available on dev 2
372
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
373
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->4 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->11 [5] 13/-1/-1->12->11 [6] 13/4/-1->12->-1 [7] 13/-1/-1->12->11
374
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO P2P Chunksize set to 131072
375
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
376
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
377
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
378
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] -1/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->12 [6] 14/-1/-1->13->12 [7] -1/-1/-1->13->12
379
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] -1/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->14
380
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO P2P Chunksize set to 131072
381
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO P2P Chunksize set to 131072
382
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->6 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->13 [7] 15/6/-1->14->-1
383
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
384
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO P2P Chunksize set to 131072
385
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
386
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->2 [2] 11/-1/-1->10->9 [3] 11/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/2/-1->10->-1 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9
387
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] -1/-1/-1->11->10 [3] 12/-1/-1->11->10 [4] 12/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] -1/-1/-1->11->10 [7] 12/-1/-1->11->10
388
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO P2P Chunksize set to 131072
389
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO P2P Chunksize set to 131072
390
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
391
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
392
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] -1/-1/-1->9->8 [2] 10/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] -1/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8
393
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] 9/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/0/-1->8->-1 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15
394
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO P2P Chunksize set to 131072
395
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO P2P Chunksize set to 131072
396
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 03/0 : 12[4] -> 15[7] via P2P/IPC/read
397
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 01/0 : 10[2] -> 15[7] via P2P/IPC/read
398
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/IPC/read
399
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 07/0 : 12[4] -> 15[7] via P2P/IPC/read
400
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 00/0 : 9[1] -> 0[0] [send] via NET/IB/0/GDRDMA
401
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 03/0 : 7[7] -> 14[6] [receive] via NET/IB/3/GDRDMA
402
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 04/0 : 9[1] -> 0[0] [send] via NET/IB/0/GDRDMA
403
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 07/0 : 7[7] -> 14[6] [receive] via NET/IB/3/GDRDMA
404
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 05/0 : 10[2] -> 15[7] via P2P/IPC/read
405
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/IPC/read
406
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 03/0 : 8[0] -> 13[5] via P2P/IPC/read
407
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 07/0 : 8[0] -> 13[5] via P2P/IPC/read
408
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 02/0 : 5[5] -> 12[4] [receive] via NET/IB/2/GDRDMA
409
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 06/0 : 5[5] -> 12[4] [receive] via NET/IB/2/GDRDMA
410
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 2[2] [send] via NET/IB/1/GDRDMA
411
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 2[2] [send] via NET/IB/1/GDRDMA
412
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 00/0 : 1[1] -> 8[0] [receive] via NET/IB/0/GDRDMA
413
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 01/0 : 3[3] -> 10[2] [receive] via NET/IB/1/GDRDMA
414
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 04/0 : 1[1] -> 8[0] [receive] via NET/IB/0/GDRDMA
415
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 00/0 : 8[0] -> 15[7] via P2P/IPC/read
416
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 05/0 : 3[3] -> 10[2] [receive] via NET/IB/1/GDRDMA
417
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/IPC/read
418
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 01/0 : 12[4] -> 9[1] via P2P/IPC/read
419
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/IPC/read
420
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 02/0 : 13[5] -> 4[4] [send] via NET/IB/2/GDRDMA
421
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 06/0 : 13[5] -> 4[4] [send] via NET/IB/2/GDRDMA
422
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 03/0 : 14[6] -> 11[3] via P2P/IPC/read
423
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 03/0 : 15[7] -> 6[6] [send] via NET/IB/3/GDRDMA
424
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 07/0 : 15[7] -> 6[6] [send] via NET/IB/3/GDRDMA
425
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 05/0 : 12[4] -> 9[1] via P2P/IPC/read
426
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/IPC/read
427
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/IPC/read
428
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 07/0 : 14[6] -> 11[3] via P2P/IPC/read
429
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/IPC/read
430
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/IPC/read
431
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/IPC/read
432
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/IPC/read
433
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/IPC/read
434
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/IPC/read
435
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/IPC/read
436
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/IPC/read
437
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/IPC/read
438
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/IPC/read
439
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/IPC/read
440
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/IPC/read
441
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/IPC/read
442
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/IPC/read
443
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/IPC/read
444
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 03/0 : 11[3] -> 10[2] via P2P/IPC/read
445
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/IPC/read
446
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/IPC/read
447
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/IPC/read
448
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/IPC/read
449
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/IPC/read
450
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 05/0 : 13[5] -> 12[4] via P2P/IPC/read
451
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/IPC/read
452
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/IPC/read
453
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/IPC/read
454
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/IPC/read
455
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/IPC/read
456
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/IPC/read
457
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/IPC/read
458
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/IPC/read
459
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/IPC/read
460
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/IPC/read
461
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 04/0 : 12[4] -> 11[3] via P2P/IPC/read
462
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 06/0 : 14[6] -> 13[5] via P2P/IPC/read
463
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/IPC/read
464
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/IPC/read
465
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/IPC/read
466
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/IPC/read
467
+ dlc1w85u6fy2xvlb-worker-0:78:357 [5] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
468
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/IPC/read
469
+ dlc1w85u6fy2xvlb-worker-0:76:360 [3] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
470
+ dlc1w85u6fy2xvlb-worker-0:77:356 [4] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
471
+ dlc1w85u6fy2xvlb-worker-0:77:356 [4] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
472
+ dlc1w85u6fy2xvlb-worker-0:74:363 [1] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
473
+ dlc1w85u6fy2xvlb-worker-0:78:357 [5] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
474
+ dlc1w85u6fy2xvlb-worker-0:76:360 [3] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
475
+ dlc1w85u6fy2xvlb-worker-0:80:358 [7] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
476
+ dlc1w85u6fy2xvlb-worker-0:77:356 [4] NCCL INFO NCCL_IB_TC set by environment to 136.
477
+ dlc1w85u6fy2xvlb-worker-0:77:356 [4] NCCL INFO NCCL_IB_SL set by environment to 5.
478
+ dlc1w85u6fy2xvlb-worker-0:77:356 [4] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
479
+ dlc1w85u6fy2xvlb-worker-0:74:363 [1] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
480
+ dlc1w85u6fy2xvlb-worker-0:75:359 [2] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
481
+ dlc1w85u6fy2xvlb-worker-0:75:359 [2] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
482
+ dlc1w85u6fy2xvlb-worker-0:80:358 [7] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
483
+ dlc1w85u6fy2xvlb-worker-0:79:361 [6] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
484
+ dlc1w85u6fy2xvlb-worker-0:79:361 [6] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
485
+ dlc1w85u6fy2xvlb-worker-0:75:359 [2] NCCL INFO NCCL_IB_TC set by environment to 136.
486
+ dlc1w85u6fy2xvlb-worker-0:75:359 [2] NCCL INFO NCCL_IB_SL set by environment to 5.
487
+ dlc1w85u6fy2xvlb-worker-0:75:359 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
488
+ dlc1w85u6fy2xvlb-worker-0:76:360 [3] NCCL INFO NCCL_IB_TC set by environment to 136.
489
+ dlc1w85u6fy2xvlb-worker-0:76:360 [3] NCCL INFO NCCL_IB_SL set by environment to 5.
490
+ dlc1w85u6fy2xvlb-worker-0:76:360 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
491
+ dlc1w85u6fy2xvlb-worker-0:78:357 [5] NCCL INFO NCCL_IB_TC set by environment to 136.
492
+ dlc1w85u6fy2xvlb-worker-0:78:357 [5] NCCL INFO NCCL_IB_SL set by environment to 5.
493
+ dlc1w85u6fy2xvlb-worker-0:78:357 [5] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
494
+ dlc1w85u6fy2xvlb-worker-0:79:361 [6] NCCL INFO NCCL_IB_TC set by environment to 136.
495
+ dlc1w85u6fy2xvlb-worker-0:79:361 [6] NCCL INFO NCCL_IB_SL set by environment to 5.
496
+ dlc1w85u6fy2xvlb-worker-0:73:362 [0] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
497
+ dlc1w85u6fy2xvlb-worker-0:73:362 [0] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
498
+ dlc1w85u6fy2xvlb-worker-0:79:361 [6] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
499
+ dlc1w85u6fy2xvlb-worker-0:74:363 [1] NCCL INFO NCCL_IB_TC set by environment to 136.
500
+ dlc1w85u6fy2xvlb-worker-0:74:363 [1] NCCL INFO NCCL_IB_SL set by environment to 5.
501
+ dlc1w85u6fy2xvlb-worker-0:74:363 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
502
+ dlc1w85u6fy2xvlb-worker-0:80:358 [7] NCCL INFO NCCL_IB_TC set by environment to 136.
503
+ dlc1w85u6fy2xvlb-worker-0:80:358 [7] NCCL INFO NCCL_IB_SL set by environment to 5.
504
+ dlc1w85u6fy2xvlb-worker-0:80:358 [7] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
505
+ dlc1w85u6fy2xvlb-worker-0:73:362 [0] NCCL INFO NCCL_IB_TC set by environment to 136.
506
+ dlc1w85u6fy2xvlb-worker-0:73:362 [0] NCCL INFO NCCL_IB_SL set by environment to 5.
507
+ dlc1w85u6fy2xvlb-worker-0:73:362 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
508
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Connected all rings
509
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Connected all rings
510
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Connected all rings
511
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Connected all rings
512
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/IPC/read
513
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/IPC/read
514
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/IPC/read
515
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Connected all rings
516
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/IPC/read
517
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Connected all rings
518
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/IPC/read
519
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Connected all rings
520
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/IPC/read
521
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Connected all rings
522
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/IPC/read
523
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 01/0 : 8[0] -> 9[1] via P2P/IPC/read
524
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/IPC/read
525
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/IPC/read
526
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/IPC/read
527
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/IPC/read
528
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/IPC/read
529
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/IPC/read
530
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/IPC/read
531
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/IPC/read
532
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/IPC/read
533
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/IPC/read
534
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/IPC/read
535
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/IPC/read
536
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/IPC/read
537
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/IPC/read
538
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/IPC/read
539
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/IPC/read
540
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/IPC/read
541
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/IPC/read
542
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/IPC/read
543
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/IPC/read
544
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/IPC/read
545
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/IPC/read
546
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/IPC/read
547
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/IPC/read
548
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/IPC/read
549
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/IPC/read
550
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/IPC/read
551
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/IPC/read
552
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/IPC/read
553
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/IPC/read
554
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/IPC/read
555
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/IPC/read
556
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 03/0 : 10[2] -> 11[3] via P2P/IPC/read
557
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 05/0 : 12[4] -> 13[5] via P2P/IPC/read
558
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/IPC/read
559
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/IPC/read
560
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/IPC/read
561
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/IPC/read
562
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/IPC/read
563
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/IPC/read
564
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/IPC/read
565
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/IPC/read
566
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 07/0 : 14[6] -> 15[7] via P2P/IPC/read
567
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/IPC/read
568
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/IPC/read
569
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/2/GDRDMA
570
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/2/GDRDMA
571
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/2/GDRDMA
572
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/2/GDRDMA
573
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/IPC/read
574
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/3/GDRDMA
575
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/3/GDRDMA
576
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/3/GDRDMA
577
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/3/GDRDMA
578
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/IPC/read
579
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/IPC/read
580
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/IPC/read
581
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/1/GDRDMA
582
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/1/GDRDMA
583
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/1/GDRDMA
584
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/1/GDRDMA
585
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/IPC/read
586
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/IPC/read
587
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/IPC/read
588
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/IPC/read
589
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/IPC/read
590
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
591
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
592
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
593
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
594
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/IPC/read
595
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/IPC/read
596
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/IPC/read
597
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/IPC/read
598
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/IPC/read
599
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/IPC/read
600
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/IPC/read
601
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/IPC/read
602
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/IPC/read
603
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Channel 07/0 : 15[7] -> 14[6] via P2P/IPC/read
604
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO Connected all trees
605
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
606
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
607
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO Connected all trees
608
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
609
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
610
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO Connected all trees
611
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
612
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
613
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO Connected all trees
614
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
615
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
616
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO Connected all trees
617
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
618
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
619
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO Connected all trees
620
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
621
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
622
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO Connected all trees
623
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
624
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
625
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO Connected all trees
626
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
627
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
628
+ dlc1w85u6fy2xvlb-worker-0:74:318 [1] NCCL INFO comm 0x9a893920 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0xe730164ea45002c1 - Init COMPLETE
629
+ dlc1w85u6fy2xvlb-worker-0:78:321 [5] NCCL INFO comm 0x9b39f9e0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0xe730164ea45002c1 - Init COMPLETE
630
+ dlc1w85u6fy2xvlb-worker-0:76:316 [3] NCCL INFO comm 0x9b7bdc70 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0xe730164ea45002c1 - Init COMPLETE
631
+ dlc1w85u6fy2xvlb-worker-0:80:320 [7] NCCL INFO comm 0x9bccd080 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0xe730164ea45002c1 - Init COMPLETE
632
+ dlc1w85u6fy2xvlb-worker-0:77:319 [4] NCCL INFO comm 0x9b9e2970 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0xe730164ea45002c1 - Init COMPLETE
633
+ dlc1w85u6fy2xvlb-worker-0:73:323 [0] NCCL INFO comm 0x9aabc5f0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0xe730164ea45002c1 - Init COMPLETE
634
+ dlc1w85u6fy2xvlb-worker-0:75:317 [2] NCCL INFO comm 0x9bcdb060 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0xe730164ea45002c1 - Init COMPLETE
635
+ dlc1w85u6fy2xvlb-worker-0:79:322 [6] NCCL INFO comm 0x9aa4c060 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0xe730164ea45002c1 - Init COMPLETE
636
+
637
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
638
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
639
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
640
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
641
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
642
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
643
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
644
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
645
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.69it/s]
646
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.43it/s]
647
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.53it/s]
648
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.60it/s]
649
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.43it/s]
650
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.50it/s]
651
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 5.10it/s]
652
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.90it/s]
653
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.94it/s]
654
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.32it/s]
655
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.40it/s]
656
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.40it/s]
657
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.35it/s]
658
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.30it/s]
659
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.32it/s]
660
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.84it/s]
661
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.21it/s]
662
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.19it/s]
663
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.18it/s]
664
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.19it/s]
665
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.19it/s]
666
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.18it/s]
667
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.20it/s]
668
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:02<00:04, 1.18it/s]
669
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
670
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
671
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
672
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
673
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
674
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
675
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
676
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
677
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
678
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
679
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
680
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
681
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
682
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
683
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.11s/it]
684
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
685
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.16s/it]
686
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.17s/it]
687
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.17s/it]
688
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.17s/it]
689
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.17s/it]
690
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.17s/it]
691
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.16s/it]
692
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.16s/it]
693
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.12s/it]
694
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.13s/it]
695
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.13s/it]
696
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.13s/it]
697
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.13s/it]
698
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.13s/it]
699
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.12s/it]/fs-computility/mllm1/shared/hub/
700
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:06<00:01, 1.12s/it]
701
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.10it/s]
702
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
703
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,453 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
704
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
705
+
706
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
707
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.08it/s]
708
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,453 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
709
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
710
+
711
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
712
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.08it/s]
713
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,455 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
714
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
715
+
716
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
717
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
718
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.08it/s]
719
+
720
+ Loading checkpoint shards: 100%|█████���████| 8/8 [00:07<00:00, 1.08it/s]
721
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,456 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
722
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
723
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,456 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
724
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
725
+
726
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
727
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.08it/s]
728
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,458 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
729
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
730
+ /fs-computility/mllm1/shared/hub/
731
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.10it/s]
732
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
733
+ [INFO|modeling_utils.py:4350] 2025-01-16 21:56:38,462 >> All model checkpoint weights were used when initializing LlavaInternlm2ForCausalLM.
734
+
735
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,462 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
736
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
737
+
738
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.09it/s]
739
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.08it/s]
740
+ [WARNING|modeling_utils.py:4352] 2025-01-16 21:56:38,471 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
741
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
742
+ [INFO|configuration_utils.py:779] 2025-01-16 21:56:38,530 >> loading configuration file models/internlm/internlm2_5-7b-chat/generation_config.json
743
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
744
+ using cache dir None
745
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
746
+ using cache dir None
747
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
748
+ using cache dir None
749
+ [INFO|configuration_utils.py:826] 2025-01-16 21:56:38,530 >> Generate config GenerationConfig {
750
+ "bos_token_id": 1,/fs-computility/mllm1/shared/hub/
751
+ "eos_token_id": [
752
+ 2,/fs-computility/mllm1/shared/hub/
753
+ 92542
754
+ ],
755
+ "pad_token_id": 2/fs-computility/mllm1/shared/hub/
756
+ }
757
+
758
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
759
+ using cache dir None
760
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
761
+ using cache dir None
762
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
763
+ using cache dir None
764
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
765
+ using cache dir None
766
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
767
+ using cache dir None
768
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 21:56:38,548 >> loading file ./tokenizer.model
769
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 21:56:38,548 >> loading file added_tokens.json
770
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 21:56:38,548 >> loading file special_tokens_map.json
771
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 21:56:38,548 >> loading file tokenizer_config.json
772
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 21:56:38,548 >> loading file tokenizer.json
773
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
774
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
775
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
776
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
777
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
778
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
779
+ 01/16/2025 21:56:38 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
780
+ 01/16/2025 21:56:38 - INFO - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
781
+ [INFO|image_processing_utils.py:373] 2025-01-16 21:56:38,791 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/preprocessor_config.json
782
+ [INFO|image_processing_utils.py:738] 2025-01-16 21:56:38,791 >> size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 336. Converted to {'shortest_edge': 336}.
783
+ [INFO|image_processing_utils.py:738] 2025-01-16 21:56:38,791 >> crop_size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 336. Converted to {'height': 336, 'width': 336}.
784
+ [INFO|image_processing_utils.py:425] 2025-01-16 21:56:38,791 >> Image processor CLIPImageProcessor {
785
+ "crop_size": {
786
+ "height": 336,
787
+ "width": 336
788
+ },
789
+ "do_center_crop": true,
790
+ "do_convert_rgb": true,
791
+ "do_normalize": true,
792
+ "do_rescale": true,
793
+ "do_resize": true,
794
+ "image_mean": [
795
+ 0.48145466,
796
+ 0.4578275,
797
+ 0.40821073
798
+ ],
799
+ "image_processor_type": "CLIPImageProcessor",
800
+ "image_std": [
801
+ 0.26862954,
802
+ 0.26130258,
803
+ 0.27577711
804
+ ],
805
+ "resample": 3,
806
+ "rescale_factor": 0.00392156862745098,
807
+ "size": {
808
+ "shortest_edge": 336
809
+ }
810
+ }
811
+
812
+ [INFO|configuration_utils.py:727] 2025-01-16 21:56:38,798 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/config.json
813
+ [INFO|configuration_utils.py:792] 2025-01-16 21:56:38,798 >> Model config CLIPVisionConfig {
814
+ "attention_dropout": 0.0,
815
+ "dropout": 0.0,
816
+ "hidden_act": "quick_gelu",
817
+ "hidden_size": 1024,
818
+ "image_size": 336,
819
+ "initializer_factor": 1.0,
820
+ "initializer_range": 0.02,
821
+ "intermediate_size": 4096,
822
+ "layer_norm_eps": 1e-05,
823
+ "model_type": "clip_vision_model",
824
+ "num_attention_heads": 16,
825
+ "num_channels": 3,
826
+ "num_hidden_layers": 24,
827
+ "patch_size": 14,
828
+ "projection_dim": 768,
829
+ "transformers_version": "4.37.2"
830
+ }
831
+
832
+ [INFO|modeling_utils.py:3473] 2025-01-16 21:56:38,799 >> loading weights file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin
833
+ [INFO|modeling_utils.py:3582] 2025-01-16 21:56:42,256 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
834
+ [INFO|modeling_utils.py:4340] 2025-01-16 21:56:43,384 >> Some weights of the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1 were not used when initializing CLIPVisionModel: ['logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.position_ids', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 
'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 
'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 
'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 
'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_projection.weight', 'visual_projection.weight']
835
+ - This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
836
+ - This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
837
+ [INFO|modeling_utils.py:4358] 2025-01-16 21:56:43,384 >> All the weights of CLIPVisionModel were initialized from the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1.
838
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.
839
+ 01/16/2025 21:57:01 - INFO - llava.train.train - Add dataset: llava-next-sft-notext with length: 738601, data type: normal, seed: 0
840
+ 01/16/2025 21:57:04 - INFO - llava.train.train - Add dataset: knowledge_gqa9k_art1500_cc3m30k with length: 40813, data type: know, seed: 1
841
+ 01/16/2025 21:57:07 - INFO - llava.train.train - Add dataset: Inferencial_flickr7k_cc3m30k_polished_md with length: 37117, data type: inf_polishmd, seed: 2
842
+ 01/16/2025 21:57:10 - INFO - llava.train.train - Add dataset: Detail_flickr7k_cc3m28k with length: 35313, data type: detail, seed: 3
843
+ 01/16/2025 21:57:14 - INFO - llava.train.train - Add dataset: Knowledge_instruct40k with length: 40218, data type: know_ins, seed: 4
844
+ 01/16/2025 21:57:17 - INFO - llava.train.train - Add dataset: Creation10k_fixed with length: 9698, data type: creation, seed: 5
845
+ 01/16/2025 21:57:21 - INFO - llava.train.train - Add dataset: Chartqa_generate_11k_gpt_qwen_merge with length: 11160, data type: chart, seed: 6
846
+ 01/16/2025 21:57:24 - INFO - llava.train.train - Add dataset: Tqa_detail_qwengenerate_multi8k_gpt with length: 8391, data type: tqa, seed: 7
847
+ 01/16/2025 21:57:27 - INFO - llava.train.train - Add dataset: Infovqa_single_gpt with length: 23068, data type: info, seed: 8
848
+ [INFO|trainer.py:571] 2025-01-16 21:57:27,893 >> Using auto half precision backend
849
+ [INFO|trainer.py:1721] 2025-01-16 21:58:16,446 >> ***** Running training *****
850
+ [INFO|trainer.py:1722] 2025-01-16 21:58:16,446 >> Num examples = 944,379
851
+ [INFO|trainer.py:1723] 2025-01-16 21:58:16,446 >> Num Epochs = 1
852
+ [INFO|trainer.py:1724] 2025-01-16 21:58:16,446 >> Instantaneous batch size per device = 4
853
+ [INFO|trainer.py:1727] 2025-01-16 21:58:16,446 >> Total train batch size (w. parallel, distributed & accumulation) = 128
854
+ [INFO|trainer.py:1728] 2025-01-16 21:58:16,447 >> Gradient Accumulation steps = 2
855
+ [INFO|trainer.py:1729] 2025-01-16 21:58:16,447 >> Total optimization steps = 7,378
856
+ [INFO|trainer.py:1730] 2025-01-16 21:58:16,448 >> Number of trainable parameters = 8,441,260,032
training_log_20250116_222124.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_log_20250116_222338.txt ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-01-16 22:23:44,292] torch.distributed.run: [WARNING]
2
+ [2025-01-16 22:23:44,292] torch.distributed.run: [WARNING] *****************************************
3
+ [2025-01-16 22:23:44,292] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ [2025-01-16 22:23:44,292] torch.distributed.run: [WARNING] *****************************************
5
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
6
+ warnings.warn(
7
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
8
+ warnings.warn(
9
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
10
+ warnings.warn(
11
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
12
+ warnings.warn(
13
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
14
+ warnings.warn(
15
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
16
+ warnings.warn(
17
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
18
+ warnings.warn(
19
+ /cpfs02/user/zhaoxiangyu/miniconda3/envs/llava/lib/python3.10/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
20
+ warnings.warn(
21
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
22
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
25
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
27
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
28
+ [2025-01-16 22:23:58,303] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
29
+ df: df: df: df: df: /root/.triton/autotune/root/.triton/autotune/root/.triton/autotune/root/.triton/autotune/root/.triton/autotunedf: /root/.triton/autotune: 没有那个文件或目录
30
+ : 没有那个文件或目录
31
+ : 没有那个文件或目录: 没有那个文件或目录
32
+
33
+ : 没有那个文件或目录
34
+ : 没有那个文件或目录
35
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
36
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
37
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
38
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
39
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
40
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
41
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
42
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
43
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
44
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
45
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
46
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
47
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
48
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
49
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
50
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
51
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
52
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
53
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
54
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
55
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
56
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
57
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
58
+  [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
59
+ [2025-01-16 22:24:14,156] [INFO] [comm.py:637:init_distributed] cdb=None
60
+ [2025-01-16 22:24:14,156] [INFO] [comm.py:637:init_distributed] cdb=None
61
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
62
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
63
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
64
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
65
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
66
+ [2025-01-16 22:24:14,157] [INFO] [comm.py:637:init_distributed] cdb=None
67
+ 01/16/2025 22:24:14 - WARNING - llava.train.train - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False
68
+ 01/16/2025 22:24:14 - INFO - llava.train.train - Training/evaluation parameters TrainingArguments(
69
+ _n_gpu=1,
70
+ adafactor=False,
71
+ adam_beta1=0.9,
72
+ adam_beta2=0.999,
73
+ adam_epsilon=1e-08,
74
+ auto_find_batch_size=False,
75
+ bf16=True,
76
+ bf16_full_eval=False,
77
+ bits=16,
78
+ cache_dir=None,
79
+ data_seed=None,
80
+ dataloader_drop_last=False,
81
+ dataloader_num_workers=4,
82
+ dataloader_persistent_workers=False,
83
+ dataloader_pin_memory=True,
84
+ ddp_backend=None,
85
+ ddp_broadcast_buffers=None,
86
+ ddp_bucket_cap_mb=None,
87
+ ddp_find_unused_parameters=None,
88
+ ddp_timeout=1800,
89
+ debug=[],
90
+ deepspeed=./scripts/zero3.json,
91
+ disable_tqdm=False,
92
+ dispatch_batches=None,
93
+ do_eval=False,
94
+ do_predict=False,
95
+ do_train=False,
96
+ double_quant=True,
97
+ eval_accumulation_steps=None,
98
+ eval_delay=0,
99
+ eval_steps=None,
100
+ evaluation_strategy=no,
101
+ fp16=False,
102
+ fp16_backend=auto,
103
+ fp16_full_eval=False,
104
+ fp16_opt_level=O1,
105
+ freeze_mm_mlp_adapter=False,
106
+ fsdp=[],
107
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
108
+ fsdp_min_num_params=0,
109
+ fsdp_transformer_layer_cls_to_wrap=None,
110
+ full_determinism=False,
111
+ gradient_accumulation_steps=2,
112
+ gradient_checkpointing=True,
113
+ gradient_checkpointing_kwargs=None,
114
+ greater_is_better=None,
115
+ group_by_length=False,
116
+ group_by_modality_length=True,
117
+ half_precision_backend=auto,
118
+ hub_always_push=False,
119
+ hub_model_id=None,
120
+ hub_private_repo=False,
121
+ hub_strategy=every_save,
122
+ hub_token=<HUB_TOKEN>,
123
+ ignore_data_skip=False,
124
+ include_inputs_for_metrics=False,
125
+ include_num_input_tokens_seen=False,
126
+ include_tokens_per_second=False,
127
+ jit_mode_eval=False,
128
+ label_names=None,
129
+ label_smoothing_factor=0.0,
130
+ learning_rate=2e-05,
131
+ length_column_name=length,
132
+ load_best_model_at_end=False,
133
+ local_rank=0,
134
+ log_level=passive,
135
+ log_level_replica=warning,
136
+ log_on_each_node=True,
137
+ logging_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt/runs/Jan16_22-24-14_dlc1abaccnl2nzws-worker-0,
138
+ logging_first_step=False,
139
+ logging_nan_inf_filter=True,
140
+ logging_steps=1.0,
141
+ logging_strategy=steps,
142
+ lora_alpha=16,
143
+ lora_bias=none,
144
+ lora_dropout=0.05,
145
+ lora_enable=False,
146
+ lora_r=64,
147
+ lora_weight_path=,
148
+ lr_scheduler_kwargs={},
149
+ lr_scheduler_type=cosine,
150
+ max_grad_norm=1.0,
151
+ max_steps=-1,
152
+ metric_for_best_model=None,
153
+ mm_projector_lr=None,
154
+ mm_vision_tower_lr=2e-06,
155
+ model_max_length=32768,
156
+ mp_parameters=,
157
+ mpt_attn_impl=triton,
158
+ neftune_noise_alpha=None,
159
+ no_cuda=False,
160
+ num_train_epochs=1.0,
161
+ optim=adamw_torch,
162
+ optim_args=None,
163
+ output_dir=./checkpoints/llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt,
164
+ overwrite_output_dir=False,
165
+ past_index=-1,
166
+ per_device_eval_batch_size=4,
167
+ per_device_train_batch_size=4,
168
+ prediction_loss_only=False,
169
+ push_to_hub=False,
170
+ push_to_hub_model_id=None,
171
+ push_to_hub_organization=None,
172
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
173
+ quant_type=nf4,
174
+ ray_scope=last,
175
+ remove_unused_columns=False,
176
+ report_to=['wandb'],
177
+ resume_from_checkpoint=None,
178
+ run_name=llavaAR4-internlm2_5-7b-sft-llavanext-notext-kn-infpolishmd-detail-knins40k-creationme10kfixed-chart11kmerge-tqa8k-info28kgpt,
179
+ save_on_each_node=False,
180
+ save_only_model=False,
181
+ save_safetensors=True,
182
+ save_steps=10000,
183
+ save_strategy=steps,
184
+ save_total_limit=1,
185
+ seed=42,
186
+ skip_memory_metrics=True,
187
+ split_batches=False,
188
+ tf32=True,
189
+ torch_compile=False,
190
+ torch_compile_backend=None,
191
+ torch_compile_mode=None,
192
+ torchdynamo=None,
193
+ tpu_metrics_debug=False,
194
+ tpu_num_cores=None,
195
+ use_cpu=False,
196
+ use_ipex=False,
197
+ use_legacy_prediction_loop=False,
198
+ use_mps_device=False,
199
+ warmup_ratio=0.03,
200
+ warmup_steps=0,
201
+ weight_decay=0.0,
202
+ )
203
+ 01/16/2025 22:24:14 - INFO - llava.train.train - Training/evaluation parameters DataArguments(data_path=None, meta_path='playground/meta_json/llavanext_sample/llava_next_notext_inf37kpolishmd_de35k_know40k_knins40k_creation10kfixed_chart11kmerge_tqa8k_info28k_gpt.json', lazy_preprocess=True, is_multimodal=False, image_folder=None, image_aspect_ratio='anyres', image_grid_pinpoints='[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]', image_crop_resolution=None, image_split_resolution=None, use_data_resampling=False)
204
+ [INFO|configuration_utils.py:727] 2025-01-16 22:24:14,198 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json
205
+ [INFO|configuration_utils.py:727] 2025-01-16 22:24:14,220 >> loading configuration file models/internlm/internlm2_5-7b-chat/config.json
206
+ [INFO|configuration_utils.py:792] 2025-01-16 22:24:14,221 >> Model config InternLM2Config {
207
+ "_name_or_path": "models/internlm/internlm2_5-7b-chat",
208
+ "architectures": [
209
+ "InternLM2ForCausalLM"
210
+ ],
211
+ "attn_implementation": "eager",
212
+ "auto_map": {
213
+ "AutoConfig": "configuration_internlm2.InternLM2Config",
214
+ "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
215
+ "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
216
+ },
217
+ "bias": false,
218
+ "bos_token_id": 1,
219
+ "eos_token_id": 2,
220
+ "hidden_act": "silu",
221
+ "hidden_size": 4096,
222
+ "initializer_range": 0.02,
223
+ "intermediate_size": 14336,
224
+ "max_position_embeddings": 32768,
225
+ "model_type": "internlm2",
226
+ "num_attention_heads": 32,
227
+ "num_hidden_layers": 32,
228
+ "num_key_value_heads": 8,
229
+ "pad_token_id": 2,
230
+ "pretraining_tp": 1,
231
+ "rms_norm_eps": 1e-05,
232
+ "rope_scaling": {
233
+ "factor": 2.0,
234
+ "type": "dynamic"
235
+ },
236
+ "rope_theta": 1000000,
237
+ "tie_word_embeddings": false,
238
+ "torch_dtype": "bfloat16",
239
+ "transformers_version": "4.37.2",
240
+ "use_cache": true,
241
+ "vocab_size": 92544
242
+ }
243
+
244
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:14,225 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
245
+ [INFO|modeling_utils.py:3473] 2025-01-16 22:24:14,230 >> loading weights file models/internlm/internlm2_5-7b-chat/model.safetensors.index.json
246
+ [INFO|modeling_utils.py:1426] 2025-01-16 22:24:14,232 >> Instantiating LlavaInternlm2ForCausalLM model under default dtype torch.bfloat16.
247
+ [INFO|modeling_utils.py:3582] 2025-01-16 22:24:14,232 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
248
+ [INFO|configuration_utils.py:826] 2025-01-16 22:24:14,238 >> Generate config GenerationConfig {
249
+ "bos_token_id": 1,
250
+ "eos_token_id": 2,
251
+ "pad_token_id": 2
252
+ }
253
+
254
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 5, device: cuda:5, n_gpu: 1distributed training: True, 16-bits training: False
255
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,509 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
256
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 3, device: cuda:3, n_gpu: 1distributed training: True, 16-bits training: False
257
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 6, device: cuda:6, n_gpu: 1distributed training: True, 16-bits training: False
258
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False
259
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 4, device: cuda:4, n_gpu: 1distributed training: True, 16-bits training: False
260
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False
261
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,539 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
262
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,541 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
263
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,545 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
264
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,549 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
265
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,553 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
266
+ 01/16/2025 22:24:16 - WARNING - llava.train.train - Process rank: 7, device: cuda:7, n_gpu: 1distributed training: True, 16-bits training: False
267
+ [WARNING|modeling_utils.py:2918] 2025-01-16 22:24:16,603 >> The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
268
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO cudaDriverVersion 12010
269
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO cudaDriverVersion 12010
270
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
271
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
272
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
273
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
274
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO Plugin name set by env to libnccl-net-none.so
275
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO Plugin name set by env to libnccl-net-none.so
276
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
277
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
278
+ dlc1abaccnl2nzws-worker-0:78:78 [6] NCCL INFO NET/Plugin : No plugin found, using internal implementation
279
+ dlc1abaccnl2nzws-worker-0:76:76 [4] NCCL INFO NET/Plugin : No plugin found, using internal implementation
280
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO cudaDriverVersion 12010
281
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
282
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
283
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
284
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
285
+ dlc1abaccnl2nzws-worker-0:75:75 [3] NCCL INFO NET/Plugin : No plugin found, using internal implementation
286
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO cudaDriverVersion 12010
287
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
288
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
289
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO Plugin name set by env to libnccl-net-none.so
290
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
291
+ dlc1abaccnl2nzws-worker-0:77:77 [5] NCCL INFO NET/Plugin : No plugin found, using internal implementation
292
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO cudaDriverVersion 12010
293
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
294
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
295
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO Plugin name set by env to libnccl-net-none.so
296
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
297
+ dlc1abaccnl2nzws-worker-0:79:79 [7] NCCL INFO NET/Plugin : No plugin found, using internal implementation
298
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO cudaDriverVersion 12010
299
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
300
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
301
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
302
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
303
+ dlc1abaccnl2nzws-worker-0:74:74 [2] NCCL INFO NET/Plugin : No plugin found, using internal implementation
304
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO cudaDriverVersion 12010
305
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
306
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
307
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
308
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
309
+ dlc1abaccnl2nzws-worker-0:73:73 [1] NCCL INFO NET/Plugin : No plugin found, using internal implementation
310
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
311
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
312
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO NCCL_IB_HCA set to mlx5
313
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO NCCL_IB_HCA set to mlx5
314
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO cudaDriverVersion 12010
315
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
316
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO Bootstrap : Using eth0:22.8.38.24<0>
317
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
318
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net-none.so) returned 2 : libnccl-net-none.so: cannot open shared object file: No such file or directory
319
+ dlc1abaccnl2nzws-worker-0:72:72 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
320
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
321
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO NCCL_IB_HCA set to mlx5
322
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
323
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO NCCL_IB_HCA set to mlx5
324
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
325
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO NCCL_IB_HCA set to mlx5
326
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
327
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO NCCL_IB_HCA set to mlx5
328
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
329
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO NCCL_IB_HCA set to mlx5
330
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
331
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO NCCL_IB_HCA set to mlx5
332
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
333
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Using network IB
334
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
335
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Using network IB
336
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
337
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Using network IB
338
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
339
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Using network IB
340
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
341
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Using network IB
342
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
343
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Using network IB
344
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
345
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Using network IB
346
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [3]mlx5_3:1/RoCE [RO]; OOB eth0:22.8.38.24<0>
347
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Using network IB
348
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO comm 0x9afcfd70 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0x13a7e6351c9956c9 - Init START
349
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO comm 0x9baa4e10 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0x13a7e6351c9956c9 - Init START
350
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO comm 0x9a052210 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0x13a7e6351c9956c9 - Init START
351
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO comm 0x99f6a330 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0x13a7e6351c9956c9 - Init START
352
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO comm 0x9b2bc4b0 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0x13a7e6351c9956c9 - Init START
353
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO comm 0x9a987d60 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0x13a7e6351c9956c9 - Init START
354
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO comm 0x9a914480 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0x13a7e6351c9956c9 - Init START
355
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO comm 0x9b8645b0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0x13a7e6351c9956c9 - Init START
356
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO NVLS multicast support is not available on dev 5
357
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO NVLS multicast support is not available on dev 7
358
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO NVLS multicast support is not available on dev 4
359
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Setting affinity for GPU 3 to ffffffff,ffffffff,ffffffff
360
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO NVLS multicast support is not available on dev 3
361
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO NVLS multicast support is not available on dev 6
362
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Setting affinity for GPU 2 to ffffffff,ffffffff,ffffffff
363
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO NVLS multicast support is not available on dev 2
364
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Setting affinity for GPU 0 to ffffffff,ffffffff,ffffffff
365
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO NVLS multicast support is not available on dev 0
366
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Setting affinity for GPU 1 to ffffffff,ffffffff,ffffffff
367
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO NVLS multicast support is not available on dev 1
368
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
369
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
370
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] 9/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/0/-1->8->-1 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15
371
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
372
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] -1/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->14
373
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO P2P Chunksize set to 131072
374
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO P2P Chunksize set to 131072
375
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
376
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->6 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->13 [7] 15/6/-1->14->-1
377
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
378
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO P2P Chunksize set to 131072
379
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
380
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] -1/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->12 [6] 14/-1/-1->13->12 [7] -1/-1/-1->13->12
381
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO P2P Chunksize set to 131072
382
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] -1/-1/-1->9->8 [2] 10/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] -1/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8
383
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->2 [2] 11/-1/-1->10->9 [3] 11/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/2/-1->10->-1 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9
384
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
385
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO P2P Chunksize set to 131072
386
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO P2P Chunksize set to 131072
387
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
388
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] -1/-1/-1->11->10 [3] 12/-1/-1->11->10 [4] 12/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] -1/-1/-1->11->10 [7] 12/-1/-1->11->10
389
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO P2P Chunksize set to 131072
390
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->4 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->11 [5] 13/-1/-1->12->11 [6] 13/4/-1->12->-1 [7] 13/-1/-1->12->11
391
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO P2P Chunksize set to 131072
392
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/IPC/read
393
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 01/0 : 10[2] -> 15[7] via P2P/IPC/read
394
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 03/0 : 12[4] -> 15[7] via P2P/IPC/read
395
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 00/0 : 9[1] -> 0[0] [send] via NET/IB/0/GDRDMA
396
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 03/0 : 7[7] -> 14[6] [receive] via NET/IB/3/GDRDMA
397
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 07/0 : 7[7] -> 14[6] [receive] via NET/IB/3/GDRDMA
398
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 04/0 : 9[1] -> 0[0] [send] via NET/IB/0/GDRDMA
399
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/IPC/read
400
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 05/0 : 10[2] -> 15[7] via P2P/IPC/read
401
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 07/0 : 12[4] -> 15[7] via P2P/IPC/read
402
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 03/0 : 8[0] -> 13[5] via P2P/IPC/read
403
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 07/0 : 8[0] -> 13[5] via P2P/IPC/read
404
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 2[2] [send] via NET/IB/1/GDRDMA
405
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 2[2] [send] via NET/IB/1/GDRDMA
406
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 02/0 : 5[5] -> 12[4] [receive] via NET/IB/2/GDRDMA
407
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 06/0 : 5[5] -> 12[4] [receive] via NET/IB/2/GDRDMA
408
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 00/0 : 1[1] -> 8[0] [receive] via NET/IB/0/GDRDMA
409
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 04/0 : 1[1] -> 8[0] [receive] via NET/IB/0/GDRDMA
410
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 00/0 : 8[0] -> 15[7] via P2P/IPC/read
411
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/IPC/read
412
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 02/0 : 13[5] -> 4[4] [send] via NET/IB/2/GDRDMA
413
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 06/0 : 13[5] -> 4[4] [send] via NET/IB/2/GDRDMA
414
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/IPC/read
415
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 01/0 : 12[4] -> 9[1] via P2P/IPC/read
416
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 03/0 : 15[7] -> 6[6] [send] via NET/IB/3/GDRDMA
417
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 07/0 : 15[7] -> 6[6] [send] via NET/IB/3/GDRDMA
418
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/IPC/read
419
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 03/0 : 14[6] -> 11[3] via P2P/IPC/read
420
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 01/0 : 3[3] -> 10[2] [receive] via NET/IB/1/GDRDMA
421
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 05/0 : 3[3] -> 10[2] [receive] via NET/IB/1/GDRDMA
422
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 05/0 : 12[4] -> 9[1] via P2P/IPC/read
423
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/IPC/read
424
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 07/0 : 14[6] -> 11[3] via P2P/IPC/read
425
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/IPC/read
426
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/IPC/read
427
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/IPC/read
428
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/IPC/read
429
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/IPC/read
430
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/IPC/read
431
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/IPC/read
432
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/IPC/read
433
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/IPC/read
434
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/IPC/read
435
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/IPC/read
436
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/IPC/read
437
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/IPC/read
438
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/IPC/read
439
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 05/0 : 13[5] -> 12[4] via P2P/IPC/read
440
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 03/0 : 11[3] -> 10[2] via P2P/IPC/read
441
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/IPC/read
442
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/IPC/read
443
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/IPC/read
444
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/IPC/read
445
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/IPC/read
446
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/IPC/read
447
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/IPC/read
448
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/IPC/read
449
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/IPC/read
450
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/IPC/read
451
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/IPC/read
452
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/IPC/read
453
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/IPC/read
454
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/IPC/read
455
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/IPC/read
456
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/IPC/read
457
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 04/0 : 12[4] -> 11[3] via P2P/IPC/read
458
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 06/0 : 14[6] -> 13[5] via P2P/IPC/read
459
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/IPC/read
460
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/IPC/read
461
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/IPC/read
462
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/IPC/read
463
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/IPC/read
464
+ dlc1abaccnl2nzws-worker-0:77:357 [5] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
465
+ dlc1abaccnl2nzws-worker-0:75:360 [3] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
466
+ dlc1abaccnl2nzws-worker-0:73:359 [1] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
467
+ dlc1abaccnl2nzws-worker-0:77:357 [5] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
468
+ dlc1abaccnl2nzws-worker-0:75:360 [3] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
469
+ dlc1abaccnl2nzws-worker-0:79:355 [7] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
470
+ dlc1abaccnl2nzws-worker-0:74:358 [2] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
471
+ dlc1abaccnl2nzws-worker-0:73:359 [1] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
472
+ dlc1abaccnl2nzws-worker-0:74:358 [2] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
473
+ dlc1abaccnl2nzws-worker-0:76:361 [4] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
474
+ dlc1abaccnl2nzws-worker-0:76:361 [4] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
475
+ dlc1abaccnl2nzws-worker-0:74:358 [2] NCCL INFO NCCL_IB_TC set by environment to 136.
476
+ dlc1abaccnl2nzws-worker-0:74:358 [2] NCCL INFO NCCL_IB_SL set by environment to 5.
477
+ dlc1abaccnl2nzws-worker-0:74:358 [2] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
478
+ dlc1abaccnl2nzws-worker-0:79:355 [7] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
479
+ dlc1abaccnl2nzws-worker-0:76:361 [4] NCCL INFO NCCL_IB_TC set by environment to 136.
480
+ dlc1abaccnl2nzws-worker-0:76:361 [4] NCCL INFO NCCL_IB_SL set by environment to 5.
481
+ dlc1abaccnl2nzws-worker-0:76:361 [4] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
482
+ dlc1abaccnl2nzws-worker-0:75:360 [3] NCCL INFO NCCL_IB_TC set by environment to 136.
483
+ dlc1abaccnl2nzws-worker-0:75:360 [3] NCCL INFO NCCL_IB_SL set by environment to 5.
484
+ dlc1abaccnl2nzws-worker-0:75:360 [3] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
485
+ dlc1abaccnl2nzws-worker-0:77:357 [5] NCCL INFO NCCL_IB_TC set by environment to 136.
486
+ dlc1abaccnl2nzws-worker-0:77:357 [5] NCCL INFO NCCL_IB_SL set by environment to 5.
487
+ dlc1abaccnl2nzws-worker-0:77:357 [5] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
488
+ dlc1abaccnl2nzws-worker-0:72:354 [0] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
489
+ dlc1abaccnl2nzws-worker-0:72:354 [0] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
490
+ dlc1abaccnl2nzws-worker-0:78:356 [6] NCCL INFO NCCL_IB_GID_INDEX set by environment to 3.
491
+ dlc1abaccnl2nzws-worker-0:78:356 [6] NCCL INFO NCCL_IB_QPS_PER_CONNECTION set by environment to 8.
492
+ dlc1abaccnl2nzws-worker-0:79:355 [7] NCCL INFO NCCL_IB_TC set by environment to 136.
493
+ dlc1abaccnl2nzws-worker-0:79:355 [7] NCCL INFO NCCL_IB_SL set by environment to 5.
494
+ dlc1abaccnl2nzws-worker-0:72:354 [0] NCCL INFO NCCL_IB_TC set by environment to 136.
495
+ dlc1abaccnl2nzws-worker-0:72:354 [0] NCCL INFO NCCL_IB_SL set by environment to 5.
496
+ dlc1abaccnl2nzws-worker-0:72:354 [0] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
497
+ dlc1abaccnl2nzws-worker-0:79:355 [7] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
498
+ dlc1abaccnl2nzws-worker-0:73:359 [1] NCCL INFO NCCL_IB_TC set by environment to 136.
499
+ dlc1abaccnl2nzws-worker-0:73:359 [1] NCCL INFO NCCL_IB_SL set by environment to 5.
500
+ dlc1abaccnl2nzws-worker-0:73:359 [1] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
501
+ dlc1abaccnl2nzws-worker-0:78:356 [6] NCCL INFO NCCL_IB_TC set by environment to 136.
502
+ dlc1abaccnl2nzws-worker-0:78:356 [6] NCCL INFO NCCL_IB_SL set by environment to 5.
503
+ dlc1abaccnl2nzws-worker-0:78:356 [6] NCCL INFO NCCL_IB_TIMEOUT set by environment to 22.
504
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Connected all rings
505
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Connected all rings
506
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Connected all rings
507
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Connected all rings
508
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/IPC/read
509
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/IPC/read
510
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/IPC/read
511
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/IPC/read
512
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Connected all rings
513
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/IPC/read
514
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/IPC/read
515
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Connected all rings
516
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/IPC/read
517
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Connected all rings
518
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Connected all rings
519
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/IPC/read
520
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/IPC/read
521
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/IPC/read
522
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 01/0 : 8[0] -> 9[1] via P2P/IPC/read
523
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/IPC/read
524
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/IPC/read
525
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/IPC/read
526
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/IPC/read
527
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/IPC/read
528
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/IPC/read
529
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/IPC/read
530
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/IPC/read
531
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/IPC/read
532
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/IPC/read
533
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/IPC/read
534
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/IPC/read
535
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/IPC/read
536
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/IPC/read
537
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/IPC/read
538
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/IPC/read
539
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/IPC/read
540
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/IPC/read
541
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/IPC/read
542
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/IPC/read
543
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/IPC/read
544
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 03/0 : 10[2] -> 11[3] via P2P/IPC/read
545
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/IPC/read
546
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/IPC/read
547
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/IPC/read
548
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/IPC/read
549
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/IPC/read
550
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/IPC/read
551
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/IPC/read
552
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/IPC/read
553
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/IPC/read
554
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/IPC/read
555
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/IPC/read
556
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/IPC/read
557
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 05/0 : 12[4] -> 13[5] via P2P/IPC/read
558
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/IPC/read
559
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/IPC/read
560
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/IPC/read
561
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/IPC/read
562
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/IPC/read
563
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/IPC/read
564
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/IPC/read
565
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/1/GDRDMA
566
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/IPC/read
567
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/1/GDRDMA
568
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/1/GDRDMA
569
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/1/GDRDMA
570
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 07/0 : 14[6] -> 15[7] via P2P/IPC/read
571
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/IPC/read
572
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/2/GDRDMA
573
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/2/GDRDMA
574
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/2/GDRDMA
575
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/2/GDRDMA
576
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/IPC/read
577
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/3/GDRDMA
578
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/3/GDRDMA
579
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/3/GDRDMA
580
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/3/GDRDMA
581
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/IPC/read
582
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/IPC/read
583
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/IPC/read
584
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/IPC/read
585
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
586
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA
587
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
588
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA
589
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/IPC/read
590
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/IPC/read
591
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/IPC/read
592
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/IPC/read
593
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/IPC/read
594
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/IPC/read
595
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/IPC/read
596
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/IPC/read
597
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/IPC/read
598
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/IPC/read
599
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Channel 07/0 : 15[7] -> 14[6] via P2P/IPC/read
600
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO Connected all trees
601
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
602
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
603
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO Connected all trees
604
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
605
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
606
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO Connected all trees
607
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
608
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
609
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO Connected all trees
610
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
611
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
612
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO Connected all trees
613
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
614
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
615
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO Connected all trees
616
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
617
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
618
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO Connected all trees
619
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
620
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
621
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO Connected all trees
622
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
623
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO 8 coll channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer
624
+ dlc1abaccnl2nzws-worker-0:78:314 [6] NCCL INFO comm 0x9a052210 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId 70 commId 0x13a7e6351c9956c9 - Init COMPLETE
625
+ dlc1abaccnl2nzws-worker-0:77:317 [5] NCCL INFO comm 0x99f6a330 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId 60 commId 0x13a7e6351c9956c9 - Init COMPLETE
626
+ dlc1abaccnl2nzws-worker-0:79:321 [7] NCCL INFO comm 0x9afcfd70 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId 80 commId 0x13a7e6351c9956c9 - Init COMPLETE
627
+ dlc1abaccnl2nzws-worker-0:73:324 [1] NCCL INFO comm 0x9a987d60 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 20 commId 0x13a7e6351c9956c9 - Init COMPLETE
628
+ dlc1abaccnl2nzws-worker-0:75:316 [3] NCCL INFO comm 0x9b8645b0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 40 commId 0x13a7e6351c9956c9 - Init COMPLETE
629
+ dlc1abaccnl2nzws-worker-0:74:323 [2] NCCL INFO comm 0x9b2bc4b0 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 30 commId 0x13a7e6351c9956c9 - Init COMPLETE
630
+ dlc1abaccnl2nzws-worker-0:76:315 [4] NCCL INFO comm 0x9a914480 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 50 commId 0x13a7e6351c9956c9 - Init COMPLETE
631
+ dlc1abaccnl2nzws-worker-0:72:326 [0] NCCL INFO comm 0x9baa4e10 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 10 commId 0x13a7e6351c9956c9 - Init COMPLETE
632
+
633
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
634
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
635
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
636
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
637
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
638
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
639
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
640
+ Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]
641
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.53it/s]
642
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.50it/s]
643
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.18it/s]
644
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.61it/s]
645
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.67it/s]
646
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.41it/s]
647
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.60it/s]
648
+ Loading checkpoint shards: 12%|█▎ | 1/8 [00:00<00:01, 4.57it/s]
649
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.54it/s]
650
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.40it/s]
651
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.49it/s]
652
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.51it/s]
653
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.25it/s]
654
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.37it/s]
655
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.48it/s]
656
+ Loading checkpoint shards: 25%|██▌ | 2/8 [00:00<00:01, 4.39it/s]
657
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.24it/s]
658
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.25it/s]
659
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.24it/s]
660
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.24it/s]
661
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.25it/s]
662
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.25it/s]
663
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.24it/s]
664
+ Loading checkpoint shards: 38%|███▊ | 3/8 [00:01<00:04, 1.24it/s]
665
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
666
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
667
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
668
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
669
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.07s/it]
670
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
671
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
672
+ Loading checkpoint shards: 50%|█████ | 4/8 [00:03<00:04, 1.06s/it]
673
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
674
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
675
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
676
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
677
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
678
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
679
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
680
+ Loading checkpoint shards: 62%|██████▎ | 5/8 [00:04<00:03, 1.12s/it]
681
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
682
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
683
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
684
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
685
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
686
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
687
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
688
+ Loading checkpoint shards: 75%|███████▌ | 6/8 [00:05<00:02, 1.18s/it]
689
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
690
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
691
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
692
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
693
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
694
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
695
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]
696
+ Loading checkpoint shards: 88%|████████▊ | 7/8 [00:07<00:01, 1.20s/it]/fs-computility/mllm1/shared/hub/
697
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
698
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
699
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,316 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
700
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
701
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
702
+ using cache dir None
703
+
704
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
705
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
706
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,325 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
707
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
708
+
709
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
710
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
711
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,326 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
712
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
713
+
714
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
715
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
716
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,327 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
717
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
718
+
719
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
720
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
721
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,328 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
722
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
723
+
724
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
725
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
726
+ [INFO|modeling_utils.py:4350] 2025-01-16 22:24:26,328 >> All model checkpoint weights were used when initializing LlavaInternlm2ForCausalLM.
727
+
728
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,328 >> Some weights of LlavaInternl/fs-computility/mllm1/shared/hub/ the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
729
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
730
+
731
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
732
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
733
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,329 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
734
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
735
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
736
+ using cache dir None
737
+
738
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.02it/s]
739
+ Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00, 1.04it/s]
740
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
741
+ using cache dir None
742
+ [WARNING|modeling_utils.py:4352] 2025-01-16 22:24:26,330 >> Some weights of LlavaInternlm2ForCausalLM were not initialized from the model checkpoint at models/internlm/internlm2_5-7b-chat and are newly initialized: ['lm_head.weight']
743
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
744
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
745
+ using cache dir None
746
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
747
+ using cache dir None
748
+ [INFO|configuration_utils.py:779] 2025-01-16 22:24:26,332 >> loading configura/fs-computility/mllm1/shared/hub/b-chat/generation_config.json
749
+ [INFO|configuration_utils.py:826] 2025-01-16 22:24:26,332 >> Generate config GenerationConfig {
750
+ "bos_token_id": 1,/fs-computility/mllm1/shared/hub/
751
+ "eos_token_id": [
752
+ 2,
753
+ 92542/fs-computility/mllm1/shared/hub/
754
+ ],
755
+ "pad_token_id": 2
756
+ }
757
+
758
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
759
+ using cache dir None
760
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
761
+ using cache dir None
762
+ Using tokenizer from models/internlm/internlm2_5-7b-chat
763
+ using cache dir None
764
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:26,343 >> loading file ./tokenizer.model
765
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:26,343 >> loading file added_tokens.json
766
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:26,344 >> loading file special_tokens_map.json
767
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:26,344 >> loading file tokenizer_config.json
768
+ [INFO|tokenization_utils_base.py:2025] 2025-01-16 22:24:26,344 >> loading file tokenizer.json
769
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
770
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
771
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
772
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
773
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
774
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
775
+ 01/16/2025 22:24:26 - INFO - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
776
+ [INFO|image_processing_utils.py:373] 2025-01-16 22:24:26,548 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/preprocessor_config.json
777
+ [INFO|image_processing_utils.py:738] 2025-01-16 22:24:26,548 >> size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 336. Converted to {'shortest_edge': 336}.
778
+ [INFO|image_processing_utils.py:738] 2025-01-16 22:24:26,548 >> crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 336. Converted to {'height': 336, 'width': 336}.
779
+ [INFO|image_processing_utils.py:425] 2025-01-16 22:24:26,548 >> Image processor CLIPImageProcessor {
780
+ "crop_size": {
781
+ "height": 336,
782
+ "width": 336
783
+ },
784
+ "do_center_crop": true,
785
+ "do_convert_rgb": true,
786
+ "do_normalize": true,
787
+ "do_rescale": true,
788
+ "do_resize": true,
789
+ "image_mean": [
790
+ 0.48145466,
791
+ 0.4578275,
792
+ 0.40821073
793
+ ],
794
+ "image_processor_type": "CLIPImageProcessor",
795
+ "image_std": [
796
+ 0.26862954,
797
+ 0.26130258,
798
+ 0.27577711
799
+ ],
800
+ "resample": 3,
801
+ "rescale_factor": 0.00392156862745098,
802
+ "size": {
803
+ "shortest_edge": 336
804
+ }
805
+ }
806
+
807
+ 01/16/2025 22:24:26 - WARNING - llava.train.train - Using conversation template: Conversation(system='<|im_start|>system\nYou are a helpful assistant. ', roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), messages=[], offset=0, sep_style=<SeparatorStyle.MPT: 3>, sep='<|im_end|>', sep2=None, version='internlm_v2', mm_system=None, skip_next=False)
808
+ [INFO|configuration_utils.py:727] 2025-01-16 22:24:26,554 >> loading configuration file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/config.json
809
+ [INFO|configuration_utils.py:792] 2025-01-16 22:24:26,555 >> Model config CLIPVisionConfig {
810
+ "attention_dropout": 0.0,
811
+ "dropout": 0.0,
812
+ "hidden_act": "quick_gelu",
813
+ "hidden_size": 1024,
814
+ "image_size": 336,
815
+ "initializer_factor": 1.0,
816
+ "initializer_range": 0.02,
817
+ "intermediate_size": 4096,
818
+ "layer_norm_eps": 1e-05,
819
+ "model_type": "clip_vision_model",
820
+ "num_attention_heads": 16,
821
+ "num_channels": 3,
822
+ "num_hidden_layers": 24,
823
+ "patch_size": 14,
824
+ "projection_dim": 768,
825
+ "transformers_version": "4.37.2"
826
+ }
827
+
828
+ [INFO|modeling_utils.py:3473] 2025-01-16 22:24:26,556 >> loading weights file /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1/pytorch_model.bin
829
+ [INFO|modeling_utils.py:3582] 2025-01-16 22:24:29,943 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
830
+ [INFO|modeling_utils.py:4340] 2025-01-16 22:24:31,911 >> Some weights of the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1 were not used when initializing CLIPVisionModel: ['logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.position_ids', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 
'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 
'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 
'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 
'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_projection.weight', 'visual_projection.weight']
831
+ - This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
832
+ - This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
833
+ [INFO|modeling_utils.py:4358] 2025-01-16 22:24:31,912 >> All the weights of CLIPVisionModel were initialized from the model checkpoint at /fs-computility/mllm1/shared/hub/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1.
834
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.
835
+ 01/16/2025 22:24:50 - INFO - llava.train.train - Add dataset: llava-next-sft-notext with length: 738601, data type: normal, seed: 0
836
+ 01/16/2025 22:24:53 - INFO - llava.train.train - Add dataset: knowledge_gqa9k_art1500_cc3m30k with length: 40813, data type: know, seed: 1
837
+ 01/16/2025 22:24:56 - INFO - llava.train.train - Add dataset: Inferencial_flickr7k_cc3m30k_polished_md with length: 37117, data type: inf_polishmd, seed: 2
838
+ 01/16/2025 22:24:59 - INFO - llava.train.train - Add dataset: Detail_flickr7k_cc3m28k with length: 35313, data type: detail, seed: 3
839
+ 01/16/2025 22:25:03 - INFO - llava.train.train - Add dataset: Knowledge_instruct40k with length: 40218, data type: know_ins, seed: 4
840
+ 01/16/2025 22:25:06 - INFO - llava.train.train - Add dataset: Creation10k_fixed with length: 9698, data type: creation, seed: 5
841
+ 01/16/2025 22:25:10 - INFO - llava.train.train - Add dataset: Chartqa_generate_11k_gpt_qwen_merge with length: 11160, data type: chart, seed: 6
842
+ 01/16/2025 22:25:13 - INFO - llava.train.train - Add dataset: Tqa_detail_qwengenerate_multi8k_gpt with length: 8391, data type: tqa, seed: 7
843
+ 01/16/2025 22:25:17 - INFO - llava.train.train - Add dataset: Infovqa_single_gpt with length: 23068, data type: info, seed: 8
844
+ [INFO|trainer.py:571] 2025-01-16 22:25:17,060 >> Using auto half precision backend
845
+ [INFO|trainer.py:1721] 2025-01-16 22:26:02,469 >> ***** Running training *****
846
+ [INFO|trainer.py:1722] 2025-01-16 22:26:02,469 >> Num examples = 944,379
847
+ [INFO|trainer.py:1723] 2025-01-16 22:26:02,469 >> Num Epochs = 1
848
+ [INFO|trainer.py:1724] 2025-01-16 22:26:02,469 >> Instantaneous batch size per device = 4
849
+ [INFO|trainer.py:1727] 2025-01-16 22:26:02,469 >> Total train batch size (w. parallel, distributed & accumulation) = 128
850
+ [INFO|trainer.py:1728] 2025-01-16 22:26:02,469 >> Gradient Accumulation steps = 2
851
+ [INFO|trainer.py:1729] 2025-01-16 22:26:02,469 >> Total optimization steps = 7,378
852
+ [INFO|trainer.py:1730] 2025-01-16 22:26:02,471 >> Number of trainable parameters = 8,441,260,032
training_log_20250116_223308.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_log_20250116_223318.txt ADDED
The diff for this file is too large to render. See raw diff