Commit c906038 by Zhiding (parent: 442fb1f)

clean codes
README.md CHANGED

@@ -75,7 +75,7 @@ We provide a [demo inference script](./demo.py) to help you quickly start using
 ### 0. Install the dependencies
 
 ```bash
-pip install transformers==4.37.2
+pip install transformers
 pip install flash-attn
 ```
 **Note**: Latest version of transformers is not compatible with the model.
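
Because the pin on `transformers==4.37.2` is dropped while the compatibility note stays in place, a small runtime check can flag a mismatched install. The sketch below is illustrative only; it assumes `packaging` is available (it ships as a transformers dependency) and uses 4.37.2 as the reference because that is the `transformers_version` recorded in this repo's config.json.

```python
# Minimal sketch: warn if the installed transformers build is newer than the one
# this checkpoint was exported with (4.37.2 per config.json).
from packaging import version
import transformers

EXPORTED_WITH = "4.37.2"  # "transformers_version" from config.json
if version.parse(transformers.__version__) > version.parse(EXPORTED_WITH):
    print(
        f"Warning: transformers {transformers.__version__} is newer than {EXPORTED_WITH}; "
        "the README notes the latest release may be incompatible with this model."
    )
```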
config.json CHANGED

@@ -200,6 +200,7 @@
   "transformers_version": "4.37.2",
   "typical_p": 1.0,
   "use_bfloat16": false,
-  "vision_use_head": false
+  "vision_use_head": false,
+  "_attn_implementation": "flash_attention_2"
  }
 }
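
The new `_attn_implementation` entry pins FlashAttention-2 in the saved config. As a hedged illustration (the repo id below is a hypothetical placeholder, not taken from this commit), the same backend can be requested or overridden at load time:

```python
# Sketch: override the attention backend at load time if flash-attn is unavailable.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "nvidia/Eagle2-placeholder",              # hypothetical repo id
    trust_remote_code=True,                   # the Eagle2 chat classes live in the repo
    attn_implementation="flash_attention_2",  # or "sdpa" / "eager" without flash-attn
)
```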
configuration_eagle_chat.py CHANGED

@@ -9,12 +9,10 @@ import copy
 from transformers import AutoConfig, LlamaConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
-from .configuration_siglip import SiglipVisionConfig
-from .configuration_qwen2 import Qwen2Config
-from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
 logger = logging.get_logger(__name__)
 
-
 class Eagle2ChatConfig(PretrainedConfig):
     model_type = 'eagle_chat'
     is_composition = True
@@ -36,6 +34,7 @@ class Eagle2ChatConfig(PretrainedConfig):
             mlp_checkpoint=True,
             pre_feature_reduction=False,
             keep_aspect_ratio=False,
+            vocab_size=-1,
             **kwargs):
         super().__init__(**kwargs)
 
@@ -49,8 +48,6 @@ class Eagle2ChatConfig(PretrainedConfig):
 
         if vision_config['model_type'] == 'siglip_vision_model':
             self.vision_config = SiglipVisionConfig(**vision_config)
-        elif vision_config['model_type'].startswith("MOB"):
-            self.vision_config = MultiBackboneChannelConcatenationVisionModelConfig(**vision_config)
         else:
             raise ValueError('Unsupported model_type: {}'.format(vision_config['model_type']))
 
@@ -73,6 +70,7 @@ class Eagle2ChatConfig(PretrainedConfig):
         self.mlp_checkpoint = mlp_checkpoint
         self.pre_feature_reduction = pre_feature_reduction
         self.keep_aspect_ratio = keep_aspect_ratio
+        self.vocab_size = self.llm_config.vocab_size
         logger.info(f'keep_aspect_ratio: {self.keep_aspect_ratio}')
         logger.info(f'vision_select_layer: {self.select_layer}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
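
This change swaps the vendored SigLIP/Qwen2 config modules for the copies that ship with transformers (available since 4.37). As a sketch of a quick sanity check, the new import paths should resolve and expose the `vocab_size` value that `Eagle2ChatConfig` now mirrors:

```python
# Sketch: verify the upstream config classes the cleaned-up code now depends on.
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig

vision_cfg = SiglipVisionConfig()  # default SigLIP vision-tower settings
llm_cfg = Qwen2Config()            # default Qwen2 settings
print(llm_cfg.vocab_size)          # the value exposed as Eagle2ChatConfig.vocab_size
```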
configuration_multi_backbone_channel_concatentation_model.py DELETED

@@ -1,86 +0,0 @@
-# --------------------------------------------------------
-# Eagle2
-# Copyright (c) 2025 NVIDIA
-# Licensed under The Apache License [see LICENSE for details]
-# --------------------------------------------------------
-
-import os
-from typing import Union
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-from .configuration_siglip import SiglipVisionConfig
-logger = logging.get_logger(__name__)
-
-
-class MultiBackboneChannelConcatenationVisionModelConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`MultiBackboneChannelConcatenationVisionModelConfig`]. It is used to
-    instantiate a vision encoder according to the specified arguments, defining the model architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        vision_path (str): Path to the vision model or its configuration.
-        mm_vision_select_layer (int, optional): The layer to select from the vision model
-            for multi-modal processing. Defaults to -2.
-        grid_size (int, optional): The size of the grid for vision processing. Defaults to 32.
-        **kwargs: Additional keyword arguments to be passed to the parent PretrainedConfig.
-
-    """
-
-    model_type = 'MOB'
-
-    def __init__(
-        self,
-        vision_path,
-        mm_vision_select_layer=-2,
-        grid_size=32,
-        input_image_size=1024,
-        hidden_size='lazy_calculation',
-        image_size=1024,
-        freeze_backbones=None,
-        moe_version_type=None,
-        delay_load=False,
-        convnext_img_size=1024,
-        vision_tower_siglip_path=None,
-        vision_tower_convnext_path='convnext_xxlarge.clip_laion2b_soup',
-        normalize_type='siglip',
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.normalize_type = normalize_type
-        self.vision_path = vision_path
-        self.mm_vision_select_layer = mm_vision_select_layer
-        self.grid_size = grid_size
-        self.input_image_size = input_image_size
-        self.image_size = image_size
-        self.hidden_size = hidden_size
-        self.freeze_backbones = freeze_backbones
-        self.moe_version_type = moe_version_type
-        self.delay_load = delay_load
-        self.convnext_img_size = convnext_img_size
-        # other args. to make it compatable with eagle-next
-        self.vision_tower_siglip_path = vision_tower_siglip_path
-        self.vision_tower_convnext_path = vision_tower_convnext_path
-        self.vision_tower = self.vision_path[4:]  # remove `MOB:` prefix
-
-        # asserts
-        assert image_size == input_image_size, f"input_image_size ({input_image_size}) != image_size ({image_size})"
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
-        if 'vision_config' in config_dict:
-            config_dict = config_dict['vision_config']
-
-        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
 
configuration_qwen2.py DELETED
@@ -1,149 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ Qwen2 model configuration"""
16
-
17
- from transformers.configuration_utils import PretrainedConfig
18
- from transformers.utils import logging
19
-
20
-
21
- logger = logging.get_logger(__name__)
22
-
23
- QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
- "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
25
- }
26
-
27
-
28
- class Qwen2Config(PretrainedConfig):
29
- r"""
30
- This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
31
- Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
32
- with the defaults will yield a similar configuration to that of
33
- Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
34
-
35
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
- documentation from [`PretrainedConfig`] for more information.
37
-
38
-
39
- Args:
40
- vocab_size (`int`, *optional*, defaults to 151936):
41
- Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
42
- `inputs_ids` passed when calling [`Qwen2Model`]
43
- hidden_size (`int`, *optional*, defaults to 4096):
44
- Dimension of the hidden representations.
45
- intermediate_size (`int`, *optional*, defaults to 22016):
46
- Dimension of the MLP representations.
47
- num_hidden_layers (`int`, *optional*, defaults to 32):
48
- Number of hidden layers in the Transformer encoder.
49
- num_attention_heads (`int`, *optional*, defaults to 32):
50
- Number of attention heads for each attention layer in the Transformer encoder.
51
- num_key_value_heads (`int`, *optional*, defaults to 32):
52
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
- `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
55
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
- by meanpooling all the original heads within that group. For more details checkout [this
57
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
58
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
- The non-linear activation function (function or string) in the decoder.
60
- max_position_embeddings (`int`, *optional*, defaults to 32768):
61
- The maximum sequence length that this model might ever be used with.
62
- initializer_range (`float`, *optional*, defaults to 0.02):
63
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
- The epsilon used by the rms normalization layers.
66
- use_cache (`bool`, *optional*, defaults to `True`):
67
- Whether or not the model should return the last key/values attentions (not used by all models). Only
68
- relevant if `config.is_decoder=True`.
69
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
70
- Whether the model's input and output word embeddings should be tied.
71
- rope_theta (`float`, *optional*, defaults to 10000.0):
72
- The base period of the RoPE embeddings.
73
- use_sliding_window (`bool`, *optional*, defaults to `False`):
74
- Whether to use sliding window attention.
75
- sliding_window (`int`, *optional*, defaults to 4096):
76
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
77
- max_window_layers (`int`, *optional*, defaults to 28):
78
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
79
- attention_dropout (`float`, *optional*, defaults to 0.0):
80
- The dropout ratio for the attention probabilities.
81
-
82
- ```python
83
- >>> from transformers import Qwen2Model, Qwen2Config
84
-
85
- >>> # Initializing a Qwen2 style configuration
86
- >>> configuration = Qwen2Config()
87
-
88
- >>> # Initializing a model from the Qwen2-7B style configuration
89
- >>> model = Qwen2Model(configuration)
90
-
91
- >>> # Accessing the model configuration
92
- >>> configuration = model.config
93
- ```"""
94
-
95
- model_type = "qwen2"
96
- keys_to_ignore_at_inference = ["past_key_values"]
97
-
98
- def __init__(
99
- self,
100
- vocab_size=151936,
101
- hidden_size=4096,
102
- intermediate_size=22016,
103
- num_hidden_layers=32,
104
- num_attention_heads=32,
105
- num_key_value_heads=32,
106
- hidden_act="silu",
107
- max_position_embeddings=32768,
108
- initializer_range=0.02,
109
- rms_norm_eps=1e-6,
110
- use_cache=True,
111
- tie_word_embeddings=False,
112
- rope_theta=10000.0,
113
- use_sliding_window=False,
114
- sliding_window=4096,
115
- max_window_layers=28,
116
- attention_dropout=0.0,
117
- attn_implementation='flash_attention_2',
118
- **kwargs,
119
- ):
120
- self.vocab_size = vocab_size
121
- self.max_position_embeddings = max_position_embeddings
122
- self.hidden_size = hidden_size
123
- self.intermediate_size = intermediate_size
124
- self.num_hidden_layers = num_hidden_layers
125
- self.num_attention_heads = num_attention_heads
126
- self.use_sliding_window = use_sliding_window
127
- self.sliding_window = sliding_window
128
- self.max_window_layers = max_window_layers
129
-
130
- self.attn_implementation = attn_implementation
131
- if self.attn_implementation is None:
132
- self.attn_implementation = "flash_attention_2"
133
-
134
- # for backward compatibility
135
- if num_key_value_heads is None:
136
- num_key_value_heads = num_attention_heads
137
-
138
- self.num_key_value_heads = num_key_value_heads
139
- self.hidden_act = hidden_act
140
- self.initializer_range = initializer_range
141
- self.rms_norm_eps = rms_norm_eps
142
- self.use_cache = use_cache
143
- self.rope_theta = rope_theta
144
- self.attention_dropout = attention_dropout
145
-
146
- super().__init__(
147
- tie_word_embeddings=tie_word_embeddings,
148
- **kwargs,
149
- )
 
configuration_siglip.py DELETED
@@ -1,302 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ Siglip model configuration"""
16
-
17
- import os
18
- from typing import Union
19
-
20
- from transformers.configuration_utils import PretrainedConfig
21
- from transformers.utils import logging
22
-
23
-
24
- logger = logging.get_logger(__name__)
25
-
26
- SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27
- "google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
28
- }
29
-
30
-
31
- class SiglipTextConfig(PretrainedConfig):
32
- r"""
33
- This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
34
- Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
35
- configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
36
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
37
-
38
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
- documentation from [`PretrainedConfig`] for more information.
40
-
41
- Args:
42
- vocab_size (`int`, *optional*, defaults to 32000):
43
- Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
44
- the `inputs_ids` passed when calling [`SiglipModel`].
45
- hidden_size (`int`, *optional*, defaults to 768):
46
- Dimensionality of the encoder layers and the pooler layer.
47
- intermediate_size (`int`, *optional*, defaults to 3072):
48
- Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
49
- num_hidden_layers (`int`, *optional*, defaults to 12):
50
- Number of hidden layers in the Transformer encoder.
51
- num_attention_heads (`int`, *optional*, defaults to 12):
52
- Number of attention heads for each attention layer in the Transformer encoder.
53
- max_position_embeddings (`int`, *optional*, defaults to 64):
54
- The maximum sequence length that this model might ever be used with. Typically set this to something large
55
- just in case (e.g., 512 or 1024 or 2048).
56
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
57
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
58
- `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
59
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
60
- The epsilon used by the layer normalization layers.
61
- attention_dropout (`float`, *optional*, defaults to 0.0):
62
- The dropout ratio for the attention probabilities.
63
- pad_token_id (`int`, *optional*, defaults to 1):
64
- The id of the padding token in the vocabulary.
65
- bos_token_id (`int`, *optional*, defaults to 49406):
66
- The id of the beginning-of-sequence token in the vocabulary.
67
- eos_token_id (`int`, *optional*, defaults to 49407):
68
- The id of the end-of-sequence token in the vocabulary.
69
-
70
- Example:
71
-
72
- ```python
73
- >>> from transformers import SiglipTextConfig, SiglipTextModel
74
-
75
- >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
76
- >>> configuration = SiglipTextConfig()
77
-
78
- >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
79
- >>> model = SiglipTextModel(configuration)
80
-
81
- >>> # Accessing the model configuration
82
- >>> configuration = model.config
83
- ```"""
84
-
85
- model_type = "siglip_text_model"
86
-
87
- def __init__(
88
- self,
89
- vocab_size=32000,
90
- hidden_size=768,
91
- intermediate_size=3072,
92
- num_hidden_layers=12,
93
- num_attention_heads=12,
94
- max_position_embeddings=64,
95
- hidden_act="gelu_pytorch_tanh",
96
- layer_norm_eps=1e-6,
97
- attention_dropout=0.0,
98
- # This differs from `CLIPTokenizer`'s default and from openai/siglip
99
- # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
100
- pad_token_id=1,
101
- bos_token_id=49406,
102
- eos_token_id=49407,
103
- **kwargs,
104
- ):
105
- super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
106
-
107
- self.vocab_size = vocab_size
108
- self.hidden_size = hidden_size
109
- self.intermediate_size = intermediate_size
110
- self.num_hidden_layers = num_hidden_layers
111
- self.num_attention_heads = num_attention_heads
112
- self.max_position_embeddings = max_position_embeddings
113
- self.layer_norm_eps = layer_norm_eps
114
- self.hidden_act = hidden_act
115
- self.attention_dropout = attention_dropout
116
-
117
- @classmethod
118
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
119
- cls._set_token_in_kwargs(kwargs)
120
-
121
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
122
-
123
- # get the text config dict if we are loading from SiglipConfig
124
- if config_dict.get("model_type") == "siglip":
125
- config_dict = config_dict["text_config"]
126
-
127
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
128
- logger.warning(
129
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
130
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
131
- )
132
-
133
- return cls.from_dict(config_dict, **kwargs)
134
-
135
-
136
- class SiglipVisionConfig(PretrainedConfig):
137
- r"""
138
- This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
139
- Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
140
- configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
141
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
142
-
143
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
144
- documentation from [`PretrainedConfig`] for more information.
145
-
146
- Args:
147
- hidden_size (`int`, *optional*, defaults to 768):
148
- Dimensionality of the encoder layers and the pooler layer.
149
- intermediate_size (`int`, *optional*, defaults to 3072):
150
- Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
151
- num_hidden_layers (`int`, *optional*, defaults to 12):
152
- Number of hidden layers in the Transformer encoder.
153
- num_attention_heads (`int`, *optional*, defaults to 12):
154
- Number of attention heads for each attention layer in the Transformer encoder.
155
- num_channels (`int`, *optional*, defaults to 3):
156
- Number of channels in the input images.
157
- image_size (`int`, *optional*, defaults to 224):
158
- The size (resolution) of each image.
159
- patch_size (`int`, *optional*, defaults to 16):
160
- The size (resolution) of each patch.
161
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
162
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
163
- `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
164
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
165
- The epsilon used by the layer normalization layers.
166
- attention_dropout (`float`, *optional*, defaults to 0.0):
167
- The dropout ratio for the attention probabilities.
168
-
169
- Example:
170
-
171
- ```python
172
- >>> from transformers import SiglipVisionConfig, SiglipVisionModel
173
-
174
- >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
175
- >>> configuration = SiglipVisionConfig()
176
-
177
- >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
178
- >>> model = SiglipVisionModel(configuration)
179
-
180
- >>> # Accessing the model configuration
181
- >>> configuration = model.config
182
- ```"""
183
-
184
- model_type = "siglip_vision_model"
185
-
186
- def __init__(
187
- self,
188
- hidden_size=768,
189
- intermediate_size=3072,
190
- num_hidden_layers=12,
191
- num_attention_heads=12,
192
- num_channels=3,
193
- image_size=224,
194
- patch_size=16,
195
- hidden_act="gelu_pytorch_tanh",
196
- layer_norm_eps=1e-6,
197
- attention_dropout=0.0,
198
- **kwargs,
199
- ):
200
- super().__init__(**kwargs)
201
-
202
- self.hidden_size = hidden_size
203
- self.intermediate_size = intermediate_size
204
- self.num_hidden_layers = num_hidden_layers
205
- self.num_attention_heads = num_attention_heads
206
- self.num_channels = num_channels
207
- self.patch_size = patch_size
208
- self.image_size = image_size
209
- self.attention_dropout = attention_dropout
210
- self.layer_norm_eps = layer_norm_eps
211
- self.hidden_act = hidden_act
212
-
213
- @classmethod
214
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
215
- cls._set_token_in_kwargs(kwargs)
216
-
217
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
218
-
219
- # get the vision config dict if we are loading from SiglipConfig
220
- if config_dict.get("model_type") == "siglip":
221
- config_dict = config_dict["vision_config"]
222
-
223
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
224
- logger.warning(
225
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
226
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
227
- )
228
-
229
- return cls.from_dict(config_dict, **kwargs)
230
-
231
-
232
- class SiglipConfig(PretrainedConfig):
233
- r"""
234
- [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
235
- instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
236
- Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
237
- [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
238
-
239
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
240
- documentation from [`PretrainedConfig`] for more information.
241
-
242
- Args:
243
- text_config (`dict`, *optional*):
244
- Dictionary of configuration options used to initialize [`SiglipTextConfig`].
245
- vision_config (`dict`, *optional*):
246
- Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
247
- kwargs (*optional*):
248
- Dictionary of keyword arguments.
249
-
250
- Example:
251
-
252
- ```python
253
- >>> from transformers import SiglipConfig, SiglipModel
254
-
255
- >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
256
- >>> configuration = SiglipConfig()
257
-
258
- >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
259
- >>> model = SiglipModel(configuration)
260
-
261
- >>> # Accessing the model configuration
262
- >>> configuration = model.config
263
-
264
- >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
265
- >>> from transformers import SiglipTextConfig, SiglipVisionConfig
266
-
267
- >>> # Initializing a SiglipText and SiglipVision configuration
268
- >>> config_text = SiglipTextConfig()
269
- >>> config_vision = SiglipVisionConfig()
270
-
271
- >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
272
- ```"""
273
-
274
- model_type = "siglip"
275
-
276
- def __init__(self, text_config=None, vision_config=None, **kwargs):
277
- super().__init__(**kwargs)
278
-
279
- if text_config is None:
280
- text_config = {}
281
- logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
282
-
283
- if vision_config is None:
284
- vision_config = {}
285
- logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.")
286
-
287
- self.text_config = SiglipTextConfig(**text_config)
288
- self.vision_config = SiglipVisionConfig(**vision_config)
289
-
290
- self.initializer_factor = 1.0
291
-
292
- @classmethod
293
- def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
294
- r"""
295
- Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
296
- model configuration.
297
-
298
- Returns:
299
- [`SiglipConfig`]: An instance of a configuration object
300
- """
301
-
302
- return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
 
conversation.py DELETED
@@ -1,434 +0,0 @@
1
- """
2
- Conversation prompt templates.
3
-
4
- We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
- If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
- """
7
-
8
- import dataclasses
9
- from enum import IntEnum, auto
10
- from typing import Any, Dict, List, Tuple, Union
11
-
12
-
13
- class SeparatorStyle(IntEnum):
14
- """Separator styles."""
15
-
16
- ADD_COLON_SINGLE = auto()
17
- ADD_COLON_TWO = auto()
18
- ADD_COLON_SPACE_SINGLE = auto()
19
- NO_COLON_SINGLE = auto()
20
- NO_COLON_TWO = auto()
21
- ADD_NEW_LINE_SINGLE = auto()
22
- LLAMA2 = auto()
23
- CHATGLM = auto()
24
- CHATML = auto()
25
- CHATINTERN = auto()
26
- DOLLY = auto()
27
- RWKV = auto()
28
- PHOENIX = auto()
29
- ROBIN = auto()
30
- FALCON_CHAT = auto()
31
- CHATGLM3 = auto()
32
- INTERNVL_ZH = auto()
33
- MPT = auto()
34
- LLAMA3 = auto()
35
-
36
-
37
- @dataclasses.dataclass
38
- class Conversation:
39
- """A class that manages prompt templates and keeps all conversation history."""
40
-
41
- # The name of this template
42
- name: str
43
- # The template of the system prompt
44
- system_template: str = '{system_message}'
45
- # The system message
46
- system_message: str = ''
47
- # The names of two roles
48
- roles: Tuple[str] = ('USER', 'ASSISTANT')
49
- # All messages. Each item is (role, message).
50
- messages: List[List[str]] = ()
51
- # The number of few shot examples
52
- offset: int = 0
53
- # The separator style and configurations
54
- sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
55
- sep: str = '\n'
56
- sep2: str = None
57
- # Stop criteria (the default one is EOS token)
58
- stop_str: Union[str, List[str]] = None
59
- # Stops generation if meeting any token in this list
60
- stop_token_ids: List[int] = None
61
-
62
- def get_prompt(self) -> str:
63
- """Get the prompt for generation."""
64
- system_prompt = self.system_template.format(system_message=self.system_message)
65
- if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
66
- ret = system_prompt + self.sep
67
- for role, message in self.messages:
68
- if message:
69
- ret += role + ': ' + message + self.sep
70
- else:
71
- ret += role + ':'
72
- return ret
73
- elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
74
- seps = [self.sep, self.sep2]
75
- ret = system_prompt + seps[0]
76
- for i, (role, message) in enumerate(self.messages):
77
- if message:
78
- ret += role + ': ' + message + seps[i % 2]
79
- else:
80
- ret += role + ':'
81
- return ret
82
- elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
83
- ret = system_prompt + self.sep
84
- for role, message in self.messages:
85
- if message:
86
- ret += role + ': ' + message + self.sep
87
- else:
88
- ret += role + ': ' # must be end with a space
89
- return ret
90
- elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
91
- ret = '' if system_prompt == '' else system_prompt + self.sep
92
- for role, message in self.messages:
93
- if message:
94
- ret += role + '\n' + message + self.sep
95
- else:
96
- ret += role + '\n'
97
- return ret
98
- elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
99
- ret = system_prompt
100
- for role, message in self.messages:
101
- if message:
102
- ret += role + message + self.sep
103
- else:
104
- ret += role
105
- return ret
106
- elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
107
- seps = [self.sep, self.sep2]
108
- ret = system_prompt
109
- for i, (role, message) in enumerate(self.messages):
110
- if message:
111
- ret += role + message + seps[i % 2]
112
- else:
113
- ret += role
114
- return ret
115
- elif self.sep_style == SeparatorStyle.RWKV:
116
- ret = system_prompt
117
- for i, (role, message) in enumerate(self.messages):
118
- if message:
119
- ret += (
120
- role
121
- + ': '
122
- + message.replace('\r\n', '\n').replace('\n\n', '\n')
123
- )
124
- ret += '\n\n'
125
- else:
126
- ret += role + ':'
127
- return ret
128
- elif self.sep_style == SeparatorStyle.LLAMA2:
129
- seps = [self.sep, self.sep2]
130
- if self.system_message:
131
- ret = system_prompt
132
- else:
133
- ret = '[INST] '
134
- for i, (role, message) in enumerate(self.messages):
135
- tag = self.roles[i % 2]
136
- if message:
137
- if i == 0:
138
- ret += message + ' '
139
- else:
140
- ret += tag + ' ' + message + seps[i % 2]
141
- else:
142
- ret += tag
143
- return ret
144
- elif self.sep_style == SeparatorStyle.CHATGLM:
145
- # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
146
- # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
147
- round_add_n = 1 if self.name == 'chatglm2' else 0
148
- if system_prompt:
149
- ret = system_prompt + self.sep
150
- else:
151
- ret = ''
152
-
153
- for i, (role, message) in enumerate(self.messages):
154
- if i % 2 == 0:
155
- ret += f'[Round {i//2 + round_add_n}]{self.sep}'
156
-
157
- if message:
158
- ret += f'{role}:{message}{self.sep}'
159
- else:
160
- ret += f'{role}:'
161
- return ret
162
- elif self.sep_style == SeparatorStyle.CHATML:
163
- ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
164
- for role, message in self.messages:
165
- if message:
166
- ret += role + '\n' + message + self.sep + '\n'
167
- else:
168
- ret += role + '\n'
169
- return ret
170
- elif self.sep_style == SeparatorStyle.CHATGLM3:
171
- ret = ''
172
- if self.system_message:
173
- ret += system_prompt
174
- for role, message in self.messages:
175
- if message:
176
- ret += role + '\n' + ' ' + message
177
- else:
178
- ret += role
179
- return ret
180
- elif self.sep_style == SeparatorStyle.CHATINTERN:
181
- # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
182
- seps = [self.sep, self.sep2]
183
- ret = system_prompt
184
- for i, (role, message) in enumerate(self.messages):
185
- # if i % 2 == 0:
186
- # ret += "<s>"
187
- if message:
188
- ret += role + ':' + message + seps[i % 2] + '\n'
189
- else:
190
- ret += role + ':'
191
- return ret
192
- elif self.sep_style == SeparatorStyle.DOLLY:
193
- seps = [self.sep, self.sep2]
194
- ret = system_prompt
195
- for i, (role, message) in enumerate(self.messages):
196
- if message:
197
- ret += role + ':\n' + message + seps[i % 2]
198
- if i % 2 == 1:
199
- ret += '\n\n'
200
- else:
201
- ret += role + ':\n'
202
- return ret
203
- elif self.sep_style == SeparatorStyle.PHOENIX:
204
- ret = system_prompt
205
- for role, message in self.messages:
206
- if message:
207
- ret += role + ': ' + '<s>' + message + '</s>'
208
- else:
209
- ret += role + ': ' + '<s>'
210
- return ret
211
- elif self.sep_style == SeparatorStyle.ROBIN:
212
- ret = system_prompt + self.sep
213
- for role, message in self.messages:
214
- if message:
215
- ret += role + ':\n' + message + self.sep
216
- else:
217
- ret += role + ':\n'
218
- return ret
219
- elif self.sep_style == SeparatorStyle.FALCON_CHAT:
220
- ret = ''
221
- if self.system_message:
222
- ret += system_prompt + self.sep
223
- for role, message in self.messages:
224
- if message:
225
- ret += role + ': ' + message + self.sep
226
- else:
227
- ret += role + ':'
228
-
229
- return ret
230
- elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
231
- seps = [self.sep, self.sep2]
232
- ret = self.system_message + seps[0]
233
- for i, (role, message) in enumerate(self.messages):
234
- if message:
235
- ret += role + ': ' + message + seps[i % 2]
236
- else:
237
- ret += role + ':'
238
- return ret
239
- elif self.sep_style == SeparatorStyle.MPT:
240
- ret = system_prompt + self.sep
241
- for role, message in self.messages:
242
- if message:
243
- if type(message) is tuple:
244
- message, _, _ = message
245
- ret += role + message + self.sep
246
- else:
247
- ret += role
248
- return ret
249
- elif self.sep_style == SeparatorStyle.LLAMA3:
250
- ret = system_prompt + self.sep
251
- for role, message in self.messages:
252
- if message:
253
- if type(message) is tuple:
254
- message, _, _ = message
255
- ret += role + message + self.sep
256
- else:
257
- ret += role
258
- return ret
259
- else:
260
- raise ValueError(f'Invalid style: {self.sep_style}')
261
-
262
- def set_system_message(self, system_message: str):
263
- """Set the system message."""
264
- self.system_message = system_message
265
-
266
- def append_message(self, role: str, message: str):
267
- """Append a new message."""
268
- self.messages.append([role, message])
269
-
270
- def update_last_message(self, message: str):
271
- """Update the last output.
272
-
273
- The last message is typically set to be None when constructing the prompt,
274
- so we need to update it in-place after getting the response from a model.
275
- """
276
- self.messages[-1][1] = message
277
-
278
- def to_gradio_chatbot(self):
279
- """Convert the conversation to gradio chatbot format."""
280
- ret = []
281
- for i, (role, msg) in enumerate(self.messages[self.offset :]):
282
- if i % 2 == 0:
283
- ret.append([msg, None])
284
- else:
285
- ret[-1][-1] = msg
286
- return ret
287
-
288
- def to_openai_api_messages(self):
289
- """Convert the conversation to OpenAI chat completion format."""
290
- ret = [{'role': 'system', 'content': self.system_message}]
291
-
292
- for i, (_, msg) in enumerate(self.messages[self.offset :]):
293
- if i % 2 == 0:
294
- ret.append({'role': 'user', 'content': msg})
295
- else:
296
- if msg is not None:
297
- ret.append({'role': 'assistant', 'content': msg})
298
- return ret
299
-
300
- def copy(self):
301
- return Conversation(
302
- name=self.name,
303
- system_template=self.system_template,
304
- system_message=self.system_message,
305
- roles=self.roles,
306
- messages=[[x, y] for x, y in self.messages],
307
- offset=self.offset,
308
- sep_style=self.sep_style,
309
- sep=self.sep,
310
- sep2=self.sep2,
311
- stop_str=self.stop_str,
312
- stop_token_ids=self.stop_token_ids,
313
- )
314
-
315
- def dict(self):
316
- return {
317
- 'template_name': self.name,
318
- 'system_message': self.system_message,
319
- 'roles': self.roles,
320
- 'messages': self.messages,
321
- 'offset': self.offset,
322
- }
323
-
324
-
325
- # A global registry for all conversation templates
326
- conv_templates: Dict[str, Conversation] = {}
327
-
328
-
329
- def register_conv_template(template: Conversation, override: bool = False):
330
- """Register a new conversation template."""
331
- if not override:
332
- assert (
333
- template.name not in conv_templates
334
- ), f'{template.name} has been registered.'
335
-
336
- conv_templates[template.name] = template
337
-
338
-
339
- def get_conv_template(name: str) -> Conversation:
340
- """Get a conversation template."""
341
- return conv_templates[name].copy()
342
-
343
-
344
- # Note that for inference, using the Hermes-2 and internlm2-chat templates is equivalent.
345
- register_conv_template(
346
- Conversation(
347
- name='Hermes-2',
348
- system_template='<|im_start|>system\n{system_message}',
349
- # note: The new system prompt was not used here to avoid changes in benchmark performance.
350
- # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
351
- system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
352
- roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
353
- sep_style=SeparatorStyle.MPT,
354
- sep='<|im_end|>',
355
- stop_token_ids=[
356
- 2,
357
- 6,
358
- 7,
359
- 8,
360
- ],
361
- stop_str='<|endoftext|>',
362
- )
363
- )
364
-
365
-
366
- register_conv_template(
367
- Conversation(
368
- name='internlm2-chat',
369
- system_template='<|im_start|>system\n{system_message}',
370
- # note: The new system prompt was not used here to avoid changes in benchmark performance.
371
- # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
372
- system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
373
- roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
374
- sep_style=SeparatorStyle.MPT,
375
- sep='<|im_end|>',
376
- stop_token_ids=[
377
- 2,
378
- 92543,
379
- 92542
380
- ]
381
- )
382
- )
383
-
384
-
385
- register_conv_template(
386
- Conversation(
387
- name='phi3-chat',
388
- system_template='<|system|>\n{system_message}',
389
- # note: The new system prompt was not used here to avoid changes in benchmark performance.
390
- # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
391
- system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
392
- roles=('<|user|>\n', '<|assistant|>\n'),
393
- sep_style=SeparatorStyle.MPT,
394
- sep='<|end|>',
395
- stop_token_ids=[
396
- 2,
397
- 32000,
398
- 32007
399
- ]
400
- )
401
- )
402
- register_conv_template(
403
- Conversation(
404
- name='llama3-chat',
405
- system_template='<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}',
406
- system_message='You are an AI assistant whose name is Eagle-Next.',
407
- roles=('<|start_header_id|>user<|end_header_id|>\n\n', '<|start_header_id|>assistant<|end_header_id|>\n\n'),
408
- sep_style=SeparatorStyle.LLAMA3,
409
- sep='<|eot_id|>',
410
- stop_token_ids=[
411
- 128259,
412
- 128001
413
- ]
414
- )
415
- )
416
-
417
- # Qwen-chat default template
418
- # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
419
- register_conv_template(
420
- Conversation(
421
- name='qwen2-chat',
422
- system_template='<|im_start|>system\n{system_message}',
423
- system_message='You are a helpful assistant.',
424
- roles=('<|im_start|>user', '<|im_start|>assistant'),
425
- sep_style=SeparatorStyle.CHATML,
426
- sep='<|im_end|>',
427
- stop_token_ids=[
428
- 151643,
429
- 151644,
430
- 151645,
431
- ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
432
- stop_str='<|endoftext|>',
433
- )
434
- )
 
convnext.py DELETED
@@ -1,572 +0,0 @@
1
- """ ConvNeXt
2
-
3
- Papers:
4
- * `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
5
- @Article{liu2022convnet,
6
- author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
7
- title = {A ConvNet for the 2020s},
8
- journal = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
9
- year = {2022},
10
- }
11
-
12
- * `ConvNeXt-V2 - Co-designing and Scaling ConvNets with Masked Autoencoders` - https://arxiv.org/abs/2301.00808
13
- @article{Woo2023ConvNeXtV2,
14
- title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders},
15
- author={Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon and Saining Xie},
16
- year={2023},
17
- journal={arXiv preprint arXiv:2301.00808},
18
- }
19
-
20
- Original code and weights from:
21
- * https://github.com/facebookresearch/ConvNeXt, original copyright below
22
- * https://github.com/facebookresearch/ConvNeXt-V2, original copyright below
23
-
24
- Model defs atto, femto, pico, nano and _ols / _hnf variants are timm originals.
25
-
26
- Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
27
- """
28
- # ConvNeXt
29
- # Copyright (c) Meta Platforms, Inc. and affiliates.
30
- # All rights reserved.
31
- # This source code is licensed under the MIT license
32
-
33
- # ConvNeXt-V2
34
- # Copyright (c) Meta Platforms, Inc. and affiliates.
35
- # All rights reserved.
36
- # This source code is licensed under the license found in the
37
- # LICENSE file in the root directory of this source tree (Attribution-NonCommercial 4.0 International (CC BY-NC 4.0))
38
- # No code was used directly from ConvNeXt-V2, however the weights are CC BY-NC 4.0 so beware if using commercially.
39
-
40
- from collections import OrderedDict
41
- from functools import partial
42
- from typing import Callable, Optional, Tuple, Union
43
-
44
- import torch
45
- import torch.nn as nn
46
-
47
- from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
48
- from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
49
- LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
50
- from timm.layers import NormMlpClassifierHead, ClassifierHead
51
- from timm.models._builder import build_model_with_cfg
52
- from timm.models._manipulate import named_apply, checkpoint_seq
53
- from timm.models._registry import generate_default_cfgs, register_model, register_model_deprecations
54
-
55
- __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
56
-
57
-
58
- class Downsample(nn.Module):
59
-
60
- def __init__(self, in_chs, out_chs, stride=1, dilation=1):
61
- super().__init__()
62
- avg_stride = stride if dilation == 1 else 1
63
- if stride > 1 or dilation > 1:
64
- avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
65
- self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
66
- else:
67
- self.pool = nn.Identity()
68
-
69
- if in_chs != out_chs:
70
- self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
71
- else:
72
- self.conv = nn.Identity()
73
-
74
- def forward(self, x):
75
- x = self.pool(x)
76
- x = self.conv(x)
77
- return x
78
-
79
-
80
- class ConvNeXtBlock(nn.Module):
81
- """ ConvNeXt Block
82
- There are two equivalent implementations:
83
- (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
84
- (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
85
-
86
- Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
87
- choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
88
- is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
89
- """
90
-
91
- def __init__(
92
- self,
93
- in_chs: int,
94
- out_chs: Optional[int] = None,
95
- kernel_size: int = 7,
96
- stride: int = 1,
97
- dilation: Union[int, Tuple[int, int]] = (1, 1),
98
- mlp_ratio: float = 4,
99
- conv_mlp: bool = False,
100
- conv_bias: bool = True,
101
- use_grn: bool = False,
102
- ls_init_value: Optional[float] = 1e-6,
103
- act_layer: Union[str, Callable] = 'gelu',
104
- norm_layer: Optional[Callable] = None,
105
- drop_path: float = 0.,
106
- ):
107
- """
108
-
109
- Args:
110
- in_chs: Block input channels.
111
- out_chs: Block output channels (same as in_chs if None).
112
- kernel_size: Depthwise convolution kernel size.
113
- stride: Stride of depthwise convolution.
114
- dilation: Tuple specifying input and output dilation of block.
115
- mlp_ratio: MLP expansion ratio.
116
- conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
117
- conv_bias: Apply bias for all convolution (linear) layers.
118
- use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
119
- ls_init_value: Layer-scale init values, layer-scale applied if not None.
120
- act_layer: Activation layer.
121
- norm_layer: Normalization layer (defaults to LN if not specified).
122
- drop_path: Stochastic depth probability.
123
- """
124
- super().__init__()
125
- out_chs = out_chs or in_chs
126
- dilation = to_ntuple(2)(dilation)
127
- act_layer = get_act_layer(act_layer)
128
- if not norm_layer:
129
- norm_layer = LayerNorm2d if conv_mlp else LayerNorm
130
- mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
131
- self.use_conv_mlp = conv_mlp
132
- self.conv_dw = create_conv2d(
133
- in_chs,
134
- out_chs,
135
- kernel_size=kernel_size,
136
- stride=stride,
137
- dilation=dilation[0],
138
- depthwise=True,
139
- bias=conv_bias,
140
- )
141
- self.norm = norm_layer(out_chs)
142
- self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
143
- self.weight = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
144
- if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
145
- self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
146
- else:
147
- self.shortcut = nn.Identity()
148
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
149
-
150
- def forward(self, x):
151
- shortcut = x
152
- x = self.conv_dw(x)
153
- if self.use_conv_mlp:
154
- x = self.norm(x)
155
- x = self.mlp(x)
156
- else:
157
- x = x.permute(0, 2, 3, 1)
158
- x = self.norm(x)
159
- x = self.mlp(x)
160
- x = x.permute(0, 3, 1, 2)
161
- if self.weight is not None:
162
- x = x.mul(self.weight.reshape(1, -1, 1, 1))
163
-
164
- x = self.drop_path(x) + self.shortcut(shortcut)
165
- return x
166
-
167
-
168
- class ConvNeXtStage(nn.Module):
169
-
170
- def __init__(
171
- self,
172
- in_chs,
173
- out_chs,
174
- kernel_size=7,
175
- stride=2,
176
- depth=2,
177
- dilation=(1, 1),
178
- drop_path_rates=None,
179
- ls_init_value=1.0,
180
- conv_mlp=False,
181
- conv_bias=True,
182
- use_grn=False,
183
- act_layer='gelu',
184
- norm_layer=None,
185
- norm_layer_cl=None
186
- ):
187
- super().__init__()
188
- self.grad_checkpointing = False
189
-
190
- if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
191
- ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
192
- pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used
193
- self.downsample = nn.Sequential(
194
- norm_layer(in_chs),
195
- create_conv2d(
196
- in_chs,
197
- out_chs,
198
- kernel_size=ds_ks,
199
- stride=stride,
200
- dilation=dilation[0],
201
- padding=pad,
202
- bias=conv_bias,
203
- ),
204
- )
205
- in_chs = out_chs
206
- else:
207
- self.downsample = nn.Identity()
208
-
209
- drop_path_rates = drop_path_rates or [0.] * depth
210
- stage_blocks = []
211
- for i in range(depth):
212
- stage_blocks.append(ConvNeXtBlock(
213
- in_chs=in_chs,
214
- out_chs=out_chs,
215
- kernel_size=kernel_size,
216
- dilation=dilation[1],
217
- drop_path=drop_path_rates[i],
218
- ls_init_value=ls_init_value,
219
- conv_mlp=conv_mlp,
220
- conv_bias=conv_bias,
221
- use_grn=use_grn,
222
- act_layer=act_layer,
223
- norm_layer=norm_layer if conv_mlp else norm_layer_cl,
224
- ))
225
- in_chs = out_chs
226
- self.blocks = nn.Sequential(*stage_blocks)
227
-
228
- def forward(self, x):
229
- x = self.downsample(x)
230
- if self.grad_checkpointing and not torch.jit.is_scripting():
231
- x = checkpoint_seq(self.blocks, x)
232
- else:
233
- x = self.blocks(x)
234
- return x
235
-
236
-
237
- class ConvNeXt(nn.Module):
238
- r""" ConvNeXt
239
- A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
240
- """
241
-
242
- def __init__(
243
- self,
244
- in_chans: int = 3,
245
- num_classes: int = 1000,
246
- global_pool: str = 'avg',
247
- output_stride: int = 32,
248
- depths: Tuple[int, ...] = (3, 3, 9, 3),
249
- dims: Tuple[int, ...] = (96, 192, 384, 768),
250
- kernel_sizes: Union[int, Tuple[int, ...]] = 7,
251
- ls_init_value: Optional[float] = 1e-6,
252
- stem_type: str = 'patch',
253
- patch_size: int = 4,
254
- head_init_scale: float = 1.,
255
- head_norm_first: bool = False,
256
- head_hidden_size: Optional[int] = None,
257
- conv_mlp: bool = False,
258
- conv_bias: bool = True,
259
- use_grn: bool = False,
260
- act_layer: Union[str, Callable] = 'gelu',
261
- norm_layer: Optional[Union[str, Callable]] = None,
262
- norm_eps: Optional[float] = None,
263
- drop_rate: float = 0.,
264
- drop_path_rate: float = 0.,
265
- ):
266
- """
267
- Args:
268
- in_chans: Number of input image channels.
269
- num_classes: Number of classes for classification head.
270
- global_pool: Global pooling type.
271
- output_stride: Output stride of network, one of (8, 16, 32).
272
- depths: Number of blocks at each stage.
273
- dims: Feature dimension at each stage.
274
- kernel_sizes: Depthwise convolution kernel-sizes for each stage.
275
- ls_init_value: Init value for Layer Scale, disabled if None.
276
- stem_type: Type of stem.
277
- patch_size: Stem patch size for patch stem.
278
- head_init_scale: Init scaling value for classifier weights and biases.
279
- head_norm_first: Apply normalization before global pool + head.
280
- head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
281
- conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
282
- conv_bias: Use bias layers w/ all convolutions.
283
- use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
284
- act_layer: Activation layer type.
285
- norm_layer: Normalization layer type.
286
- drop_rate: Head pre-classifier dropout rate.
287
- drop_path_rate: Stochastic depth drop rate.
288
- """
289
- super().__init__()
290
- assert output_stride in (8, 16, 32)
291
- kernel_sizes = to_ntuple(4)(kernel_sizes)
292
- if norm_layer is None:
293
- norm_layer = LayerNorm2d
294
- norm_layer_cl = norm_layer if conv_mlp else LayerNorm
295
- if norm_eps is not None:
296
- norm_layer = partial(norm_layer, eps=norm_eps)
297
- norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
298
- else:
299
- assert conv_mlp,\
300
- 'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
301
- norm_layer_cl = norm_layer
302
- if norm_eps is not None:
303
- norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
304
-
305
- self.num_classes = num_classes
306
- self.drop_rate = drop_rate
307
- self.feature_info = []
308
-
309
- assert stem_type in ('patch', 'overlap', 'overlap_tiered')
310
- if stem_type == 'patch':
311
- # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
312
- self.stem = nn.Sequential(
313
- nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
314
- norm_layer(dims[0]),
315
- )
316
- stem_stride = patch_size
317
- else:
318
- mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
319
- self.stem = nn.Sequential(
320
- nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
321
- nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
322
- norm_layer(dims[0]),
323
- )
324
- stem_stride = 4
325
-
326
- self.stages = nn.Sequential()
327
- dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
328
- stages = []
329
- prev_chs = dims[0]
330
- curr_stride = stem_stride
331
- dilation = 1
332
- # 4 feature resolution stages, each consisting of multiple residual blocks
333
- for i in range(4):
334
- stride = 2 if curr_stride == 2 or i > 0 else 1
335
- if curr_stride >= output_stride and stride > 1:
336
- dilation *= stride
337
- stride = 1
338
- curr_stride *= stride
339
- first_dilation = 1 if dilation in (1, 2) else 2
340
- out_chs = dims[i]
341
- stages.append(ConvNeXtStage(
342
- prev_chs,
343
- out_chs,
344
- kernel_size=kernel_sizes[i],
345
- stride=stride,
346
- dilation=(first_dilation, dilation),
347
- depth=depths[i],
348
- drop_path_rates=dp_rates[i],
349
- ls_init_value=ls_init_value,
350
- conv_mlp=conv_mlp,
351
- conv_bias=conv_bias,
352
- use_grn=use_grn,
353
- act_layer=act_layer,
354
- norm_layer=norm_layer,
355
- norm_layer_cl=norm_layer_cl,
356
- ))
357
- prev_chs = out_chs
358
- # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
359
- self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
360
- self.stages = nn.Sequential(*stages)
361
- self.num_features = prev_chs
362
-
363
- # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
364
- # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
365
- if head_norm_first:
366
- assert not head_hidden_size
367
- self.norm_pre = norm_layer(self.num_features)
368
- self.head = ClassifierHead(
369
- self.num_features,
370
- num_classes,
371
- pool_type=global_pool,
372
- drop_rate=self.drop_rate,
373
- )
374
- else:
375
- self.norm_pre = nn.Identity()
376
- self.head = NormMlpClassifierHead(
377
- self.num_features,
378
- num_classes,
379
- hidden_size=head_hidden_size,
380
- pool_type=global_pool,
381
- drop_rate=self.drop_rate,
382
- norm_layer=norm_layer,
383
- act_layer='gelu',
384
- )
385
- named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
386
-
387
- @torch.jit.ignore
388
- def group_matcher(self, coarse=False):
389
- return dict(
390
- stem=r'^stem',
391
- blocks=r'^stages\.(\d+)' if coarse else [
392
- (r'^stages\.(\d+)\.downsample', (0,)), # blocks
393
- (r'^stages\.(\d+)\.blocks\.(\d+)', None),
394
- (r'^norm_pre', (99999,))
395
- ]
396
- )
397
-
398
- @torch.jit.ignore
399
- def set_grad_checkpointing(self, enable=True):
400
- for s in self.stages:
401
- s.grad_checkpointing = enable
402
-
403
- @torch.jit.ignore
404
- def get_classifier(self):
405
- return self.head.fc
406
-
407
- def reset_classifier(self, num_classes=0, global_pool=None):
408
- self.head.reset(num_classes, global_pool)
409
-
410
- def forward_features(self, x):
411
- x = self.stem(x)
412
- x = self.stages(x)
413
- x = self.norm_pre(x)
414
- return x
415
-
416
- def forward_head(self, x, pre_logits: bool = False):
417
- return self.head(x, pre_logits=True) if pre_logits else self.head(x)
418
-
419
- def forward(self, x):
420
- x = self.forward_features(x)
421
- x = self.forward_head(x)
422
- return x
423
-
424
-
425
- def _init_weights(module, name=None, head_init_scale=1.0):
426
- if isinstance(module, nn.Conv2d):
427
- trunc_normal_(module.weight, std=.02)
428
- if module.bias is not None:
429
- nn.init.zeros_(module.bias)
430
- elif isinstance(module, nn.Linear):
431
- trunc_normal_(module.weight, std=.02)
432
- nn.init.zeros_(module.bias)
433
- if name and 'head.' in name:
434
- module.weight.data.mul_(head_init_scale)
435
- module.bias.data.mul_(head_init_scale)
436
-
437
-
438
- def checkpoint_filter_fn(state_dict, model):
439
- """ Remap FB checkpoints -> timm """
440
- if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
441
- out_dict={}
442
- out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
443
- return out_dict # non-FB checkpoint
444
- if 'model' in state_dict:
445
- state_dict = state_dict['model']
446
-
447
- out_dict = {}
448
- if 'visual.trunk.stem.0.weight' in state_dict:
449
- out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
450
- k.startswith('visual.trunk.')}
451
-
452
- if 'visual.head.proj.weight' in state_dict:
453
- out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
454
- out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
455
- elif 'visual.head.mlp.fc1.weight' in state_dict:
456
- out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
457
- out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
458
- out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
459
- out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
460
- return out_dict
461
-
462
- import re
463
- for k, v in state_dict.items():
464
- k = k.replace('downsample_layers.0.', 'stem.')
465
- k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
466
- k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
467
- k = k.replace('dwconv', 'conv_dw')
468
- k = k.replace('pwconv', 'mlp.fc')
469
- if 'grn' in k:
470
- k = k.replace('grn.beta', 'mlp.grn.bias')
471
- k = k.replace('grn.gamma', 'mlp.grn.weight')
472
- v = v.reshape(v.shape[-1])
473
- k = k.replace('head.', 'head.fc.')
474
- if k.startswith('norm.'):
475
- k = k.replace('norm', 'head.norm')
476
- if v.ndim == 2 and 'head' not in k:
477
- model_shape = model.state_dict()[k].shape
478
- v = v.reshape(model_shape)
479
- k=k.replace('gamma','weight')
480
- out_dict[k] = v
481
-
482
- return out_dict
483
-
484
-
485
- def _create_convnext(variant, pretrained=False, **kwargs):
486
- if kwargs.get('pretrained_cfg', '') == 'fcmae':
487
- # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
488
- # This is workaround loading with num_classes=0 w/o removing norm-layer.
489
- kwargs.setdefault('pretrained_strict', False)
490
-
491
- model = build_model_with_cfg(
492
- ConvNeXt, variant, pretrained,
493
- pretrained_filter_fn=checkpoint_filter_fn,
494
- feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
495
- **kwargs)
496
- return model
497
-
498
-
499
- def _cfg(url='', **kwargs):
500
- return {
501
- 'url': url,
502
- 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
503
- 'crop_pct': 0.875, 'interpolation': 'bicubic',
504
- 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
505
- 'first_conv': 'stem.0', 'classifier': 'head.fc',
506
- **kwargs
507
- }
508
-
509
-
510
- def _cfgv2(url='', **kwargs):
511
- return {
512
- 'url': url,
513
- 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
514
- 'crop_pct': 0.875, 'interpolation': 'bicubic',
515
- 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
516
- 'first_conv': 'stem.0', 'classifier': 'head.fc',
517
- 'license': 'cc-by-nc-4.0', 'paper_ids': 'arXiv:2301.00808',
518
- 'paper_name': 'ConvNeXt-V2: Co-designing and Scaling ConvNets with Masked Autoencoders',
519
- 'origin_url': 'https://github.com/facebookresearch/ConvNeXt-V2',
520
- **kwargs
521
- }
522
-
523
-
524
- default_cfgs = generate_default_cfgs({
525
- 'convnext_xxlarge.clip_laion2b_soup_ft_in1k': _cfg(
526
- hf_hub_id='timm/',
527
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
528
- input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
529
-
530
- 'convnext_xxlarge.clip_laion2b_soup_ft_in12k': _cfg(
531
- hf_hub_id='timm/',
532
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
533
- input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
534
- 'convnext_xxlarge.clip_laion2b_soup': _cfg(
535
- hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
536
- hf_hub_filename='open_clip_pytorch_model.bin',
537
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
538
- input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
539
- 'convnext_xxlarge.clip_laion2b_rewind': _cfg(
540
- hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
541
- hf_hub_filename='open_clip_pytorch_model.bin',
542
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
543
- input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
544
- })
545
-
546
-
547
-
548
- @register_model
549
- def convnext_xxlarge(pretrained=False, **kwargs) -> ConvNeXt:
550
- model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5))
551
- model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
552
- return model
553
-
554
-
555
-
556
- # register_model_deprecations(__name__, {
557
- # 'convnext_tiny_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k',
558
- # 'convnext_small_in22ft1k': 'convnext_small.fb_in22k_ft_in1k',
559
- # 'convnext_base_in22ft1k': 'convnext_base.fb_in22k_ft_in1k',
560
- # 'convnext_large_in22ft1k': 'convnext_large.fb_in22k_ft_in1k',
561
- # 'convnext_xlarge_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k',
562
- # 'convnext_tiny_384_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k_384',
563
- # 'convnext_small_384_in22ft1k': 'convnext_small.fb_in22k_ft_in1k_384',
564
- # 'convnext_base_384_in22ft1k': 'convnext_base.fb_in22k_ft_in1k_384',
565
- # 'convnext_large_384_in22ft1k': 'convnext_large.fb_in22k_ft_in1k_384',
566
- # 'convnext_xlarge_384_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k_384',
567
- # 'convnext_tiny_in22k': 'convnext_tiny.fb_in22k',
568
- # 'convnext_small_in22k': 'convnext_small.fb_in22k',
569
- # 'convnext_base_in22k': 'convnext_base.fb_in22k',
570
- # 'convnext_large_in22k': 'convnext_large.fb_in22k',
571
- # 'convnext_xlarge_in22k': 'convnext_xlarge.fb_in22k',
572
- # })
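For reference, the only variant registered by the removed file, `convnext_xxlarge`, is also available from upstream timm. A minimal sketch of building the same backbone directly, assuming the installed timm release ships this variant and the CLIP pretrained tags listed in `default_cfgs` above:

```python
import timm
import torch

# Tag taken from `default_cfgs` above; whether it resolves depends on the
# installed timm version.
model = timm.create_model(
    'convnext_xxlarge.clip_laion2b_soup',
    pretrained=False,      # set True to pull the CLIP-trained weights
    num_classes=0,         # feature extraction only, no classifier head
)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 256, 256))   # 256x256 matches the CLIP configs above
print(feats.shape)         # pooled features, 3072-dim for the xxlarge variant
```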
convnext_encoder.py DELETED
@@ -1,301 +0,0 @@
1
- import torch, os
2
- import torch.nn as nn
3
- from timm import create_model
4
- from transformers import CLIPImageProcessor
5
- from .convnext import convnext_xxlarge
6
- from torch.utils.checkpoint import checkpoint
7
- import torch
8
- from torchvision import transforms as T
9
- from PIL import Image
10
-
11
-
12
-
13
- cfg={
14
- "crop_size": 256,
15
- "do_center_crop": True,
16
- "do_normalize": True,
17
- "do_resize": True,
18
- "feature_extractor_type": "CLIPFeatureExtractor",
19
- "image_mean": [
20
- 0.48145466,
21
- 0.4578275,
22
- 0.40821073
23
- ],
24
- "image_std": [
25
- 0.26862954,
26
- 0.26130258,
27
- 0.27577711
28
- ],
29
- "resample": 3,
30
- "size": 256
31
- }
32
-
33
-
34
-
35
- MEAN_SLIP = [0.5, 0.5, 0.5]
36
- STD_SLIP = [0.5, 0.5, 0.5]
37
-
38
- MEAN_CLIP = [0.48145466, 0.4578275, 0.40821073]
39
- STD_CLIP = [0.26862954, 0.26130258, 0.27577711]
40
-
41
-
42
- a = [s_slip / s_clip for s_slip, s_clip in zip(STD_SLIP, STD_CLIP)]
43
- b = [(m_slip - m_clip) / s_clip for m_slip, m_clip, s_clip in zip(MEAN_SLIP, MEAN_CLIP, STD_CLIP)]
44
-
45
-
46
- class SlipToClipTransform:
47
- def __init__(self, a, b):
48
- self.a = torch.tensor(a).view(-1, 1, 1)
49
- self.b = torch.tensor(b).view(-1, 1, 1)
50
-
51
- def __call__(self, x_slip):
52
- return x_slip * self.a.to(x_slip.device) + self.b.to(x_slip.device)
53
- slip_to_clip = SlipToClipTransform(a, b)
54
-
55
- class ConvNextVisionTower(nn.Module):
56
- def __init__(self, vision_tower, args, delay_load=False, normalize_type=None):
57
- super().__init__()
58
-
59
- self.is_loaded = False
60
- self.freeze_vision=args.freeze_vision
61
- self.input_image_size=args.input_image_size
62
- self.vision_tower_name = vision_tower
63
- self.name = 'convnext'
64
- self.select_layer = args.mm_vision_select_layer
65
- self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
66
- self.pre_norm = normalize_type
67
-
68
- print('pre_norm: ', self.pre_norm)
69
- self.delay_load = delay_load
70
- self.load_model()
71
-
72
- def load_model(self):
73
- if 'xxlarge' in self.vision_tower_name:
74
- if self.delay_load:
75
- self.vision_tower = convnext_xxlarge(pretrained=False)
76
- else:
77
- self.vision_tower = convnext_xxlarge(self.vision_tower_name)
78
- setattr(self.vision_tower, 'hidden_size', 3072)
79
- elif os.path.exists(self.vision_tower_name):
80
- self.vision_tower = torch.load(self.vision_tower_name)
81
- else:
82
- assert False, 'Not implemented'
83
-
84
-
85
- self.vision_tower = self.vision_tower.to(torch.bfloat16)
86
-
87
- if self.freeze_vision:
88
- self.vision_tower.requires_grad_(False)
89
-
90
- # if self.vision_tower.grad_checkpointing:
91
- for s in self.vision_tower.stages:
92
- s.grad_checkpointing = True
93
-
94
- self.is_loaded = True
95
-
96
- def feature_select(self, image_forward_outs):
97
-
98
- if self.select_layer>100:
99
- image_features = image_forward_outs[-4:]
100
- else:
101
- image_features = image_forward_outs[-1]
102
- return image_features
103
-
104
- def forward_features(self, x):
105
- x = self.vision_tower.stem(x)
106
- image_forward_out=[]
107
- for blk in self.vision_tower.stages:
108
- x = blk(x)
109
- b,c,h,w=x.shape
110
- image_forward_out.append(x.view(b,c,-1).transpose(1,2))
111
- return image_forward_out
112
-
113
- def forward(self, images):
114
- if self.freeze_vision:
115
- with torch.no_grad():
116
- image_features = self._forward_images(images)
117
- else:
118
- image_features = self._forward_images(images)
119
-
120
- return image_features
121
-
122
- def _forward_images(self, images):
123
-
124
- if type(images) is list:
125
- image_features = []
126
- for image in images:
127
- if self.pre_norm == 'siglip':
128
- dtype = image.dtype
129
- image = slip_to_clip(image.to(torch.float32)).to(dtype)
130
- image_forward_out = self.forward_features(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
131
- image_feature = self.feature_select(image_forward_out)
132
- image_features.append(image_feature)
133
- else:
134
- if self.pre_norm == 'siglip':
135
- dtype = images.dtype
136
- images = slip_to_clip(images.to(torch.float32)).to(dtype)
137
- image_forward_outs = self.forward_features(images.to(device=self.device, dtype=self.dtype))
138
- image_features = self.feature_select(image_forward_outs)
139
-
140
- return image_features
141
-
142
- @property
143
- def dummy_feature(self):
144
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
145
-
146
- @property
147
- def dtype(self):
148
- return next(self.vision_tower.parameters()).dtype
149
-
150
- @property
151
- def device(self):
152
- return next(self.vision_tower.parameters()).device
153
-
154
- @property
155
- def config(self):
156
- assert NotImplementedError
157
- pass
158
-
159
- @property
160
- def num_attention_heads(self):
161
- # as constant
162
- return 16
163
- @property
164
- def num_layers(self):
165
- # as constant
166
- return 4
167
- @property
168
- def hidden_size(self):
169
- return self.vision_tower.hidden_size
170
-
171
- @property
172
- def num_patches(self):
173
- return (self.input_image_size // self.patch_embed.patch_size[0]) ** 2
174
-
175
-
176
- class ConvNextFPNVisionTower(nn.Module):
177
- def __init__(self,
178
- vision_tower,
179
- args,
180
- fpn_target_level=1,
181
- fpn_layer_idx=[1,2,3],
182
- fpn_input_dim=[768,1536,3072],
183
- delay_load=False):
184
-
185
- super().__init__()
186
-
187
- self.is_loaded = False
188
- self.vision_tower_name = vision_tower.replace('-fpn', 'fpn')
189
- self.freeze_vision = getattr(args, "frozen_backbone", True)
190
- # self.input_image_size = getattr(args, "vision_tower_input_size", 1024)
191
- self.input_image_size = 1024 # hardcode
192
- self.select_layer = args.mm_vision_select_layer # no effect
193
- self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
194
-
195
- self.need_fpn = True
196
- self.fpn_layer_idx = fpn_layer_idx # [1, 2, 3] # x8, x16, x32
197
- self.fpn_input_dim = [768, 1536, 3072]
198
- self.delay_load = delay_load
199
- self.load_model()
200
-
201
- def load_model(self):
202
- if self.is_loaded:
203
- return
204
-
205
- self.image_processor = CLIPImageProcessor(**cfg)
206
- if 'xxlarge' in self.vision_tower_name:
207
- self.vision_tower = convnext_xxlarge(self.vision_tower_name)
208
- setattr(self.vision_tower, 'hidden_size', self.fpn_input_dim)
209
- # setattr(self.vision_tower, 'hidden_size', 3072)
210
- else:
211
- self.vision_tower = convnext_large_mlp(self.vision_tower_name)
212
- setattr(self.vision_tower, 'hidden_size', 1536)
213
- if self.freeze_vision:
214
- self.vision_tower.requires_grad_(False)
215
-
216
- # if self.vision_tower.grad_checkpointing:
217
- for s in self.vision_tower.stages:
218
- s.grad_checkpointing = True
219
-
220
- if self.input_image_size is not None:
221
- self.image_processor.size=self.input_image_size
222
- self.image_processor.crop_size={
223
- 'height':self.input_image_size,
224
- 'width': self.input_image_size
225
- }
226
-
227
- self.is_loaded = True
228
-
229
- @torch.no_grad()
230
- def forward_features(self, x):
231
- x = self.vision_tower.stem(x)
232
- image_forward_out=[]
233
- for blk in self.vision_tower.stages:
234
- x = blk(x)
235
- image_forward_out.append(x)
236
- return image_forward_out
237
-
238
- @torch.no_grad()
239
- def forward(self, images):
240
- if type(images) is list:
241
- image_features = []
242
- for image in images:
243
- image_feature = self.forward_features(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
244
- image_features.append(image_feature)
245
- else:
246
- image_features = self.forward_features(images.to(device=self.device, dtype=self.dtype))
247
- image_features = [image_features[idx] for idx in self.fpn_layer_idx]
248
-
249
- return image_features
250
-
251
- @property
252
- def dummy_feature(self):
253
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
254
-
255
- @property
256
- def dtype(self):
257
- return next(self.vision_tower.parameters()).dtype
258
-
259
- @property
260
- def device(self):
261
- return next(self.vision_tower.parameters()).device
262
-
263
- @property
264
- def config(self):
265
- assert NotImplementedError
266
- pass
267
-
268
- @property
269
- def num_attention_heads(self):
270
- # as constant
271
- return 16
272
- @property
273
- def num_layers(self):
274
- # as constant
275
- return 4
276
- @property
277
- def hidden_size(self):
278
- return self.vision_tower.hidden_size
279
-
280
- @property
281
- def num_patches(self):
282
- return (cfg['image_size'] // self.patch_embed.patch_size[0]) ** 2
283
-
284
- if __name__ == '__main__':
285
- COMBINED_STD = [s_slip / s_clip for s_slip, s_clip in zip(STD_SigLIP, STD_CLIP)]
286
- COMBINED_MEAN = [(m_slip - m_clip) / s_clip for m_slip, m_clip, s_clip in zip(MEAN_SigLIP, MEAN_CLIP, STD_CLIP)]
287
-
288
- # Define the combined normalization transform
289
- combined_normalize = T.Normalize(mean=COMBINED_MEAN, std=COMBINED_STD)
290
- x = torch.randn(1, 3, 256, 256).cuda()
291
- a = normalize_clip(x).to(torch.bfloat16)
292
- b = normalize_siglip(x).to(torch.bfloat16)
293
- c = denormalize_siglip(b.to(torch.float32))
294
- c2 = normalize_clip(c).to(torch.bfloat16)
295
- c3 = combined_normalize(b)
296
- print((c-x).abs().max())
297
- print((c2-a).abs().max())
298
- print((c3-a).abs().max())
299
- from IPython import embed
300
- embed()
301
- exit()
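The `a`/`b` coefficients in the removed `SlipToClipTransform` are just the affine map that turns a SigLIP-normalized tensor into a CLIP-normalized one. A small self-contained check, using the mean/std constants from the file above:

```python
# If x_s = (x - mean_s) / std_s, then re-normalizing the raw pixels with CLIP
# stats gives (x - mean_c) / std_c = x_s * (std_s / std_c) + (mean_s - mean_c) / std_c,
# which is exactly the a / b pair computed above.
import torch

MEAN_S, STD_S = torch.tensor([0.5, 0.5, 0.5]), torch.tensor([0.5, 0.5, 0.5])
MEAN_C = torch.tensor([0.48145466, 0.4578275, 0.40821073])
STD_C = torch.tensor([0.26862954, 0.26130258, 0.27577711])

a = (STD_S / STD_C).view(-1, 1, 1)
b = ((MEAN_S - MEAN_C) / STD_C).view(-1, 1, 1)

x = torch.rand(3, 8, 8)                                   # raw pixels in [0, 1]
x_siglip = (x - MEAN_S.view(-1, 1, 1)) / STD_S.view(-1, 1, 1)
x_clip_direct = (x - MEAN_C.view(-1, 1, 1)) / STD_C.view(-1, 1, 1)
x_clip_affine = x_siglip * a + b                          # what SlipToClipTransform computed

print(torch.allclose(x_clip_direct, x_clip_affine, atol=1e-6))  # True
```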
demo.py CHANGED
@@ -237,7 +237,7 @@ class ModelWorker:
237
  self.norm_type = 'siglip'
238
  else:
239
  self.norm_type = 'imagenet'
240
-
241
  if any(x in model_path.lower() for x in ['34b']):
242
  device_map = split_model(model_path, self.device)
243
  else:
@@ -261,7 +261,7 @@ class ModelWorker:
261
  self.image_size = self.model.config.force_image_size
262
  self.context_len = tokenizer.model_max_length
263
  self.per_tile_len = 256
264
-
265
  def reload_model(self):
266
  del self.model
267
  torch.cuda.empty_cache()
@@ -297,6 +297,7 @@ class ModelWorker:
297
 
298
  global_image_cnt = 0
299
  history, pil_images, max_input_tile_list = [], [], []
 
300
  for message in send_messages:
301
  if message['role'] == 'user':
302
  prefix = ''
@@ -341,6 +342,7 @@ class ModelWorker:
341
  max_input_tiles_limited_by_contect = params['max_input_tiles']
342
  while True:
343
  image_tiles = []
 
344
  for current_max_input_tiles, pil_image in zip(max_input_tile_list, pil_images):
345
  if self.model.config.dynamic_image_size:
346
  tiles = dynamic_preprocess(
@@ -348,6 +350,7 @@ class ModelWorker:
348
  use_thumbnail=self.model.config.use_thumbnail)
349
  else:
350
  tiles = [pil_image]
 
351
  image_tiles += tiles
352
  if (len(image_tiles) * self.per_tile_len < self.context_len):
353
  break
@@ -358,6 +361,8 @@ class ModelWorker:
358
  break
359
 
360
  pixel_values = [transform(item) for item in image_tiles]
 
 
361
  pixel_values = torch.stack(pixel_values).to(self.model.device, dtype=torch.bfloat16)
362
 
363
  else:
@@ -372,13 +377,14 @@ class ModelWorker:
372
  max_length=self.context_len,
373
  top_p=top_p,
374
  )
375
-
376
  response = self.model.chat(
377
  tokenizer=self.tokenizer,
378
  pixel_values=pixel_values,
379
  question=question,
380
  history=history,
381
  return_history=False,
 
382
  generation_config=generation_config,
383
  )
384
  self.model.system_message = old_system_message
@@ -390,8 +396,8 @@ class ModelWorker:
390
 
391
  if __name__ == '__main__':
392
  parser = argparse.ArgumentParser()
393
- parser.add_argument('--model-path', type=str, default='nvidia/Eagle2-2B')
394
- parser.add_argument('--model-name', type=str, default='Eagle2-2B')
395
  parser.add_argument('--device', type=str, default='cuda')
396
  parser.add_argument('--load-8bit', action='store_true')
397
  args = parser.parse_args()
@@ -404,9 +410,10 @@ if __name__ == '__main__':
404
  args.device)
405
  prompt = [
406
  {'role': 'system', 'content': 'You are a helpful assistant.'},
407
- {'role': 'user', 'content': 'Describe this image in details.',
408
  'image':[
409
- {'url': 'https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/[email protected]'}
 
410
  ]
411
  }
412
  ]
 
237
  self.norm_type = 'siglip'
238
  else:
239
  self.norm_type = 'imagenet'
240
+ print('norm_type: ', self.norm_type)
241
  if any(x in model_path.lower() for x in ['34b']):
242
  device_map = split_model(model_path, self.device)
243
  else:
 
261
  self.image_size = self.model.config.force_image_size
262
  self.context_len = tokenizer.model_max_length
263
  self.per_tile_len = 256
264
+ print(self.model)
265
  def reload_model(self):
266
  del self.model
267
  torch.cuda.empty_cache()
 
297
 
298
  global_image_cnt = 0
299
  history, pil_images, max_input_tile_list = [], [], []
300
+
301
  for message in send_messages:
302
  if message['role'] == 'user':
303
  prefix = ''
 
342
  max_input_tiles_limited_by_contect = params['max_input_tiles']
343
  while True:
344
  image_tiles = []
345
+ num_patches_list = []
346
  for current_max_input_tiles, pil_image in zip(max_input_tile_list, pil_images):
347
  if self.model.config.dynamic_image_size:
348
  tiles = dynamic_preprocess(
 
350
  use_thumbnail=self.model.config.use_thumbnail)
351
  else:
352
  tiles = [pil_image]
353
+ num_patches_list.append(len(tiles))
354
  image_tiles += tiles
355
  if (len(image_tiles) * self.per_tile_len < self.context_len):
356
  break
 
361
  break
362
 
363
  pixel_values = [transform(item) for item in image_tiles]
364
+
365
+
366
  pixel_values = torch.stack(pixel_values).to(self.model.device, dtype=torch.bfloat16)
367
 
368
  else:
 
377
  max_length=self.context_len,
378
  top_p=top_p,
379
  )
380
+ print(f'pixel_values: {pixel_values.shape}')
381
  response = self.model.chat(
382
  tokenizer=self.tokenizer,
383
  pixel_values=pixel_values,
384
  question=question,
385
  history=history,
386
  return_history=False,
387
+ num_patches_list=num_patches_list,
388
  generation_config=generation_config,
389
  )
390
  self.model.system_message = old_system_message
 
396
 
397
  if __name__ == '__main__':
398
  parser = argparse.ArgumentParser()
399
+ parser.add_argument('--model-path', type=str, default='/home/zhidingy/workspace/eagle-next/internvl_chat/work_dirs/release/test/Eagle2-2B')
400
+ parser.add_argument('--model-name', type=str, default='Eagle2')
401
  parser.add_argument('--device', type=str, default='cuda')
402
  parser.add_argument('--load-8bit', action='store_true')
403
  args = parser.parse_args()
 
410
  args.device)
411
  prompt = [
412
  {'role': 'system', 'content': 'You are a helpful assistant.'},
413
+ {'role': 'user', 'content': 'Describe these two images in details respectively.',
414
  'image':[
415
+ {'url': 'https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/[email protected]'},
416
+ {'url': "https://www.google.com.hk/images/branding/googlelogo/2x/googlelogo_color_272x92dp.png"}
417
  ]
418
  }
419
  ]
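The updated demo collects a per-image tile count (`num_patches_list`) and forwards it to `model.chat` together with the stacked tiles. A hedged sketch of the same calling pattern outside the `ModelWorker` class; `preprocess` below is a simplified stand-in for demo.py's `dynamic_preprocess` + `transform` pipeline, and the checkpoint id is a placeholder:

```python
import torch
from PIL import Image
from torchvision import transforms as T
from transformers import AutoModel, AutoTokenizer

path = 'nvidia/Eagle2-2B'   # placeholder checkpoint id; substitute a local path if needed
model = AutoModel.from_pretrained(path, trust_remote_code=True,
                                  torch_dtype=torch.bfloat16).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

def preprocess(img: Image.Image):
    """Simplified stand-in for demo.py's dynamic_preprocess + transform steps:
    every image becomes a single tile (no dynamic tiling). 448 is illustrative;
    use model.config.force_image_size in practice."""
    tfm = T.Compose([T.Resize((448, 448)), T.ToTensor(),
                     T.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
    return [tfm(img.convert('RGB'))]

num_patches_list, all_tiles = [], []
for img in (Image.open('a.png'), Image.open('b.png')):    # any two local images
    tiles = preprocess(img)
    num_patches_list.append(len(tiles))                   # one entry per image, as in demo.py
    all_tiles.extend(tiles)

pixel_values = torch.stack(all_tiles).to(model.device, dtype=torch.bfloat16)
response = model.chat(
    tokenizer=tokenizer,
    pixel_values=pixel_values,
    question='<image>\n<image>\nDescribe these two images.',
    history=[],
    return_history=False,
    num_patches_list=num_patches_list,
    generation_config=dict(do_sample=False, max_new_tokens=256),
)
print(response)
```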
flash_attention.py DELETED
@@ -1,76 +0,0 @@
1
- # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
- import torch
3
- import torch.nn as nn
4
- from einops import rearrange
5
-
6
- try: # v1
7
- from flash_attn.flash_attn_interface import \
8
- flash_attn_unpadded_qkvpacked_func
9
- except: # v2
10
- from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
-
12
- from flash_attn.bert_padding import pad_input, unpad_input
13
-
14
-
15
- class FlashAttention(nn.Module):
16
- """Implement the scaled dot product attention with softmax.
17
- Arguments
18
- ---------
19
- softmax_scale: The temperature to use for the softmax attention.
20
- (default: 1/sqrt(d_keys) where d_keys is computed at
21
- runtime)
22
- attention_dropout: The dropout rate to apply to the attention
23
- (default: 0.0)
24
- """
25
-
26
- def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
27
- super().__init__()
28
- self.softmax_scale = softmax_scale
29
- self.dropout_p = attention_dropout
30
-
31
- def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
32
- max_s=None, need_weights=False):
33
- """Implements the multihead softmax attention.
34
- Arguments
35
- ---------
36
- qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
37
- if unpadded: (nnz, 3, h, d)
38
- key_padding_mask: a bool tensor of shape (B, S)
39
- """
40
- assert not need_weights
41
- assert qkv.dtype in [torch.float16, torch.bfloat16]
42
- assert qkv.is_cuda
43
-
44
- if cu_seqlens is None:
45
- batch_size = qkv.shape[0]
46
- seqlen = qkv.shape[1]
47
- if key_padding_mask is None:
48
- qkv = rearrange(qkv, 'b s ... -> (b s) ...')
49
- max_s = seqlen
50
- cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
51
- device=qkv.device)
52
- output = flash_attn_unpadded_qkvpacked_func(
53
- qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
54
- softmax_scale=self.softmax_scale, causal=causal
55
- )
56
- output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
57
- else:
58
- nheads = qkv.shape[-2]
59
- x = rearrange(qkv, 'b s three h d -> b s (three h d)')
60
- x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
61
- x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
62
- output_unpad = flash_attn_unpadded_qkvpacked_func(
63
- x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
64
- softmax_scale=self.softmax_scale, causal=causal
65
- )
66
- output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
67
- indices, batch_size, seqlen),
68
- 'b s (h d) -> b s h d', h=nheads)
69
- else:
70
- assert max_s is not None
71
- output = flash_attn_unpadded_qkvpacked_func(
72
- qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
73
- softmax_scale=self.softmax_scale, causal=causal
74
- )
75
-
76
- return output, None
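With the vendored wrapper removed and `_supports_flash_attn_2 = True` declared on the model class (see the modeling diff below), FlashAttention-2 can be requested through transformers itself at load time. A minimal sketch, assuming a transformers release that accepts the `attn_implementation` argument and that the `flash-attn` package is installed:

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    'nvidia/Eagle2-2B',                       # placeholder checkpoint id
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,               # FA2 kernels require fp16/bf16 on CUDA
    attn_implementation='flash_attention_2',
).cuda().eval()
```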
modeling_eagle_chat.py CHANGED
@@ -11,26 +11,18 @@ import torch.utils.checkpoint
11
  import transformers
12
  from torch import nn
13
  from torch.nn import CrossEntropyLoss
14
- from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
15
- LlamaTokenizer)
16
  from transformers.modeling_outputs import CausalLMOutputWithPast
17
  from transformers.modeling_utils import PreTrainedModel
18
  from transformers.utils import ModelOutput, logging
19
  from peft import LoraConfig, get_peft_model
20
- from .configuration_eagle_chat import Eagle2ChatConfig
21
- from .conversation import get_conv_template
22
- from .modeling_siglip import SiglipVisionModel
23
- from .modeling_qwen2 import Qwen2ForCausalLM
24
- from .flash_attention import *
25
- from .multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModel
26
- from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
27
- from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
28
- from .siglip_vision_tower import SiglipVisionTower
29
- from .convnext_encoder import ConvNextVisionTower
30
- from .convnext import ConvNeXt
31
 
32
- logger = logging.get_logger(__name__)
33
 
 
 
34
 
35
  def version_cmp(v1, v2, op='eq'):
36
  import operator
@@ -44,25 +36,25 @@ class Eagle2ChatModel(PreTrainedModel):
44
  config_class = Eagle2ChatConfig
45
  main_input_name = 'pixel_values'
46
  _no_split_modules = ['LlamaDecoderLayer']
47
-
 
 
 
 
 
 
 
48
  def __init__(self, config: Eagle2ChatConfig, vision_model=None, language_model=None):
49
  super().__init__(config)
50
 
51
- assert version_cmp(transformers.__version__, '4.37.2', 'ge')
52
- assert version_cmp(transformers.__version__, '4.39.2', 'le')
53
  image_size = config.force_image_size or config.vision_config.image_size
54
- if hasattr(config.vision_config, 'grid_size'):
55
- grid_size = config.vision_config.grid_size
56
- self.patch_size = 14
57
- self.num_image_token = int((grid_size * config.downsample_ratio) ** 2)
58
- else:
59
- patch_size = config.vision_config.patch_size
60
- self.patch_size = patch_size
61
- self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
62
 
63
  self.select_layer = config.select_layer
64
  self.template = config.template
65
-
66
  self.downsample_ratio = config.downsample_ratio
67
 
68
  logger.info(f'num_image_token: {self.num_image_token}')
@@ -70,9 +62,9 @@ class Eagle2ChatModel(PreTrainedModel):
70
  self.vision_model = vision_model
71
  else:
72
  if config.vision_config.model_type == 'siglip_vision_model':
 
 
73
  self.vision_model = SiglipVisionModel(config.vision_config)
74
- elif config.vision_config.model_type.startswith("MOB"):
75
- self.vision_model = MultiBackboneChannelConcatenationVisionModel(config.vision_config, config)
76
 
77
  if language_model is not None:
78
  self.language_model = language_model
@@ -85,35 +77,17 @@ class Eagle2ChatModel(PreTrainedModel):
85
  raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
86
 
87
  vit_hidden_size = config.vision_config.hidden_size
88
- if vit_hidden_size == 'lazy_calculation':
89
- # a hack for Mixture of Backbones
90
- vit_hidden_size = self.vision_model.hidden_size
91
- print("The lazy calculated hidden_size: {} .. ".format(vit_hidden_size))
92
  llm_hidden_size = config.llm_config.hidden_size
93
- self.moe_version_type = getattr(config.vision_config, 'moe_version_type', None)
94
-
95
- if self.moe_version_type in ['seq_concat', 'feat_concat']:
96
- raise NotImplementedError
97
- elif self.moe_version_type == 'convnext_512_siglip_448':
98
- convnext_hidden_size = vit_hidden_size['convnext']
99
- siglip_hidden_size = vit_hidden_size['siglip']
100
- feature_concat_hidden_size = convnext_hidden_size + siglip_hidden_size * int(1 / self.downsample_ratio) ** 2
101
- self.mlp1 = nn.Sequential(
102
- nn.LayerNorm(feature_concat_hidden_size),
103
- nn.Linear(feature_concat_hidden_size, llm_hidden_size),
104
- nn.GELU(),
105
- nn.Linear(llm_hidden_size, llm_hidden_size)
106
- )
107
- else:
108
- self.mlp1 = nn.Sequential(
109
  nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
110
  nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
111
  nn.GELU(),
112
  nn.Linear(llm_hidden_size, llm_hidden_size)
113
  )
114
  self.img_context_token_id = None
115
- self.conv_template = get_conv_template(self.template)
116
- self.system_message = self.conv_template.system_message
117
 
118
  if config.use_backbone_lora:
119
  self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
@@ -165,19 +139,13 @@ class Eagle2ChatModel(PreTrainedModel):
165
  image_flags = image_flags.squeeze(-1)
166
  input_embeds = self.language_model.get_input_embeddings()(input_ids)
167
 
168
-
169
- if self.moe_version_type in ['seq_concat', 'feat_concat'] and not isinstance(pixel_values, dict):
170
- raise NotImplementedError
171
  vit_embeds = self.extract_feature(pixel_values)
172
 
173
  if not isinstance(image_flags, list):
174
  image_flags = image_flags.squeeze(-1)
175
  vit_embeds = vit_embeds[image_flags == 1]
176
- if isinstance(pixel_values, dict):
177
- # for MOE
178
- vit_batch_size = sum(pixel_values['num_patches'])
179
- else:
180
- vit_batch_size = pixel_values.shape[0]
181
 
182
  B, N, C = input_embeds.shape
183
  input_embeds = input_embeds.reshape(B * N, C)
@@ -206,7 +174,6 @@ class Eagle2ChatModel(PreTrainedModel):
206
  use_cache=use_cache,
207
  output_attentions=output_attentions,
208
  output_hidden_states=output_hidden_states,
209
- return_dict=return_dict,
210
  )
211
  logits = outputs.logits
212
 
@@ -248,7 +215,6 @@ class Eagle2ChatModel(PreTrainedModel):
248
  return x
249
 
250
  def extract_feature(self, pixel_values):
251
-
252
  """
253
  """
254
 
@@ -256,8 +222,10 @@ class Eagle2ChatModel(PreTrainedModel):
256
  vit_embeds = self.vision_model(
257
  pixel_values=pixel_values,
258
  output_hidden_states=False,
259
- return_dict=True).last_hidden_state # torch.Size([B, 1025, 1024])
260
-
 
 
261
  else:
262
  vit_embeds = self.vision_model(
263
  pixel_values=pixel_values,
@@ -265,35 +233,24 @@ class Eagle2ChatModel(PreTrainedModel):
265
  return_dict=True).hidden_states[self.select_layer]
266
  if type(self.vision_model) == SiglipVisionModel:
267
  pass
268
- elif type(self.vision_model) == MultiBackboneChannelConcatenationVisionModel:
269
- pass
270
  else:
271
  vit_embeds = vit_embeds[:, 1:, :] # torch.Size([B, 1024, 1024])
272
 
273
  if self.training and self.neftune_alpha is not None:
274
  vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
275
 
276
- if self.moe_version_type in ['feat_concat', 'seq_concat']:
277
- raise NotImplementedError
278
- elif self.moe_version_type == 'convnext_512_siglip_448':
279
- siglip_embeds = vit_embeds['siglip']
280
- convnext_embeds = vit_embeds['convnext']
281
- h = w = int(siglip_embeds.shape[1] ** 0.5)
282
- siglip_embeds = siglip_embeds.reshape(siglip_embeds.shape[0], h, w, -1)
283
- siglip_embeds = self.pixel_shuffle(siglip_embeds, scale_factor=self.downsample_ratio)
284
- siglip_embeds = siglip_embeds.reshape(siglip_embeds.shape[0], -1, siglip_embeds.shape[-1])
285
- vit_embeds = self.mlp1(torch.cat([siglip_embeds, convnext_embeds], dim=-1))
286
- else:
287
- h = w = int(vit_embeds.shape[1] ** 0.5)
288
- vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
289
 
290
- vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) # torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096])
291
- vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) # torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096])
292
- vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
 
 
 
293
 
294
  return vit_embeds
295
 
296
- def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
 
297
  history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
298
  IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
299
  if history is not None or return_history:
@@ -316,10 +273,11 @@ class Eagle2ChatModel(PreTrainedModel):
316
  question = questions[idx]
317
  if pixel_values is not None and '<image>' not in question:
318
  question = '<image>\n' + question
319
- template = get_conv_template(self.template)
320
- template.append_message(template.roles[0], question)
321
- template.append_message(template.roles[1], None)
322
- query = template.get_prompt()
 
323
 
324
  image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
325
  query = query.replace('<image>', image_tokens, 1)
@@ -329,7 +287,7 @@ class Eagle2ChatModel(PreTrainedModel):
329
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
330
  input_ids = model_inputs['input_ids'].cuda()
331
  attention_mask = model_inputs['attention_mask'].cuda()
332
- eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
333
  generation_config['eos_token_id'] = eos_token_id
334
  generation_output = self.generate(
335
  pixel_values=pixel_values,
@@ -338,7 +296,7 @@ class Eagle2ChatModel(PreTrainedModel):
338
  **generation_config
339
  )
340
  responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
341
- responses = [response.split(template.sep)[0].strip() for response in responses]
342
  return responses
343
 
344
  def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -355,17 +313,18 @@ class Eagle2ChatModel(PreTrainedModel):
355
  img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
356
  self.img_context_token_id = img_context_token_id
357
 
358
- template = get_conv_template(self.template)
359
- template.system_message = self.system_message
360
- eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
 
361
 
362
  history = [] if history is None else history
363
  for (old_question, old_answer) in history:
364
- template.append_message(template.roles[0], old_question)
365
- template.append_message(template.roles[1], old_answer)
366
- template.append_message(template.roles[0], question)
367
- template.append_message(template.roles[1], None)
368
- query = template.get_prompt()
369
 
370
  if verbose and pixel_values is not None:
371
  image_bs = pixel_values.shape[0]
@@ -382,11 +341,6 @@ class Eagle2ChatModel(PreTrainedModel):
382
  input_ids = model_inputs['input_ids'].cuda()
383
  attention_mask = model_inputs['attention_mask'].cuda()
384
  generation_config['eos_token_id'] = eos_token_id
385
- if self.moe_version_type is not None and self.moe_version_type != 'all_tiling' and self.moe_version_type != 'convnext_512_siglip_448':
386
- pixel_values = {
387
- 'pixel_values': pixel_values,
388
- 'num_patches': num_patches_list # num patch of each image.
389
- }
390
  generation_output = self.generate(
391
  pixel_values=pixel_values,
392
  input_ids=input_ids,
@@ -394,7 +348,7 @@ class Eagle2ChatModel(PreTrainedModel):
394
  **generation_config
395
  )
396
  response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
397
- response = response.split(template.sep)[0].strip()
398
  history.append((question, response))
399
  if return_history:
400
  return response, history
@@ -405,6 +359,17 @@ class Eagle2ChatModel(PreTrainedModel):
405
  print(query_to_print, response)
406
  return response
407
 
 
 
 
 
 
 
 
 
 
 
 
408
  @torch.no_grad()
409
  def generate(
410
  self,
@@ -443,7 +408,6 @@ class Eagle2ChatModel(PreTrainedModel):
443
  attention_mask=attention_mask,
444
  generation_config=generation_config,
445
  output_hidden_states=output_hidden_states,
446
- return_dict=return_dict,
447
  use_cache=True,
448
  **generate_kwargs,
449
  )
 
11
  import transformers
12
  from torch import nn
13
  from torch.nn import CrossEntropyLoss
14
+ from transformers import (AutoModel, GenerationConfig,
15
+ LlamaTokenizer, LlamaForCausalLM)
16
  from transformers.modeling_outputs import CausalLMOutputWithPast
17
  from transformers.modeling_utils import PreTrainedModel
18
  from transformers.utils import ModelOutput, logging
19
  from peft import LoraConfig, get_peft_model
20
+ from transformers.models.siglip.modeling_siglip import SiglipVisionModel
 
 
 
 
 
 
 
 
 
 
21
 
22
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
23
 
24
+ logger = logging.get_logger(__name__)
25
+ from .configuration_eagle_chat import Eagle2ChatConfig
26
 
27
  def version_cmp(v1, v2, op='eq'):
28
  import operator
 
36
  config_class = Eagle2ChatConfig
37
  main_input_name = 'pixel_values'
38
  _no_split_modules = ['LlamaDecoderLayer']
39
+ _supports_flash_attn_2 = True
40
+ _supports_sdpa = True
41
+ _supports_flex_attn = False
42
+ _supports_cache_class = False
43
+ _supports_quantized_cache = False
44
+ _supports_static_cache = False
45
+ _supports_attention_backend = False
46
+
47
  def __init__(self, config: Eagle2ChatConfig, vision_model=None, language_model=None):
48
  super().__init__(config)
49
 
 
 
50
  image_size = config.force_image_size or config.vision_config.image_size
51
+
52
+ patch_size = config.vision_config.patch_size
53
+ self.patch_size = patch_size
54
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
 
 
 
 
55
 
56
  self.select_layer = config.select_layer
57
  self.template = config.template
 
58
  self.downsample_ratio = config.downsample_ratio
59
 
60
  logger.info(f'num_image_token: {self.num_image_token}')
 
62
  self.vision_model = vision_model
63
  else:
64
  if config.vision_config.model_type == 'siglip_vision_model':
65
+ if version_cmp(transformers.__version__, '4.43.0', 'le'):
66
+ config.vision_config._attn_implementation = 'eager'
67
  self.vision_model = SiglipVisionModel(config.vision_config)
 
 
68
 
69
  if language_model is not None:
70
  self.language_model = language_model
 
77
  raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
78
 
79
  vit_hidden_size = config.vision_config.hidden_size
80
+
 
 
 
81
  llm_hidden_size = config.llm_config.hidden_size
82
+
83
+ self.mlp1 = nn.Sequential(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
85
  nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
86
  nn.GELU(),
87
  nn.Linear(llm_hidden_size, llm_hidden_size)
88
  )
89
  self.img_context_token_id = None
90
+ self.system_message = 'You are a helpful assistant.' # Default system message
 
91
 
92
  if config.use_backbone_lora:
93
  self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
 
139
  image_flags = image_flags.squeeze(-1)
140
  input_embeds = self.language_model.get_input_embeddings()(input_ids)
141
 
 
 
 
142
  vit_embeds = self.extract_feature(pixel_values)
143
 
144
  if not isinstance(image_flags, list):
145
  image_flags = image_flags.squeeze(-1)
146
  vit_embeds = vit_embeds[image_flags == 1]
147
+
148
+ vit_batch_size = pixel_values.shape[0]
 
 
 
149
 
150
  B, N, C = input_embeds.shape
151
  input_embeds = input_embeds.reshape(B * N, C)
 
174
  use_cache=use_cache,
175
  output_attentions=output_attentions,
176
  output_hidden_states=output_hidden_states,
 
177
  )
178
  logits = outputs.logits
179
 
 
215
  return x
216
 
217
  def extract_feature(self, pixel_values):
 
218
  """
219
  """
220
 
 
222
  vit_embeds = self.vision_model(
223
  pixel_values=pixel_values,
224
  output_hidden_states=False,
225
+ return_dict=True)
226
+ # if there is vit_embeds.last_hidden_state, use it.
227
+ if hasattr(vit_embeds, 'last_hidden_state'):
228
+ vit_embeds = vit_embeds.last_hidden_state
229
  else:
230
  vit_embeds = self.vision_model(
231
  pixel_values=pixel_values,
 
233
  return_dict=True).hidden_states[self.select_layer]
234
  if type(self.vision_model) == SiglipVisionModel:
235
  pass
 
 
236
  else:
237
  vit_embeds = vit_embeds[:, 1:, :] # torch.Size([B, 1024, 1024])
238
 
239
  if self.training and self.neftune_alpha is not None:
240
  vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
+ h = w = int(vit_embeds.shape[1] ** 0.5)
244
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
245
+
246
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) # torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096])
247
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) # torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096])
248
+ vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
249
 
250
  return vit_embeds
251
 
252
+ def batch_chat(self,
253
+ tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
254
  history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
255
  IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
256
  if history is not None or return_history:
 
273
  question = questions[idx]
274
  if pixel_values is not None and '<image>' not in question:
275
  question = '<image>\n' + question
276
+ template_messages = []
277
+ sep = tokenizer.eos_token
278
+ template_messages.append(('<|im_start|>user', question))
279
+ template_messages.append(('<|im_end|>assistant', None))
280
+ query = self.get_prompt(self.system_message, template_messages, sep)
281
 
282
  image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
283
  query = query.replace('<image>', image_tokens, 1)
 
287
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
288
  input_ids = model_inputs['input_ids'].cuda()
289
  attention_mask = model_inputs['attention_mask'].cuda()
290
+ eos_token_id = tokenizer.convert_tokens_to_ids(sep)
291
  generation_config['eos_token_id'] = eos_token_id
292
  generation_output = self.generate(
293
  pixel_values=pixel_values,
 
296
  **generation_config
297
  )
298
  responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
299
+ responses = [response.split(sep)[0].strip() for response in responses]
300
  return responses
301
 
302
  def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
 
313
  img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
314
  self.img_context_token_id = img_context_token_id
315
 
316
+ template_messages = []
317
+ system_message = f'<|im_start|>system\n{self.system_message}'
318
+ sep = tokenizer.eos_token
319
+ eos_token_id = tokenizer.convert_tokens_to_ids(sep)
320
 
321
  history = [] if history is None else history
322
  for (old_question, old_answer) in history:
323
+ template_messages.append(('<|im_start|>user', old_question))
324
+ template_messages.append(('<|im_start|>assistant', old_answer))
325
+ template_messages.append(('<|im_start|>user', question))
326
+ template_messages.append(('<|im_end|>assistant', None))
327
+ query = self.get_prompt(system_message, template_messages, sep)
328
 
329
  if verbose and pixel_values is not None:
330
  image_bs = pixel_values.shape[0]
 
341
  input_ids = model_inputs['input_ids'].cuda()
342
  attention_mask = model_inputs['attention_mask'].cuda()
343
  generation_config['eos_token_id'] = eos_token_id
 
 
 
 
 
344
  generation_output = self.generate(
345
  pixel_values=pixel_values,
346
  input_ids=input_ids,
 
348
  **generation_config
349
  )
350
  response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
351
+ response = response.split(sep)[0].strip()
352
  history.append((question, response))
353
  if return_history:
354
  return response, history
 
359
  print(query_to_print, response)
360
  return response
361
 
362
+ def get_prompt(self, system_prompt, messages, sep) -> str:
363
+ """Get the prompt for generation."""
364
+
365
+ ret = '' if system_prompt == '' else system_prompt + sep + '\n'
366
+ for role, message in messages:
367
+ if message:
368
+ ret += role + '\n' + message + sep + '\n'
369
+ else:
370
+ ret += role + '\n'
371
+ return ret
372
+
373
  @torch.no_grad()
374
  def generate(
375
  self,
 
408
  attention_mask=attention_mask,
409
  generation_config=generation_config,
410
  output_hidden_states=output_hidden_states,
 
411
  use_cache=True,
412
  **generate_kwargs,
413
  )
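With the conversation-template import gone, prompts are now assembled inline by the new `get_prompt` helper, using the tokenizer's EOS token as the turn separator. A standalone sketch of the string it produces for a single user turn; the role markers below are illustrative ChatML-style strings (see the code above for the exact roles used):

```python
# Mirrors the get_prompt() logic added above, without the model class.
def get_prompt(system_prompt: str, messages, sep: str) -> str:
    ret = '' if system_prompt == '' else system_prompt + sep + '\n'
    for role, message in messages:
        if message:
            ret += role + '\n' + message + sep + '\n'
        else:
            ret += role + '\n'
    return ret

sep = '<|im_end|>'                       # stand-in for tokenizer.eos_token
system = '<|im_start|>system\nYou are a helpful assistant.'
messages = [('<|im_start|>user', 'Describe this image.'),
            ('<|im_start|>assistant', None)]
print(get_prompt(system, messages, sep))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Describe this image.<|im_end|>
# <|im_start|>assistant
```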
modeling_qwen2.py DELETED
@@ -1,1744 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
- """ PyTorch Qwen2 model."""
21
- import inspect
22
- import math
23
- import warnings
24
- from typing import List, Optional, Tuple, Union
25
-
26
- import torch
27
- import torch.nn.functional as F
28
- import torch.utils.checkpoint
29
- from torch import nn
30
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
31
-
32
- from transformers.activations import ACT2FN
33
- from transformers.cache_utils import Cache, DynamicCache
34
- from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
35
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
36
- from transformers.modeling_utils import PreTrainedModel
37
- from transformers.utils import (
38
- add_start_docstrings,
39
- add_start_docstrings_to_model_forward,
40
- is_flash_attn_2_available,
41
- is_flash_attn_greater_or_equal_2_10,
42
- logging,
43
- replace_return_docstrings,
44
- )
45
- from .configuration_qwen2 import Qwen2Config
46
-
47
-
48
- if is_flash_attn_2_available():
49
- from flash_attn import flash_attn_func, flash_attn_varlen_func
50
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
51
-
52
- _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
53
-
54
-
55
- logger = logging.get_logger(__name__)
56
-
57
-
58
- _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
59
- _CONFIG_FOR_DOC = "Qwen2Config"
60
-
61
- QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
62
- "Qwen/Qwen2-7B-beta",
63
- # See all Qwen2 models at https://huggingface.co/models?filter=qwen2
64
- ]
65
-
66
-
67
- # Copied from transformers.models.llama.modeling_llama._get_unpad_data
68
- def _get_unpad_data(attention_mask):
69
- seqlens_in_batch = (attention_mask>0).sum(dim=-1, dtype=torch.int32)
70
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
71
- max_seqlen_in_batch = seqlens_in_batch.max().item()
72
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
73
- return (
74
- indices,
75
- cu_seqlens,
76
- max_seqlen_in_batch,
77
- )
78
-
79
- def _get_unpad_data_packing(attention_mask, sub_sample_lengths):
80
- seqlens_in_batch = []
81
- for i, per_sub_sample_lengths in enumerate(sub_sample_lengths):
82
- if (attention_mask[i]==0).sum() == per_sub_sample_lengths[-1]:
83
- per_sub_sample_lengths = per_sub_sample_lengths[:-1]
84
- seqlens_in_batch.extend(per_sub_sample_lengths)
85
- seqlens_in_batch = torch.tensor(seqlens_in_batch, device=attention_mask.device, dtype=torch.int32)
86
-
87
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
88
- max_seqlen_in_batch = seqlens_in_batch.max().item()
89
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
90
- return (
91
- indices,
92
- cu_seqlens,
93
- max_seqlen_in_batch,
94
- )
95
-
96
- # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
97
- class Qwen2RMSNorm(nn.Module):
98
- def __init__(self, hidden_size, eps=1e-6):
99
- """
100
- Qwen2RMSNorm is equivalent to T5LayerNorm
101
- """
102
- super().__init__()
103
- self.weight = nn.Parameter(torch.ones(hidden_size))
104
- self.variance_epsilon = eps
105
-
106
- def forward(self, hidden_states):
107
- input_dtype = hidden_states.dtype
108
- hidden_states = hidden_states.to(torch.float32)
109
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
110
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
111
- return self.weight * hidden_states.to(input_dtype)
112
-
113
-
114
- # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
115
- class Qwen2RotaryEmbedding(nn.Module):
116
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
117
- super().__init__()
118
-
119
- self.dim = dim
120
- self.max_position_embeddings = max_position_embeddings
121
- self.base = base
122
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
123
- self.register_buffer("inv_freq", inv_freq, persistent=False)
124
-
125
- # Build here to make `torch.jit.trace` work.
126
- self._set_cos_sin_cache(
127
- seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
128
- )
129
-
130
- def _set_cos_sin_cache(self, seq_len, device, dtype):
131
- self.max_seq_len_cached = seq_len
132
- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
133
-
134
- freqs = torch.outer(t, self.inv_freq)
135
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
136
- emb = torch.cat((freqs, freqs), dim=-1)
137
- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
138
- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
139
-
140
- def forward(self, x, seq_len=None):
141
- # x: [bs, num_attention_heads, seq_len, head_size]
142
- if seq_len > self.max_seq_len_cached:
143
- self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
144
-
145
- return (
146
- self.cos_cached[:seq_len].to(dtype=x.dtype),
147
- self.sin_cached[:seq_len].to(dtype=x.dtype),
148
- )
149
-
150
-
151
- # Copied from transformers.models.llama.modeling_llama.rotate_half
152
- def rotate_half(x):
153
- """Rotates half the hidden dims of the input."""
154
- x1 = x[..., : x.shape[-1] // 2]
155
- x2 = x[..., x.shape[-1] // 2 :]
156
- return torch.cat((-x2, x1), dim=-1)
157
-
158
-
159
- # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
160
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
161
- """Applies Rotary Position Embedding to the query and key tensors.
162
-
163
- Args:
164
- q (`torch.Tensor`): The query tensor.
165
- k (`torch.Tensor`): The key tensor.
166
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
167
- sin (`torch.Tensor`): The sine part of the rotary embedding.
168
- position_ids (`torch.Tensor`):
169
- The position indices of the tokens corresponding to the query and key tensors. For example, this can be
170
- used to pass offsetted position ids when working with a KV-cache.
171
- unsqueeze_dim (`int`, *optional*, defaults to 1):
172
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
173
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
174
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
175
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
176
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
177
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
178
- Returns:
179
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
180
- """
181
- cos = cos[position_ids].unsqueeze(unsqueeze_dim)
182
- sin = sin[position_ids].unsqueeze(unsqueeze_dim)
183
- q_embed = (q * cos) + (rotate_half(q) * sin)
184
- k_embed = (k * cos) + (rotate_half(k) * sin)
185
- return q_embed, k_embed
186
-
187
-
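A minimal, shape-level sketch of how the rotary helpers above fit together, assuming `Qwen2RotaryEmbedding` and `apply_rotary_pos_emb` from this file are in scope; the sizes are arbitrary.

```python
import torch

bsz, n_heads, n_kv_heads, seq, head_dim = 1, 4, 2, 6, 16
q = torch.randn(bsz, n_heads, seq, head_dim)
k = torch.randn(bsz, n_kv_heads, seq, head_dim)

rope = Qwen2RotaryEmbedding(head_dim, max_position_embeddings=128)
cos, sin = rope(q, seq_len=seq)                 # each: (seq, head_dim)
position_ids = torch.arange(seq).unsqueeze(0)   # (1, seq)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```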
188
- # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
189
- class Qwen2MLP(nn.Module):
190
- def __init__(self, config):
191
- super().__init__()
192
- self.config = config
193
- self.hidden_size = config.hidden_size
194
- self.intermediate_size = config.intermediate_size
195
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
196
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
197
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
198
- self.act_fn = ACT2FN[config.hidden_act]
199
-
200
- def forward(self, x):
201
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
202
-
203
-
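The MLP above is the usual SwiGLU-style block, `down_proj(act(gate_proj(x)) * up_proj(x))`. A small shape check, assuming `Qwen2MLP` is in scope; the config values are made up:

```python
import torch
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config

cfg = Qwen2Config(hidden_size=64, intermediate_size=128, hidden_act="silu")
mlp = Qwen2MLP(cfg)
x = torch.randn(2, 3, 64)
assert mlp(x).shape == (2, 3, 64)
```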
204
- # Copied from transformers.models.llama.modeling_llama.repeat_kv
205
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
206
- """
207
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
208
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
209
- """
210
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
211
- if n_rep == 1:
212
- return hidden_states
213
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
214
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
215
-
216
-
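`repeat_kv` simply expands grouped-query KV heads so they line up with the query heads, for example (shapes only):

```python
import torch

kv = torch.randn(1, 2, 5, 64)        # (batch, num_kv_heads, seq_len, head_dim)
out = repeat_kv(kv, n_rep=4)         # 2 KV heads shared by 8 query heads
assert out.shape == (1, 8, 5, 64)
```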
217
- class Qwen2Attention(nn.Module):
218
- """
219
- Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
220
- and "Generating Long Sequences with Sparse Transformers".
221
- """
222
-
223
- def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
224
- super().__init__()
225
- self.config = config
226
- self.layer_idx = layer_idx
227
- if layer_idx is None:
228
- logger.warning_once(
229
- f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
230
- "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
231
- "when creating this class."
232
- )
233
-
234
- self.hidden_size = config.hidden_size
235
- self.num_heads = config.num_attention_heads
236
- self.head_dim = self.hidden_size // self.num_heads
237
- self.num_key_value_heads = config.num_key_value_heads
238
- self.num_key_value_groups = self.num_heads // self.num_key_value_heads
239
- self.max_position_embeddings = config.max_position_embeddings
240
- self.rope_theta = config.rope_theta
241
- self.is_causal = True
242
- self.attention_dropout = config.attention_dropout
243
-
244
- if (self.head_dim * self.num_heads) != self.hidden_size:
245
- raise ValueError(
246
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
247
- f" and `num_heads`: {self.num_heads})."
248
- )
249
- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
250
- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
251
- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
252
- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
253
-
254
- self.rotary_emb = Qwen2RotaryEmbedding(
255
- self.head_dim,
256
- max_position_embeddings=self.max_position_embeddings,
257
- base=self.rope_theta,
258
- )
259
-
260
- def forward(
261
- self,
262
- hidden_states: torch.Tensor,
263
- attention_mask: Optional[torch.Tensor] = None,
264
- position_ids: Optional[torch.LongTensor] = None,
265
- past_key_value: Optional[Cache] = None,
266
- output_attentions: bool = False,
267
- use_cache: bool = False,
268
- **kwargs,
269
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
270
- if "padding_mask" in kwargs:
271
- warnings.warn(
272
- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
273
- )
274
- bsz, q_len, _ = hidden_states.size()
275
-
276
- query_states = self.q_proj(hidden_states)
277
- key_states = self.k_proj(hidden_states)
278
- value_states = self.v_proj(hidden_states)
279
-
280
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
281
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
282
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
283
-
284
- kv_seq_len = key_states.shape[-2]
285
- if past_key_value is not None:
286
- if self.layer_idx is None:
287
- raise ValueError(
288
- f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
289
- "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
290
- "with a layer index."
291
- )
292
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
293
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
294
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
295
-
296
- if past_key_value is not None:
297
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
298
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
299
-
300
- # repeat k/v heads if n_kv_heads < n_heads
301
- key_states = repeat_kv(key_states, self.num_key_value_groups)
302
- value_states = repeat_kv(value_states, self.num_key_value_groups)
303
-
304
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
305
-
306
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
307
- raise ValueError(
308
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
309
- f" {attn_weights.size()}"
310
- )
311
-
312
- if attention_mask is not None:
313
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
314
- raise ValueError(
315
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
316
- )
317
-
318
- attn_weights = attn_weights + attention_mask
319
-
320
- # upcast attention to fp32
321
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
322
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
323
- attn_output = torch.matmul(attn_weights, value_states)
324
-
325
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
326
- raise ValueError(
327
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
328
- f" {attn_output.size()}"
329
- )
330
-
331
- attn_output = attn_output.transpose(1, 2).contiguous()
332
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
333
-
334
- attn_output = self.o_proj(attn_output)
335
-
336
- if not output_attentions:
337
- attn_weights = None
338
-
339
- return attn_output, attn_weights, past_key_value
340
-
341
-
342
- class Qwen2FlashAttention2(Qwen2Attention):
343
- """
344
- Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
345
- as the weights of the module stay untouched. The only required change would be on the forward pass
346
- where it needs to correctly call the public API of flash attention and deal with padding tokens
347
- in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
348
- config.max_window_layers layers.
349
- """
350
-
351
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
352
- def __init__(self, *args, **kwargs):
353
- super().__init__(*args, **kwargs)
354
-
355
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
356
- # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
357
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
358
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
359
-
360
- def forward(
361
- self,
362
- hidden_states: torch.Tensor,
363
- attention_mask: Optional[torch.Tensor] = None,
364
- position_ids: Optional[torch.LongTensor] = None,
365
- past_key_value: Optional[Cache] = None,
366
- output_attentions: bool = False,
367
- use_cache: bool = False,
368
- **kwargs,
369
- ):
370
- if "padding_mask" in kwargs:
371
- warnings.warn(
372
- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
373
- )
374
-
375
- # overwrite attention_mask with padding_mask
376
- attention_mask = kwargs.pop("padding_mask")
377
- bsz, q_len, _ = hidden_states.size()
378
-
379
- query_states = self.q_proj(hidden_states)
380
- key_states = self.k_proj(hidden_states)
381
- value_states = self.v_proj(hidden_states)
382
-
383
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
384
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
385
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
386
-
387
- kv_seq_len = key_states.shape[-2]
388
- if past_key_value is not None:
389
- if self.layer_idx is None:
390
- raise ValueError(
391
- f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
392
- "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
393
- "with a layer index."
394
- )
395
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
396
-
397
- # Because the input can be padded, the absolute sequence length depends on the max position id.
398
- rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
399
- cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
400
-
401
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
402
-
403
- use_sliding_windows = (
404
- _flash_supports_window_size
405
- and getattr(self.config, "sliding_window", None) is not None
406
- and kv_seq_len > self.config.sliding_window
407
- and self.config.use_sliding_window
408
- )
409
-
410
- if not _flash_supports_window_size:
411
- logger.warning_once(
412
- "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
413
- " make sure to upgrade flash-attn library."
414
- )
415
-
416
- if past_key_value is not None:
417
- # Activate cache slicing only if the config has a `sliding_window` attribute
418
- cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
419
- if (
420
- getattr(self.config, "sliding_window", None) is not None
421
- and kv_seq_len > self.config.sliding_window
422
- and cache_has_contents
423
- ):
424
- slicing_tokens = 1 - self.config.sliding_window
425
-
426
- past_key = past_key_value[self.layer_idx][0]
427
- past_value = past_key_value[self.layer_idx][1]
428
-
429
- past_key = past_key[:, :, slicing_tokens:, :].contiguous()
430
- past_value = past_value[:, :, slicing_tokens:, :].contiguous()
431
-
432
- if past_key.shape[-2] != self.config.sliding_window - 1:
433
- raise ValueError(
434
- f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
435
- f" {past_key.shape}"
436
- )
437
-
438
- if attention_mask is not None:
439
- attention_mask = attention_mask[:, slicing_tokens:]
440
- attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
441
-
442
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
443
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
444
-
445
- # repeat k/v heads if n_kv_heads < n_heads
446
- key_states = repeat_kv(key_states, self.num_key_value_groups)
447
- value_states = repeat_kv(value_states, self.num_key_value_groups)
448
- dropout_rate = 0.0 if not self.training else self.attention_dropout
449
-
450
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
451
- # therefore the input hidden states gets silently casted in float32. Hence, we need
452
- # cast them back in float16 just to be sure everything works as expected.
453
- input_dtype = query_states.dtype
454
- if input_dtype == torch.float32:
455
- if torch.is_autocast_enabled():
456
- target_dtype = torch.get_autocast_gpu_dtype()
457
- # Handle the case where the model is quantized
458
- elif hasattr(self.config, "_pre_quantization_dtype"):
459
- target_dtype = self.config._pre_quantization_dtype
460
- else:
461
- target_dtype = self.q_proj.weight.dtype
462
-
463
- logger.warning_once(
464
- f"The input hidden states seems to be silently casted in float32, this might be related to"
465
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
466
- f" {target_dtype}."
467
- )
468
-
469
- query_states = query_states.to(target_dtype)
470
- key_states = key_states.to(target_dtype)
471
- value_states = value_states.to(target_dtype)
472
-
473
- # Reshape to the expected shape for Flash Attention
474
- query_states = query_states.transpose(1, 2)
475
- key_states = key_states.transpose(1, 2)
476
- value_states = value_states.transpose(1, 2)
477
-
478
- attn_output = self._flash_attention_forward(
479
- query_states,
480
- key_states,
481
- value_states,
482
- attention_mask,
483
- q_len,
484
- dropout=dropout_rate,
485
- use_sliding_windows=use_sliding_windows,
486
- )
487
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
488
- attn_output = self.o_proj(attn_output)
489
-
490
- if not output_attentions:
491
- attn_weights = None
492
-
493
- return attn_output, attn_weights, past_key_value
494
-
495
- def _flash_attention_forward(
496
- self,
497
- query_states,
498
- key_states,
499
- value_states,
500
- attention_mask,
501
- query_length,
502
- dropout=0.0,
503
- softmax_scale=None,
504
- use_sliding_windows=False,
505
- ):
506
- """
507
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
508
- first unpad the input, then computes the attention scores and pad the final attention scores.
509
-
510
- Args:
511
- query_states (`torch.Tensor`):
512
- Input query states to be passed to Flash Attention API
513
- key_states (`torch.Tensor`):
514
- Input key states to be passed to Flash Attention API
515
- value_states (`torch.Tensor`):
516
- Input value states to be passed to Flash Attention API
517
- attention_mask (`torch.Tensor`):
518
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
519
- position of padding tokens and 1 for the position of non-padding tokens.
520
- dropout (`int`, *optional*):
521
- Attention dropout
522
- softmax_scale (`float`, *optional*):
523
- The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
524
- use_sliding_windows (`bool`, *optional*):
525
- Whether to activate sliding window attention.
526
- """
527
- if not self._flash_attn_uses_top_left_mask:
528
- causal = self.is_causal
529
- else:
530
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
531
- causal = self.is_causal and query_length != 1
532
-
533
- # Decide whether to use SWA or not by layer index.
534
- if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
535
- use_sliding_windows = False
536
-
537
- # Contains at least one padding token in the sequence
538
- if attention_mask is not None:
539
- batch_size = query_states.shape[0]
540
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
541
- query_states, key_states, value_states, attention_mask, query_length
542
- )
543
-
544
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
545
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
546
-
547
- if not use_sliding_windows:
548
- attn_output_unpad = flash_attn_varlen_func(
549
- query_states,
550
- key_states,
551
- value_states,
552
- cu_seqlens_q=cu_seqlens_q,
553
- cu_seqlens_k=cu_seqlens_k,
554
- max_seqlen_q=max_seqlen_in_batch_q,
555
- max_seqlen_k=max_seqlen_in_batch_k,
556
- dropout_p=dropout,
557
- softmax_scale=softmax_scale,
558
- causal=causal,
559
- )
560
- else:
561
- attn_output_unpad = flash_attn_varlen_func(
562
- query_states,
563
- key_states,
564
- value_states,
565
- cu_seqlens_q=cu_seqlens_q,
566
- cu_seqlens_k=cu_seqlens_k,
567
- max_seqlen_q=max_seqlen_in_batch_q,
568
- max_seqlen_k=max_seqlen_in_batch_k,
569
- dropout_p=dropout,
570
- softmax_scale=softmax_scale,
571
- causal=causal,
572
- window_size=(self.config.sliding_window, self.config.sliding_window),
573
- )
574
-
575
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
576
- else:
577
- if not use_sliding_windows:
578
- attn_output = flash_attn_func(
579
- query_states,
580
- key_states,
581
- value_states,
582
- dropout,
583
- softmax_scale=softmax_scale,
584
- causal=causal,
585
- )
586
- else:
587
- attn_output = flash_attn_func(
588
- query_states,
589
- key_states,
590
- value_states,
591
- dropout,
592
- softmax_scale=softmax_scale,
593
- causal=causal,
594
- window_size=(self.config.sliding_window, self.config.sliding_window),
595
- )
596
-
597
- return attn_output
598
-
599
- # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
600
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
601
- batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
602
-
603
- # On the first iteration we need to properly re-create the padding mask
604
- # by slicing it at the proper place
605
- if kv_seq_len != attention_mask.shape[-1]:
606
- attention_mask_num_tokens = attention_mask.shape[-1]
607
- attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
608
-
609
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
610
-
611
- key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
612
- value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
613
-
614
- if query_length == kv_seq_len:
615
- query_layer = index_first_axis(
616
- query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
617
- )
618
- cu_seqlens_q = cu_seqlens_k
619
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
620
- indices_q = indices_k
621
- elif query_length == 1:
622
- max_seqlen_in_batch_q = 1
623
- cu_seqlens_q = torch.arange(
624
- batch_size + 1, dtype=torch.int32, device=query_layer.device
625
- ) # There is a memcpy here, that is very bad.
626
- indices_q = cu_seqlens_q[:-1]
627
- query_layer = query_layer.squeeze(1)
628
- else:
629
- # The -q_len: slice assumes left padding.
630
- attention_mask = attention_mask[:, -query_length:]
631
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
632
-
633
- return (
634
- query_layer,
635
- key_layer,
636
- value_layer,
637
- indices_q,
638
- (cu_seqlens_q, cu_seqlens_k),
639
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
640
- )
641
- class Qwen2FlashAttention2_packing(Qwen2Attention):
642
- """
643
- Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
644
- as the weights of the module stay untouched. The only required change would be on the forward pass
645
- where it needs to correctly call the public API of flash attention and deal with padding tokens
646
- in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
647
- config.max_window_layers layers.
648
- """
649
-
650
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
651
- def __init__(self, *args, **kwargs):
652
- super().__init__(*args, **kwargs)
653
-
654
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
655
- # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
656
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
657
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
658
-
659
- def forward(
660
- self,
661
- hidden_states: torch.Tensor,
662
- attention_mask: Optional[torch.Tensor] = None,
663
- position_ids: Optional[torch.LongTensor] = None,
664
- past_key_value: Optional[Cache] = None,
665
- output_attentions: bool = False,
666
- use_cache: bool = False,
667
- sub_sample_lengths = None,
668
- **kwargs,
669
- ):
670
- if "padding_mask" in kwargs:
671
- warnings.warn(
672
- "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
673
- )
674
-
675
- # overwrite attention_mask with padding_mask
676
- attention_mask = kwargs.pop("padding_mask")
677
- bsz, q_len, _ = hidden_states.size()
678
-
679
- query_states = self.q_proj(hidden_states)
680
- key_states = self.k_proj(hidden_states)
681
- value_states = self.v_proj(hidden_states)
682
-
683
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
684
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
685
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
686
-
687
- kv_seq_len = key_states.shape[-2]
688
- if past_key_value is not None:
689
- if self.layer_idx is None:
690
- raise ValueError(
691
- f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
692
- "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
693
- "with a layer index."
694
- )
695
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
696
-
697
- # Because the input can be padded, the absolute sequence length depends on the max position id.
698
- rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
699
- cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
700
-
701
- if sub_sample_lengths is not None:
702
- packing_position_ids = []
703
- for b in range(bsz):
704
- each_sum_sample_lengths = sub_sample_lengths[b]
705
- packing_position_ids.append(torch.cat([torch.arange(each) for each in each_sum_sample_lengths]))
706
- packing_position_ids = torch.stack(packing_position_ids)
707
- packing_position_ids = packing_position_ids.to(query_states.device)
708
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, packing_position_ids)
709
- else:
710
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
711
-
712
- use_sliding_windows = (
713
- _flash_supports_window_size
714
- and getattr(self.config, "sliding_window", None) is not None
715
- and kv_seq_len > self.config.sliding_window
716
- and self.config.use_sliding_window
717
- )
718
-
719
- if not _flash_supports_window_size:
720
- logger.warning_once(
721
- "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
722
- " make sure to upgrade flash-attn library."
723
- )
724
-
725
- if past_key_value is not None:
726
- # Activate cache slicing only if the config has a `sliding_window` attribute
727
- cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
728
- if (
729
- getattr(self.config, "sliding_window", None) is not None
730
- and kv_seq_len > self.config.sliding_window
731
- and cache_has_contents
732
- ):
733
- slicing_tokens = 1 - self.config.sliding_window
734
-
735
- past_key = past_key_value[self.layer_idx][0]
736
- past_value = past_key_value[self.layer_idx][1]
737
-
738
- past_key = past_key[:, :, slicing_tokens:, :].contiguous()
739
- past_value = past_value[:, :, slicing_tokens:, :].contiguous()
740
-
741
- if past_key.shape[-2] != self.config.sliding_window - 1:
742
- raise ValueError(
743
- f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
744
- f" {past_key.shape}"
745
- )
746
-
747
- if attention_mask is not None:
748
- attention_mask = attention_mask[:, slicing_tokens:]
749
- attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
750
-
751
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
752
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
753
-
754
- # repeat k/v heads if n_kv_heads < n_heads
755
- key_states = repeat_kv(key_states, self.num_key_value_groups)
756
- value_states = repeat_kv(value_states, self.num_key_value_groups)
757
- dropout_rate = 0.0 if not self.training else self.attention_dropout
758
-
759
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
760
- # therefore the input hidden states gets silently casted in float32. Hence, we need
761
- # cast them back in float16 just to be sure everything works as expected.
762
- input_dtype = query_states.dtype
763
- if input_dtype == torch.float32:
764
- if torch.is_autocast_enabled():
765
- target_dtype = torch.get_autocast_gpu_dtype()
766
- # Handle the case where the model is quantized
767
- elif hasattr(self.config, "_pre_quantization_dtype"):
768
- target_dtype = self.config._pre_quantization_dtype
769
- else:
770
- target_dtype = self.q_proj.weight.dtype
771
-
772
- logger.warning_once(
773
- f"The input hidden states seems to be silently casted in float32, this might be related to"
774
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
775
- f" {target_dtype}."
776
- )
777
-
778
- query_states = query_states.to(target_dtype)
779
- key_states = key_states.to(target_dtype)
780
- value_states = value_states.to(target_dtype)
781
-
782
- # Reshape to the expected shape for Flash Attention
783
- query_states = query_states.transpose(1, 2)
784
- key_states = key_states.transpose(1, 2)
785
- value_states = value_states.transpose(1, 2)
786
-
787
- attn_output = self._flash_attention_forward(
788
- query_states,
789
- key_states,
790
- value_states,
791
- attention_mask,
792
- q_len,
793
- dropout=dropout_rate,
794
- use_sliding_windows=use_sliding_windows,
795
- sub_sample_lengths=sub_sample_lengths
796
- )
797
-
798
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
799
- attn_output = self.o_proj(attn_output)
800
-
801
- if not output_attentions:
802
- attn_weights = None
803
-
804
- return attn_output, attn_weights, past_key_value
805
-
806
- def _flash_attention_forward(
807
- self,
808
- query_states,
809
- key_states,
810
- value_states,
811
- attention_mask,
812
- query_length,
813
- dropout=0.0,
814
- softmax_scale=None,
815
- use_sliding_windows=False,
816
- sub_sample_lengths=None,
817
- ):
818
- """
819
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
820
- first unpad the input, then computes the attention scores and pad the final attention scores.
821
-
822
- Args:
823
- query_states (`torch.Tensor`):
824
- Input query states to be passed to Flash Attention API
825
- key_states (`torch.Tensor`):
826
- Input key states to be passed to Flash Attention API
827
- value_states (`torch.Tensor`):
828
- Input value states to be passed to Flash Attention API
829
- attention_mask (`torch.Tensor`):
830
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
831
- position of padding tokens and 1 for the position of non-padding tokens.
832
- dropout (`int`, *optional*):
833
- Attention dropout
834
- softmax_scale (`float`, *optional*):
835
- The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
836
- use_sliding_windows (`bool`, *optional*):
837
- Whether to activate sliding window attention.
838
- """
839
- if not self._flash_attn_uses_top_left_mask:
840
- causal = self.is_causal
841
- else:
842
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
843
- causal = self.is_causal and query_length != 1
844
-
845
- # Decide whether to use SWA or not by layer index.
846
- if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
847
- use_sliding_windows = False
848
-
849
- # Contains at least one padding token in the sequence
850
-
851
- if attention_mask is not None:
852
- batch_size = query_states.shape[0]
853
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input_packing(
854
- query_states, key_states, value_states, attention_mask, query_length, sub_sample_lengths
855
- )
856
-
857
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
858
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
859
-
860
- if not use_sliding_windows:
861
- attn_output_unpad = flash_attn_varlen_func(
862
- query_states,
863
- key_states,
864
- value_states,
865
- cu_seqlens_q=cu_seqlens_q,
866
- cu_seqlens_k=cu_seqlens_k,
867
- max_seqlen_q=max_seqlen_in_batch_q,
868
- max_seqlen_k=max_seqlen_in_batch_k,
869
- dropout_p=dropout,
870
- softmax_scale=softmax_scale,
871
- causal=causal,
872
- )
873
- else:
874
- attn_output_unpad = flash_attn_varlen_func(
875
- query_states,
876
- key_states,
877
- value_states,
878
- cu_seqlens_q=cu_seqlens_q,
879
- cu_seqlens_k=cu_seqlens_k,
880
- max_seqlen_q=max_seqlen_in_batch_q,
881
- max_seqlen_k=max_seqlen_in_batch_k,
882
- dropout_p=dropout,
883
- softmax_scale=softmax_scale,
884
- causal=causal,
885
- window_size=(self.config.sliding_window, self.config.sliding_window),
886
- )
887
-
888
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
889
- else:
890
- if not use_sliding_windows:
891
- attn_output = flash_attn_func(
892
- query_states,
893
- key_states,
894
- value_states,
895
- dropout,
896
- softmax_scale=softmax_scale,
897
- causal=causal,
898
- )
899
- else:
900
- attn_output = flash_attn_func(
901
- query_states,
902
- key_states,
903
- value_states,
904
- dropout,
905
- softmax_scale=softmax_scale,
906
- causal=causal,
907
- window_size=(self.config.sliding_window, self.config.sliding_window),
908
- )
909
-
910
- return attn_output
911
-
912
- # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
913
- def _unpad_input_packing(self, query_layer, key_layer, value_layer, attention_mask, query_length, sub_sample_lengths):
914
- batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
915
-
916
- # On the first iteration we need to properly re-create the padding mask
917
- # by slicing it at the proper place
918
- if kv_seq_len != attention_mask.shape[-1]:
919
- attention_mask_num_tokens = attention_mask.shape[-1]
920
- attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
921
-
922
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data_packing(attention_mask, sub_sample_lengths)
923
-
924
- key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
925
- value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
926
-
927
- if query_length == kv_seq_len:
928
- query_layer = index_first_axis(
929
- query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
930
- )
931
- cu_seqlens_q = cu_seqlens_k
932
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
933
- indices_q = indices_k
934
- elif query_length == 1:
935
- max_seqlen_in_batch_q = 1
936
- cu_seqlens_q = torch.arange(
937
- batch_size + 1, dtype=torch.int32, device=query_layer.device
938
- ) # There is a memcpy here, that is very bad.
939
- indices_q = cu_seqlens_q[:-1]
940
- query_layer = query_layer.squeeze(1)
941
- else:
942
- # The -q_len: slice assumes left padding.
943
- attention_mask = attention_mask[:, -query_length:]
944
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
945
-
946
- return (
947
- query_layer,
948
- key_layer,
949
- value_layer,
950
- indices_q,
951
- (cu_seqlens_q, cu_seqlens_k),
952
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
953
- )
954
-
955
-
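For clarity, the packed variant above restarts position ids at zero for every sub-sample before applying RoPE. A standalone illustration with made-up lengths, mirroring the `torch.cat`-of-`arange` construction in `Qwen2FlashAttention2_packing.forward`:

```python
import torch

sub_sample_lengths = [[3, 2, 2]]      # one batch row packing three sub-samples
packing_position_ids = torch.stack([
    torch.cat([torch.arange(n) for n in row]) for row in sub_sample_lengths
])
print(packing_position_ids)           # tensor([[0, 1, 2, 0, 1, 0, 1]])
```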
956
- # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2
957
- class Qwen2SdpaAttention(Qwen2Attention):
958
- """
959
- Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
960
- `Qwen2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
961
- SDPA API.
962
- """
963
-
964
- # Adapted from Qwen2Attention.forward
965
- def forward(
966
- self,
967
- hidden_states: torch.Tensor,
968
- attention_mask: Optional[torch.Tensor] = None,
969
- position_ids: Optional[torch.LongTensor] = None,
970
- past_key_value: Optional[Cache] = None,
971
- output_attentions: bool = False,
972
- use_cache: bool = False,
973
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
974
- if output_attentions:
975
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
976
- logger.warning_once(
977
- "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
978
- 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
979
- )
980
- return super().forward(
981
- hidden_states=hidden_states,
982
- attention_mask=attention_mask,
983
- position_ids=position_ids,
984
- past_key_value=past_key_value,
985
- output_attentions=output_attentions,
986
- use_cache=use_cache,
987
- )
988
-
989
- bsz, q_len, _ = hidden_states.size()
990
-
991
- query_states = self.q_proj(hidden_states)
992
- key_states = self.k_proj(hidden_states)
993
- value_states = self.v_proj(hidden_states)
994
-
995
- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
996
- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
997
- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
998
-
999
- kv_seq_len = key_states.shape[-2]
1000
- if past_key_value is not None:
1001
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
1002
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
1003
-
1004
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
1005
-
1006
- if past_key_value is not None:
1007
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
1008
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1009
-
1010
- key_states = repeat_kv(key_states, self.num_key_value_groups)
1011
- value_states = repeat_kv(value_states, self.num_key_value_groups)
1012
-
1013
- if attention_mask is not None:
1014
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
1015
- raise ValueError(
1016
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
1017
- )
1018
-
1019
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
1020
- # Reference: https://github.com/pytorch/pytorch/issues/112577.
1021
- if query_states.device.type == "cuda" and attention_mask is not None:
1022
- query_states = query_states.contiguous()
1023
- key_states = key_states.contiguous()
1024
- value_states = value_states.contiguous()
1025
-
1026
- attn_output = torch.nn.functional.scaled_dot_product_attention(
1027
- query_states,
1028
- key_states,
1029
- value_states,
1030
- attn_mask=attention_mask,
1031
- dropout_p=self.attention_dropout if self.training else 0.0,
1032
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
1033
- is_causal=self.is_causal and attention_mask is None and q_len > 1,
1034
- )
1035
-
1036
- attn_output = attn_output.transpose(1, 2).contiguous()
1037
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1038
-
1039
- attn_output = self.o_proj(attn_output)
1040
-
1041
- return attn_output, None, past_key_value
1042
-
1043
-
1044
- QWEN2_ATTENTION_CLASSES = {
1045
- "eager": Qwen2Attention,
1046
- "flash_attention_2": Qwen2FlashAttention2,
1047
- "sdpa": Qwen2SdpaAttention,
1048
- "flash_attention_2_packing": Qwen2FlashAttention2_packing,
1049
- }
1050
-
1051
-
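The mapping above is keyed by `config.attn_implementation`, which is how the packed flash-attention variant gets wired into `Qwen2DecoderLayer` below. A hypothetical selection sketch (the config values are made up, and the flash variants additionally require the `flash-attn` package at runtime):

```python
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config

config = Qwen2Config(hidden_size=64, num_attention_heads=4, num_key_value_heads=2)
config.attn_implementation = "flash_attention_2_packing"

attn_cls = QWEN2_ATTENTION_CLASSES[config.attn_implementation]
attn = attn_cls(config, layer_idx=0)   # -> Qwen2FlashAttention2_packing
```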
1052
- class Qwen2DecoderLayer(nn.Module):
1053
- def __init__(self, config: Qwen2Config, layer_idx: int):
1054
- super().__init__()
1055
- self.hidden_size = config.hidden_size
1056
-
1057
- if config.use_sliding_window and config.attn_implementation != "flash_attention_2":
1058
- logger.warning_once(
1059
- f"Sliding Window Attention is enabled but not implemented for `{config.attn_implementation}`; "
1060
- "unexpected results may be encountered."
1061
- )
1062
-
1063
- self.self_attn = QWEN2_ATTENTION_CLASSES[config.attn_implementation](config, layer_idx)
1064
-
1065
- self.mlp = Qwen2MLP(config)
1066
- self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1067
- self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1068
-
1069
- def forward(
1070
- self,
1071
- hidden_states: torch.Tensor,
1072
- attention_mask: Optional[torch.Tensor] = None,
1073
- position_ids: Optional[torch.LongTensor] = None,
1074
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
1075
- sub_sample_lengths=None,
1076
- output_attentions: Optional[bool] = False,
1077
- use_cache: Optional[bool] = False,
1078
- **kwargs,
1079
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1080
- if "padding_mask" in kwargs:
1081
- warnings.warn(
1082
- "Passing `padding_mask` is deprecated and will be removed in v4.37. "
1083
- "Please make sure use `attention_mask` instead.`"
1084
- )
1085
- """
1086
- Args:
1087
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1088
- attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1089
- `(batch, sequence_length)` where padding elements are indicated by 0.
1090
- output_attentions (`bool`, *optional*):
1091
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1092
- returned tensors for more detail.
1093
- use_cache (`bool`, *optional*):
1094
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1095
- (see `past_key_values`).
1096
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1097
- """
1098
-
1099
- residual = hidden_states
1100
-
1101
- hidden_states = self.input_layernorm(hidden_states)
1102
-
1103
- # Self Attention
1104
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
1105
- hidden_states=hidden_states,
1106
- attention_mask=attention_mask,
1107
- position_ids=position_ids,
1108
- past_key_value=past_key_value,
1109
- output_attentions=output_attentions,
1110
- use_cache=use_cache,
1111
- sub_sample_lengths=sub_sample_lengths,
1112
- )
1113
- hidden_states = residual + hidden_states
1114
-
1115
- # Fully Connected
1116
- residual = hidden_states
1117
- hidden_states = self.post_attention_layernorm(hidden_states)
1118
- hidden_states = self.mlp(hidden_states)
1119
- hidden_states = residual + hidden_states
1120
-
1121
- outputs = (hidden_states,)
1122
-
1123
- if output_attentions:
1124
- outputs += (self_attn_weights,)
1125
-
1126
- if use_cache:
1127
- outputs += (present_key_value,)
1128
-
1129
- return outputs
1130
-
1131
-
1132
- QWEN2_START_DOCSTRING = r"""
1133
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1134
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1135
- etc.)
1136
-
1137
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1138
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1139
- and behavior.
1140
-
1141
- Parameters:
1142
- config ([`Qwen2Config`]):
1143
- Model configuration class with all the parameters of the model. Initializing with a config file does not
1144
- load the weights associated with the model, only the configuration. Check out the
1145
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1146
- """
1147
-
1148
-
1149
- @add_start_docstrings(
1150
- "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
1151
- QWEN2_START_DOCSTRING,
1152
- )
1153
- class Qwen2PreTrainedModel(PreTrainedModel):
1154
- config_class = Qwen2Config
1155
- base_model_prefix = "model"
1156
- supports_gradient_checkpointing = True
1157
- _no_split_modules = ["Qwen2DecoderLayer"]
1158
- _skip_keys_device_placement = "past_key_values"
1159
- _supports_flash_attn_2 = True
1160
- _supports_sdpa = True
1161
- _supports_cache_class = True
1162
-
1163
- def _init_weights(self, module):
1164
- std = self.config.initializer_range
1165
- if isinstance(module, nn.Linear):
1166
- module.weight.data.normal_(mean=0.0, std=std)
1167
- if module.bias is not None:
1168
- module.bias.data.zero_()
1169
- elif isinstance(module, nn.Embedding):
1170
- module.weight.data.normal_(mean=0.0, std=std)
1171
- if module.padding_idx is not None:
1172
- module.weight.data[module.padding_idx].zero_()
1173
-
1174
-
1175
- QWEN2_INPUTS_DOCSTRING = r"""
1176
- Args:
1177
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1178
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1179
- it.
1180
-
1181
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1182
- [`PreTrainedTokenizer.__call__`] for details.
1183
-
1184
- [What are input IDs?](../glossary#input-ids)
1185
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1186
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1187
-
1188
- - 1 for tokens that are **not masked**,
1189
- - 0 for tokens that are **masked**.
1190
-
1191
- [What are attention masks?](../glossary#attention-mask)
1192
-
1193
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1194
- [`PreTrainedTokenizer.__call__`] for details.
1195
-
1196
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1197
- `past_key_values`).
1198
-
1199
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1200
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1201
- information on the default strategy.
1202
-
1203
- - 1 indicates the head is **not masked**,
1204
- - 0 indicates the head is **masked**.
1205
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1206
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1207
- config.n_positions - 1]`.
1208
-
1209
- [What are position IDs?](../glossary#position-ids)
1210
- past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1211
- Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1212
- blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1213
- returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1214
-
1215
- Two formats are allowed:
1216
- - a [`~cache_utils.Cache`] instance;
1217
- - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1218
- shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1219
- cache format.
1220
-
1221
- The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1222
- legacy cache format will be returned.
1223
-
1224
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1225
- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1226
- of shape `(batch_size, sequence_length)`.
1227
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1228
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1229
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1230
- model's internal embedding lookup matrix.
1231
- use_cache (`bool`, *optional*):
1232
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1233
- `past_key_values`).
1234
- output_attentions (`bool`, *optional*):
1235
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1236
- tensors for more detail.
1237
- output_hidden_states (`bool`, *optional*):
1238
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1239
- more detail.
1240
- return_dict (`bool`, *optional*):
1241
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1242
- """
1243
-
1244
-
1245
- @add_start_docstrings(
1246
- "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
1247
- QWEN2_START_DOCSTRING,
1248
- )
1249
- class Qwen2Model(Qwen2PreTrainedModel):
1250
- """
1251
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
1252
-
1253
- Args:
1254
- config: Qwen2Config
1255
- """
1256
-
1257
- def __init__(self, config: Qwen2Config):
1258
- super().__init__(config)
1259
- self.padding_idx = config.pad_token_id
1260
- self.vocab_size = config.vocab_size
1261
-
1262
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1263
- self.layers = nn.ModuleList(
1264
- [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1265
- )
1266
- self.attn_implementation = config.attn_implementation
1267
- self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1268
-
1269
- self.gradient_checkpointing = False
1270
- # Initialize weights and apply final processing
1271
- self.post_init()
1272
-
1273
- def get_input_embeddings(self):
1274
- return self.embed_tokens
1275
-
1276
- def set_input_embeddings(self, value):
1277
- self.embed_tokens = value
1278
-
1279
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1280
- def forward(
1281
- self,
1282
- input_ids: torch.LongTensor = None,
1283
- attention_mask: Optional[torch.Tensor] = None,
1284
- position_ids: Optional[torch.LongTensor] = None,
1285
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1286
- inputs_embeds: Optional[torch.FloatTensor] = None,
1287
- use_cache: Optional[bool] = None,
1288
- output_attentions: Optional[bool] = None,
1289
- output_hidden_states: Optional[bool] = None,
1290
- return_dict: Optional[bool] = None,
1291
- sub_sample_lengths=None,
1292
- ) -> Union[Tuple, BaseModelOutputWithPast]:
1293
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1294
- output_hidden_states = (
1295
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1296
- )
1297
- use_cache = use_cache if use_cache is not None else self.config.use_cache
1298
-
1299
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1300
-
1301
- # retrieve input_ids and inputs_embeds
1302
- if input_ids is not None and inputs_embeds is not None:
1303
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1304
- elif input_ids is not None:
1305
- batch_size, seq_length = input_ids.shape
1306
- elif inputs_embeds is not None:
1307
- batch_size, seq_length, _ = inputs_embeds.shape
1308
- else:
1309
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
1310
-
1311
- if self.gradient_checkpointing and self.training:
1312
- if use_cache:
1313
- logger.warning_once(
1314
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1315
- )
1316
- use_cache = False
1317
-
1318
- past_key_values_length = 0
1319
-
1320
- if use_cache:
1321
- use_legacy_cache = not isinstance(past_key_values, Cache)
1322
- if use_legacy_cache:
1323
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1324
- past_key_values_length = past_key_values.get_usable_length(seq_length)
1325
-
1326
- if position_ids is None:
1327
- device = input_ids.device if input_ids is not None else inputs_embeds.device
1328
- position_ids = torch.arange(
1329
- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1330
- )
1331
- position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1332
- else:
1333
- position_ids = position_ids.view(-1, seq_length).long()
1334
-
1335
- if inputs_embeds is None:
1336
- inputs_embeds = self.embed_tokens(input_ids)
1337
-
1338
- if attention_mask is not None and self.attn_implementation == "flash_attention_2" and use_cache:
1339
- is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1340
- if is_padding_right:
1341
- raise ValueError(
1342
- "You are attempting to perform batched generation with padding_side='right'"
1343
- " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
1344
- " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1345
- )
1346
-
1347
- if self.attn_implementation == "flash_attention_2" or self.config.attn_implementation =='flash_attention_2_packing':
1348
- # 2d mask is passed through the layers
1349
- if attention_mask is not None:
1350
- if attention_mask.dtype == torch.long:
1351
- pass
1352
- # attention_mask = attention_mask
1353
- else:
1354
- attention_mask = attention_mask if (0 in attention_mask) else None
1355
-
1356
- elif self.attn_implementation == "sdpa" and not output_attentions:
1357
- # output_attentions=True can not be supported when using SDPA, and we fall back on
1358
- # the manual implementation that requires a 4D causal mask in all cases.
1359
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1360
- attention_mask,
1361
- (batch_size, seq_length),
1362
- inputs_embeds,
1363
- past_key_values_length,
1364
- )
1365
- else:
1366
- # 4d mask is passed through the layers
1367
- attention_mask = _prepare_4d_causal_attention_mask(
1368
- attention_mask,
1369
- (batch_size, seq_length),
1370
- inputs_embeds,
1371
- past_key_values_length,
1372
- sliding_window=self.config.sliding_window,
1373
- )
1374
-
1375
- hidden_states = inputs_embeds
1376
-
1377
- # decoder layers
1378
- all_hidden_states = () if output_hidden_states else None
1379
- all_self_attns = () if output_attentions else None
1380
- next_decoder_cache = None
1381
-
1382
- for decoder_layer in self.layers:
1383
- if output_hidden_states:
1384
- all_hidden_states += (hidden_states,)
1385
- if self.gradient_checkpointing and self.training:
1386
- layer_outputs = self._gradient_checkpointing_func(
1387
- decoder_layer.__call__,
1388
- hidden_states,
1389
- attention_mask,
1390
- position_ids,
1391
- past_key_values,
1392
- sub_sample_lengths,
1393
- output_attentions,
1394
- use_cache,
1395
- )
1396
- else:
1397
- layer_outputs = decoder_layer(
1398
- hidden_states,
1399
- attention_mask=attention_mask,
1400
- position_ids=position_ids,
1401
- past_key_value=past_key_values,
1402
- sub_sample_lengths=sub_sample_lengths,
1403
- output_attentions=output_attentions,
1404
- use_cache=use_cache,
1405
- )
1406
-
1407
- hidden_states = layer_outputs[0]
1408
-
1409
- if use_cache:
1410
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1411
-
1412
- if output_attentions:
1413
- all_self_attns += (layer_outputs[1],)
1414
-
1415
- hidden_states = self.norm(hidden_states)
1416
-
1417
- # add hidden states from the last decoder layer
1418
- if output_hidden_states:
1419
- all_hidden_states += (hidden_states,)
1420
-
1421
- next_cache = None
1422
- if use_cache:
1423
- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1424
-
1425
- if not return_dict:
1426
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1427
- return BaseModelOutputWithPast(
1428
- last_hidden_state=hidden_states,
1429
- past_key_values=next_cache,
1430
- hidden_states=all_hidden_states,
1431
- attentions=all_self_attns,
1432
- )
1433
-
1434
-
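A tiny smoke test of the decoder stack above on random token ids, using the eager attention path. Every config value is made up and deliberately small, and it assumes this module's classes are importable alongside a transformers version that still ships the 4D-mask helpers used above.

```python
import torch
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config

cfg = Qwen2Config(
    vocab_size=128, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2,
    max_position_embeddings=64,
)
cfg.attn_implementation = "eager"

model = Qwen2Model(cfg)
out = model(input_ids=torch.randint(0, 128, (1, 10)))
print(out.last_hidden_state.shape)   # torch.Size([1, 10, 64])
```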
1435
- class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1436
- _tied_weights_keys = ["lm_head.weight"]
1437
-
1438
- def __init__(self, config):
1439
- super().__init__(config)
1440
- self.model = Qwen2Model(config)
1441
- self.vocab_size = config.vocab_size
1442
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1443
-
1444
- # Initialize weights and apply final processing
1445
- self.post_init()
1446
- self.support_packing = True
1447
-
1448
- def get_input_embeddings(self):
1449
- return self.model.embed_tokens
1450
-
1451
- def set_input_embeddings(self, value):
1452
- self.model.embed_tokens = value
1453
-
1454
- def get_output_embeddings(self):
1455
- return self.lm_head
1456
-
1457
- def set_output_embeddings(self, new_embeddings):
1458
- self.lm_head = new_embeddings
1459
-
1460
- def set_decoder(self, decoder):
1461
- self.model = decoder
1462
-
1463
- def get_decoder(self):
1464
- return self.model
1465
-
1466
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1467
- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1468
- def forward(
1469
- self,
1470
- input_ids: torch.LongTensor = None,
1471
- attention_mask: Optional[torch.Tensor] = None,
1472
- position_ids: Optional[torch.LongTensor] = None,
1473
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1474
- inputs_embeds: Optional[torch.FloatTensor] = None,
1475
- labels: Optional[torch.LongTensor] = None,
1476
- use_cache: Optional[bool] = None,
1477
- output_attentions: Optional[bool] = None,
1478
- output_hidden_states: Optional[bool] = None,
1479
- return_dict: Optional[bool] = None,
1480
- sub_sample_lengths=None,
1481
- ) -> Union[Tuple, CausalLMOutputWithPast]:
1482
- r"""
1483
- Args:
1484
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1485
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1486
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1487
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1488
-
1489
- Returns:
1490
-
1491
- Example:
1492
-
1493
- ```python
1494
- >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
1495
-
1496
- >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1497
- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1498
-
1499
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
1500
- >>> inputs = tokenizer(prompt, return_tensors="pt")
1501
-
1502
- >>> # Generate
1503
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1504
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1505
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1506
- ```"""
1507
-
1508
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1509
- output_hidden_states = (
1510
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1511
- )
1512
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1513
-
1514
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1515
- outputs = self.model(
1516
- input_ids=input_ids,
1517
- attention_mask=attention_mask,
1518
- position_ids=position_ids,
1519
- past_key_values=past_key_values,
1520
- inputs_embeds=inputs_embeds,
1521
- use_cache=use_cache,
1522
- output_attentions=output_attentions,
1523
- output_hidden_states=output_hidden_states,
1524
- return_dict=return_dict,
1525
- sub_sample_lengths=sub_sample_lengths
1526
- )
1527
-
1528
- hidden_states = outputs[0]
1529
- logits = self.lm_head(hidden_states)
1530
- logits = logits.float()
1531
-
1532
- loss = None
1533
- if labels is not None:
1534
- # Shift so that tokens < n predict n
1535
- shift_logits = logits[..., :-1, :].contiguous()
1536
- shift_labels = labels[..., 1:].contiguous()
1537
- # Flatten the tokens
1538
- loss_fct = CrossEntropyLoss()
1539
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1540
- shift_labels = shift_labels.view(-1)
1541
- # Enable model parallelism
1542
- shift_labels = shift_labels.to(shift_logits.device)
1543
- loss = loss_fct(shift_logits, shift_labels)
1544
-
1545
- if not return_dict:
1546
- output = (logits,) + outputs[1:]
1547
- return (loss,) + output if loss is not None else output
1548
-
1549
- return CausalLMOutputWithPast(
1550
- loss=loss,
1551
- logits=logits,
1552
- past_key_values=outputs.past_key_values,
1553
- hidden_states=outputs.hidden_states,
1554
- attentions=outputs.attentions,
1555
- )
1556
-
1557
- def prepare_inputs_for_generation(
1558
- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1559
- ):
1560
- # Omit tokens covered by past_key_values
1561
- if past_key_values is not None:
1562
- if isinstance(past_key_values, Cache):
1563
- cache_length = past_key_values.get_seq_length()
1564
- past_length = past_key_values.seen_tokens
1565
- max_cache_length = past_key_values.get_max_length()
1566
- else:
1567
- cache_length = past_length = past_key_values[0][0].shape[2]
1568
- max_cache_length = None
1569
-
1570
- # Keep only the unprocessed tokens:
1571
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1572
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1573
- # input)
1574
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1575
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1576
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1577
- # input_ids based on the past_length.
1578
- elif past_length < input_ids.shape[1]:
1579
- input_ids = input_ids[:, past_length:]
1580
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1581
-
1582
- # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1583
- if (
1584
- max_cache_length is not None
1585
- and attention_mask is not None
1586
- and cache_length + input_ids.shape[1] > max_cache_length
1587
- ):
1588
- attention_mask = attention_mask[:, -max_cache_length:]
1589
-
1590
- position_ids = kwargs.get("position_ids", None)
1591
- if attention_mask is not None and position_ids is None:
1592
- # create position_ids on the fly for batch generation
1593
- position_ids = attention_mask.long().cumsum(-1) - 1
1594
- position_ids.masked_fill_(attention_mask == 0, 1)
1595
- if past_key_values:
1596
- position_ids = position_ids[:, -input_ids.shape[1] :]
1597
-
1598
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1599
- if inputs_embeds is not None and past_key_values is None:
1600
- model_inputs = {"inputs_embeds": inputs_embeds}
1601
- else:
1602
- model_inputs = {"input_ids": input_ids}
1603
-
1604
- model_inputs.update(
1605
- {
1606
- "position_ids": position_ids,
1607
- "past_key_values": past_key_values,
1608
- "use_cache": kwargs.get("use_cache"),
1609
- "attention_mask": attention_mask,
1610
- }
1611
- )
1612
- return model_inputs
1613
-
1614
- @staticmethod
1615
- def _reorder_cache(past_key_values, beam_idx):
1616
- reordered_past = ()
1617
- for layer_past in past_key_values:
1618
- reordered_past += (
1619
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1620
- )
1621
- return reordered_past
1622
-
1623
-
1624
- @add_start_docstrings(
1625
- """
1626
- The Qwen2 Model transformer with a sequence classification head on top (linear layer).
1627
-
1628
- [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1629
- (e.g. GPT-2) do.
1630
-
1631
- Since it does classification on the last token, it requires to know the position of the last token. If a
1632
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1633
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1634
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1635
- each row of the batch).
1636
- """,
1637
- QWEN2_START_DOCSTRING,
1638
- )
1639
- class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
1640
- def __init__(self, config):
1641
- super().__init__(config)
1642
- self.num_labels = config.num_labels
1643
- self.model = Qwen2Model(config)
1644
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1645
-
1646
- # Initialize weights and apply final processing
1647
- self.post_init()
1648
-
1649
- def get_input_embeddings(self):
1650
- return self.model.embed_tokens
1651
-
1652
- def set_input_embeddings(self, value):
1653
- self.model.embed_tokens = value
1654
-
1655
- @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1656
- def forward(
1657
- self,
1658
- input_ids: torch.LongTensor = None,
1659
- attention_mask: Optional[torch.Tensor] = None,
1660
- position_ids: Optional[torch.LongTensor] = None,
1661
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1662
- inputs_embeds: Optional[torch.FloatTensor] = None,
1663
- labels: Optional[torch.LongTensor] = None,
1664
- use_cache: Optional[bool] = None,
1665
- output_attentions: Optional[bool] = None,
1666
- output_hidden_states: Optional[bool] = None,
1667
- return_dict: Optional[bool] = None,
1668
- ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1669
- r"""
1670
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1671
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1672
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1673
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1674
- """
1675
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1676
-
1677
- transformer_outputs = self.model(
1678
- input_ids,
1679
- attention_mask=attention_mask,
1680
- position_ids=position_ids,
1681
- past_key_values=past_key_values,
1682
- inputs_embeds=inputs_embeds,
1683
- use_cache=use_cache,
1684
- output_attentions=output_attentions,
1685
- output_hidden_states=output_hidden_states,
1686
- return_dict=return_dict,
1687
- )
1688
- hidden_states = transformer_outputs[0]
1689
- logits = self.score(hidden_states)
1690
-
1691
- if input_ids is not None:
1692
- batch_size = input_ids.shape[0]
1693
- else:
1694
- batch_size = inputs_embeds.shape[0]
1695
-
1696
- if self.config.pad_token_id is None and batch_size != 1:
1697
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1698
- if self.config.pad_token_id is None:
1699
- sequence_lengths = -1
1700
- else:
1701
- if input_ids is not None:
1702
- # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1703
- sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1704
- sequence_lengths = sequence_lengths % input_ids.shape[-1]
1705
- sequence_lengths = sequence_lengths.to(logits.device)
1706
- else:
1707
- sequence_lengths = -1
1708
-
1709
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1710
-
1711
- loss = None
1712
- if labels is not None:
1713
- labels = labels.to(logits.device)
1714
- if self.config.problem_type is None:
1715
- if self.num_labels == 1:
1716
- self.config.problem_type = "regression"
1717
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1718
- self.config.problem_type = "single_label_classification"
1719
- else:
1720
- self.config.problem_type = "multi_label_classification"
1721
-
1722
- if self.config.problem_type == "regression":
1723
- loss_fct = MSELoss()
1724
- if self.num_labels == 1:
1725
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1726
- else:
1727
- loss = loss_fct(pooled_logits, labels)
1728
- elif self.config.problem_type == "single_label_classification":
1729
- loss_fct = CrossEntropyLoss()
1730
- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1731
- elif self.config.problem_type == "multi_label_classification":
1732
- loss_fct = BCEWithLogitsLoss()
1733
- loss = loss_fct(pooled_logits, labels)
1734
- if not return_dict:
1735
- output = (pooled_logits,) + transformer_outputs[1:]
1736
- return ((loss,) + output) if loss is not None else output
1737
-
1738
- return SequenceClassifierOutputWithPast(
1739
- loss=loss,
1740
- logits=pooled_logits,
1741
- past_key_values=transformer_outputs.past_key_values,
1742
- hidden_states=transformer_outputs.hidden_states,
1743
- attentions=transformer_outputs.attentions,
1744
- )
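
For reference, the next-token loss in the deleted `Qwen2ForCausalLM.forward` above boils down to shifting logits and labels by one position and applying cross-entropy with `-100` positions ignored. A minimal standalone sketch of that step; the batch, sequence, and vocabulary sizes below are invented for illustration:

```python
import torch
from torch.nn import CrossEntropyLoss

# Toy sizes, chosen only for the example.
batch_size, seq_len, vocab_size = 2, 6, 32

logits = torch.randn(batch_size, seq_len, vocab_size)         # stand-in for the lm_head output
labels = torch.randint(0, vocab_size, (batch_size, seq_len))  # target token ids
labels[:, :2] = -100                                          # ignored positions (e.g. prompt tokens)

# Shift so that tokens < n predict token n, as in the deleted forward pass.
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss_fct = CrossEntropyLoss()  # ignore_index defaults to -100
loss = loss_fct(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss)
```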
 
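Likewise, the pooling rule described in the deleted `Qwen2ForSequenceClassification` docstring above (classify from the last non-padding token of each row) can be checked in isolation. A small sketch under assumed values; the pad token id, tensor shapes, and 3-label head are made up for the example:

```python
import torch

pad_token_id = 0  # assumption for illustration only
input_ids = torch.tensor([
    [5, 7, 9, 0, 0],  # three real tokens, then padding
    [4, 4, 4, 4, 2],  # no padding
])
hidden = torch.randn(2, 5, 8)              # (batch, seq, hidden) from the backbone
score = torch.nn.Linear(8, 3, bias=False)  # toy classification head with 3 labels
logits = score(hidden)                     # (batch, seq, num_labels)

# Last non-padding position per row, computed the ONNX-friendly way used in the deleted code.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```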
modeling_siglip.py DELETED
@@ -1,1241 +0,0 @@
1
- # --------------------------------------------------------
2
- # Eagle2
3
- # Copyright (c) 2025 NVIDIA
4
- # Licensed under The MIT License [see LICENSE for details]
5
- # Support flash-attention in SigLIP
6
- # --------------------------------------------------------
7
-
8
-
9
- # coding=utf-8
10
- # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
11
- #
12
- # Licensed under the Apache License, Version 2.0 (the "License");
13
- # you may not use this file except in compliance with the License.
14
- # You may obtain a copy of the License at
15
- #
16
- # http://www.apache.org/licenses/LICENSE-2.0
17
- #
18
- # Unless required by applicable law or agreed to in writing, software
19
- # distributed under the License is distributed on an "AS IS" BASIS,
20
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
- # See the License for the specific language governing permissions and
22
- # limitations under the License.
23
- """ PyTorch Siglip model."""
24
-
25
-
26
- import math
27
- import warnings
28
- from dataclasses import dataclass
29
- from typing import Any, Optional, Tuple, Union
30
- from einops import rearrange
31
- import numpy as np
32
- import torch
33
- import torch.utils.checkpoint
34
- from torch import nn
35
- from torch.nn.init import _calculate_fan_in_and_fan_out
36
-
37
- from transformers.activations import ACT2FN
38
- from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
39
- from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
40
- from transformers.modeling_utils import PreTrainedModel
41
- from transformers.utils import (
42
- ModelOutput,
43
- add_start_docstrings,
44
- add_start_docstrings_to_model_forward,
45
- logging,
46
- replace_return_docstrings,
47
- )
48
- from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
49
-
50
- try:
51
- from .flash_attention import FlashAttention
52
- has_flash_attn = True
53
- except:
54
- print('FlashAttention is not installed.')
55
- has_flash_attn = False
56
-
57
- logger = logging.get_logger(__name__)
58
-
59
- _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
60
-
61
- SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
62
- "google/siglip-base-patch16-224",
63
- # See all SigLIP models at https://huggingface.co/models?filter=siglip
64
- ]
65
-
66
-
67
- def _trunc_normal_(tensor, mean, std, a, b):
68
- # Cut & paste from PyTorch official master until it's in a few official releases - RW
69
- # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
70
- def norm_cdf(x):
71
- # Computes standard normal cumulative distribution function
72
- return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
73
-
74
- if (mean < a - 2 * std) or (mean > b + 2 * std):
75
- warnings.warn(
76
- "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
77
- "The distribution of values may be incorrect.",
78
- stacklevel=2,
79
- )
80
-
81
- # Values are generated by using a truncated uniform distribution and
82
- # then using the inverse CDF for the normal distribution.
83
- # Get upper and lower cdf values
84
- l = norm_cdf((a - mean) / std)
85
- u = norm_cdf((b - mean) / std)
86
-
87
- # Uniformly fill tensor with values from [l, u], then translate to
88
- # [2l-1, 2u-1].
89
- tensor.uniform_(2 * l - 1, 2 * u - 1)
90
-
91
- # Use inverse cdf transform for normal distribution to get truncated
92
- # standard normal
93
- tensor.erfinv_()
94
-
95
- # Transform to proper mean, std
96
- tensor.mul_(std * math.sqrt(2.0))
97
- tensor.add_(mean)
98
-
99
- # Clamp to ensure it's in the proper range
100
- tensor.clamp_(min=a, max=b)
101
-
102
-
103
- def trunc_normal_tf_(
104
- tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
105
- ) -> torch.Tensor:
106
- """Fills the input Tensor with values drawn from a truncated
107
- normal distribution. The values are effectively drawn from the
108
- normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
109
- with values outside :math:`[a, b]` redrawn until they are within
110
- the bounds. The method used for generating the random values works
111
- best when :math:`a \\leq \text{mean} \\leq b`.
112
-
113
- NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
114
- bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
115
- and the result is subsquently scaled and shifted by the mean and std args.
116
-
117
- Args:
118
- tensor: an n-dimensional `torch.Tensor`
119
- mean: the mean of the normal distribution
120
- std: the standard deviation of the normal distribution
121
- a: the minimum cutoff value
122
- b: the maximum cutoff value
123
- """
124
- with torch.no_grad():
125
- _trunc_normal_(tensor, 0, 1.0, a, b)
126
- tensor.mul_(std).add_(mean)
127
-
128
-
129
- def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
130
- fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
131
- if mode == "fan_in":
132
- denom = fan_in
133
- elif mode == "fan_out":
134
- denom = fan_out
135
- elif mode == "fan_avg":
136
- denom = (fan_in + fan_out) / 2
137
-
138
- variance = scale / denom
139
-
140
- if distribution == "truncated_normal":
141
- # constant is stddev of standard normal truncated to (-2, 2)
142
- trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
143
- elif distribution == "normal":
144
- with torch.no_grad():
145
- tensor.normal_(std=math.sqrt(variance))
146
- elif distribution == "uniform":
147
- bound = math.sqrt(3 * variance)
148
- with torch.no_grad():
149
- tensor.uniform_(-bound, bound)
150
- else:
151
- raise ValueError(f"invalid distribution {distribution}")
152
-
153
-
154
- def lecun_normal_(tensor):
155
- variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
156
-
157
-
158
- def default_flax_embed_init(tensor):
159
- variance_scaling_(tensor, mode="fan_in", distribution="normal")
160
-
161
-
162
- @dataclass
163
- # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
164
- class SiglipVisionModelOutput(ModelOutput):
165
- """
166
- Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
167
-
168
- Args:
169
- image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
170
- The image embeddings obtained by applying the projection layer to the pooler_output.
171
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
172
- Sequence of hidden-states at the output of the last layer of the model.
173
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
174
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
175
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
176
-
177
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
178
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
179
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
180
- sequence_length)`.
181
-
182
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
183
- heads.
184
- """
185
-
186
- image_embeds: Optional[torch.FloatTensor] = None
187
- last_hidden_state: torch.FloatTensor = None
188
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
189
- attentions: Optional[Tuple[torch.FloatTensor]] = None
190
-
191
-
192
- @dataclass
193
- # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip
194
- class SiglipTextModelOutput(ModelOutput):
195
- """
196
- Base class for text model's outputs that also contains a pooling of the last hidden states.
197
-
198
- Args:
199
- text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
200
- The text embeddings obtained by applying the projection layer to the pooler_output.
201
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
202
- Sequence of hidden-states at the output of the last layer of the model.
203
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
204
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
205
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
206
-
207
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
208
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
209
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
210
- sequence_length)`.
211
-
212
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
213
- heads.
214
- """
215
-
216
- text_embeds: Optional[torch.FloatTensor] = None
217
- last_hidden_state: torch.FloatTensor = None
218
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
219
- attentions: Optional[Tuple[torch.FloatTensor]] = None
220
-
221
-
222
- @dataclass
223
- # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip
224
- class SiglipOutput(ModelOutput):
225
- """
226
- Args:
227
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
228
- Contrastive loss for image-text similarity.
229
- logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
230
- The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
231
- similarity scores.
232
- logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
233
- The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
234
- similarity scores.
235
- text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
236
- The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
237
- image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
238
- The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
239
- text_model_output(`BaseModelOutputWithPooling`):
240
- The output of the [`SiglipTextModel`].
241
- vision_model_output(`BaseModelOutputWithPooling`):
242
- The output of the [`SiglipVisionModel`].
243
- """
244
-
245
- loss: Optional[torch.FloatTensor] = None
246
- logits_per_image: torch.FloatTensor = None
247
- logits_per_text: torch.FloatTensor = None
248
- text_embeds: torch.FloatTensor = None
249
- image_embeds: torch.FloatTensor = None
250
- text_model_output: BaseModelOutputWithPooling = None
251
- vision_model_output: BaseModelOutputWithPooling = None
252
-
253
- def to_tuple(self) -> Tuple[Any]:
254
- return tuple(
255
- self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
256
- for k in self.keys()
257
- )
258
-
259
-
260
- class SiglipVisionEmbeddings(nn.Module):
261
- def __init__(self, config: SiglipVisionConfig):
262
- super().__init__()
263
- self.config = config
264
- self.embed_dim = config.hidden_size
265
- self.image_size = config.image_size
266
- self.patch_size = config.patch_size
267
-
268
- self.patch_embedding = nn.Conv2d(
269
- in_channels=config.num_channels,
270
- out_channels=self.embed_dim,
271
- kernel_size=self.patch_size,
272
- stride=self.patch_size,
273
- padding="valid",
274
- )
275
-
276
- self.num_patches = (self.image_size // self.patch_size) ** 2
277
- self.num_positions = self.num_patches
278
- self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
279
- self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
280
-
281
- def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
282
- patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
283
- embeddings = patch_embeds.flatten(2).transpose(1, 2)
284
-
285
- embeddings = embeddings + self.position_embedding(self.position_ids)
286
- return embeddings
287
-
288
-
289
- # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
290
- class SiglipTextEmbeddings(nn.Module):
291
- def __init__(self, config: SiglipTextConfig):
292
- super().__init__()
293
- embed_dim = config.hidden_size
294
-
295
- self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
296
- self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
297
-
298
- # position_ids (1, len position emb) is contiguous in memory and exported when serialized
299
- self.register_buffer(
300
- "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
301
- )
302
-
303
- def forward(
304
- self,
305
- input_ids: Optional[torch.LongTensor] = None,
306
- position_ids: Optional[torch.LongTensor] = None,
307
- inputs_embeds: Optional[torch.FloatTensor] = None,
308
- ) -> torch.Tensor:
309
- seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
310
-
311
- if position_ids is None:
312
- position_ids = self.position_ids[:, :seq_length]
313
-
314
- if inputs_embeds is None:
315
- inputs_embeds = self.token_embedding(input_ids)
316
-
317
- position_embeddings = self.position_embedding(position_ids)
318
- embeddings = inputs_embeds + position_embeddings
319
-
320
- return embeddings
321
-
322
-
323
- class SiglipAttention(nn.Module):
324
- """Multi-headed attention from 'Attention Is All You Need' paper"""
325
-
326
- # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
327
- def __init__(self, config):
328
- super().__init__()
329
- self.config = config
330
- self.embed_dim = config.hidden_size
331
- self.num_heads = config.num_attention_heads
332
- self.head_dim = self.embed_dim // self.num_heads
333
- if self.head_dim * self.num_heads != self.embed_dim:
334
- raise ValueError(
335
- f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
336
- f" {self.num_heads})."
337
- )
338
- self.scale = self.head_dim**-0.5
339
- self.dropout = config.attention_dropout
340
-
341
- self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
342
- self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
343
- self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
344
- # self.use_flash_attn = config.use_flash_attn and has_flash_attn
345
- self.use_flash_attn = True if has_flash_attn else False
346
- if self.use_flash_attn:
347
- self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
348
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
349
-
350
- def _flash_attn(self,
351
- hidden_states: torch.Tensor,
352
- attention_mask: Optional[torch.Tensor] = None,
353
- output_attentions: Optional[bool] = False,
354
- key_padding_mask=None,
355
- need_weights=False
356
- ):
357
-
358
- batch_size, q_len, _ = hidden_states.size()
359
-
360
- query_states = self.q_proj(hidden_states)
361
- key_states = self.k_proj(hidden_states)
362
- value_states = self.v_proj(hidden_states)
363
-
364
- query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
365
- key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
366
- value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)
367
-
368
- qkv = torch.stack([query_states, key_states, value_states], dim=2)
369
- context, attn_weights = self.inner_attn(
370
- qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
371
- )
372
- attn_output = self.out_proj(rearrange(context, 'b s h d -> b s (h d)'))
373
-
374
- return attn_output, attn_weights
375
-
376
- def forward(
377
- self,
378
- hidden_states: torch.Tensor,
379
- attention_mask: Optional[torch.Tensor] = None,
380
- output_attentions: Optional[bool] = False,
381
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
382
- """Input shape: Batch x Time x Channel"""
383
- if self.use_flash_attn:
384
- return self._flash_attn(hidden_states)
385
- else:
386
- return self._vanilla_attn(hidden_states, attention_mask, output_attentions)
387
-
388
- def _vanilla_attn(self, hidden_states, attention_mask=None, output_attentions=False):
389
- batch_size, q_len, _ = hidden_states.size()
390
-
391
- query_states = self.q_proj(hidden_states)
392
- key_states = self.k_proj(hidden_states)
393
- value_states = self.v_proj(hidden_states)
394
-
395
- query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
396
- key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
397
- value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
398
-
399
- k_v_seq_len = key_states.shape[-2]
400
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
401
-
402
- if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
403
- raise ValueError(
404
- f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
405
- f" {attn_weights.size()}"
406
- )
407
-
408
- if attention_mask is not None:
409
- if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
410
- raise ValueError(
411
- f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
412
- )
413
- attn_weights = attn_weights + attention_mask
414
-
415
- # upcast attention to fp32
416
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
417
- attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
418
- attn_output = torch.matmul(attn_weights, value_states)
419
-
420
- if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
421
- raise ValueError(
422
- f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
423
- f" {attn_output.size()}"
424
- )
425
-
426
- attn_output = attn_output.transpose(1, 2).contiguous()
427
- attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
428
-
429
- attn_output = self.out_proj(attn_output)
430
-
431
- return attn_output, attn_weights
432
-
433
-
434
- # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
435
- class SiglipMLP(nn.Module):
436
- def __init__(self, config):
437
- super().__init__()
438
- self.config = config
439
- self.activation_fn = ACT2FN[config.hidden_act]
440
- self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
441
- self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
442
-
443
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
444
- hidden_states = self.fc1(hidden_states)
445
- hidden_states = self.activation_fn(hidden_states)
446
- hidden_states = self.fc2(hidden_states)
447
- return hidden_states
448
-
449
-
450
- # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
451
- class SiglipEncoderLayer(nn.Module):
452
- def __init__(self, config: SiglipConfig):
453
- super().__init__()
454
- self.embed_dim = config.hidden_size
455
- self.self_attn = SiglipAttention(config)
456
- self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
457
- self.mlp = SiglipMLP(config)
458
- self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
459
-
460
- # Ignore copy
461
- def forward(
462
- self,
463
- hidden_states: torch.Tensor,
464
- attention_mask: torch.Tensor,
465
- output_attentions: Optional[bool] = False,
466
- ) -> Tuple[torch.FloatTensor]:
467
- """
468
- Args:
469
- hidden_states (`torch.FloatTensor`):
470
- Input to the layer of shape `(batch, seq_len, embed_dim)`.
471
- attention_mask (`torch.FloatTensor`):
472
- Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
473
- output_attentions (`bool`, *optional*, defaults to `False`):
474
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
475
- returned tensors for more detail.
476
- """
477
- residual = hidden_states
478
-
479
- hidden_states = self.layer_norm1(hidden_states)
480
- hidden_states, attn_weights = self.self_attn(
481
- hidden_states=hidden_states,
482
- attention_mask=attention_mask,
483
- output_attentions=output_attentions,
484
- )
485
- hidden_states = residual + hidden_states
486
-
487
- residual = hidden_states
488
- hidden_states = self.layer_norm2(hidden_states)
489
- hidden_states = self.mlp(hidden_states)
490
- hidden_states = residual + hidden_states
491
-
492
- outputs = (hidden_states,)
493
-
494
- if output_attentions:
495
- outputs += (attn_weights,)
496
-
497
- return outputs
498
-
499
-
500
- class SiglipPreTrainedModel(PreTrainedModel):
501
- """
502
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
503
- models.
504
- """
505
-
506
- config_class = SiglipConfig
507
- base_model_prefix = "siglip"
508
- supports_gradient_checkpointing = True
509
-
510
- def _init_weights(self, module):
511
- """Initialize the weights"""
512
- if isinstance(module, SiglipVisionEmbeddings):
513
- width = (
514
- self.config.vision_config.hidden_size
515
- if isinstance(self.config, SiglipConfig)
516
- else self.config.hidden_size
517
- )
518
- nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
519
- elif isinstance(module, nn.Embedding):
520
- default_flax_embed_init(module.weight)
521
- elif isinstance(module, SiglipAttention):
522
- nn.init.xavier_uniform_(module.q_proj.weight)
523
- nn.init.xavier_uniform_(module.k_proj.weight)
524
- nn.init.xavier_uniform_(module.v_proj.weight)
525
- nn.init.xavier_uniform_(module.out_proj.weight)
526
- nn.init.zeros_(module.q_proj.bias)
527
- nn.init.zeros_(module.k_proj.bias)
528
- nn.init.zeros_(module.v_proj.bias)
529
- nn.init.zeros_(module.out_proj.bias)
530
- elif isinstance(module, SiglipMLP):
531
- nn.init.xavier_uniform_(module.fc1.weight)
532
- nn.init.xavier_uniform_(module.fc2.weight)
533
- nn.init.normal_(module.fc1.bias, std=1e-6)
534
- nn.init.normal_(module.fc2.bias, std=1e-6)
535
- elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
536
- nn.init.xavier_uniform_(module.probe.data)
537
- nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
538
- nn.init.zeros_(module.attention.in_proj_bias.data)
539
- elif isinstance(module, SiglipModel):
540
- logit_scale_init = torch.log(torch.tensor(1.0))
541
- module.logit_scale.data.fill_(logit_scale_init)
542
- module.logit_bias.data.zero_()
543
- elif isinstance(module, (nn.Linear, nn.Conv2d)):
544
- lecun_normal_(module.weight)
545
- if module.bias is not None:
546
- nn.init.zeros_(module.bias)
547
- elif isinstance(module, nn.LayerNorm):
548
- module.bias.data.zero_()
549
- module.weight.data.fill_(1.0)
550
-
551
-
552
- SIGLIP_START_DOCSTRING = r"""
553
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
554
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
555
- etc.)
556
-
557
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
558
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
559
- and behavior.
560
-
561
- Parameters:
562
- config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
563
- Initializing with a config file does not load the weights associated with the model, only the
564
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
565
- """
566
-
567
- SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
568
- Args:
569
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
570
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
571
- it.
572
-
573
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
574
- [`PreTrainedTokenizer.__call__`] for details.
575
-
576
- [What are input IDs?](../glossary#input-ids)
577
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
578
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
579
-
580
- - 1 for tokens that are **not masked**,
581
- - 0 for tokens that are **masked**.
582
-
583
- [What are attention masks?](../glossary#attention-mask)
584
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
585
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
586
- config.max_position_embeddings - 1]`.
587
-
588
- [What are position IDs?](../glossary#position-ids)
589
- output_attentions (`bool`, *optional*):
590
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
591
- tensors for more detail.
592
- output_hidden_states (`bool`, *optional*):
593
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
594
- more detail.
595
- return_dict (`bool`, *optional*):
596
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
597
- """
598
-
599
- SIGLIP_VISION_INPUTS_DOCSTRING = r"""
600
- Args:
601
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
602
- Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
603
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
604
- output_attentions (`bool`, *optional*):
605
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
606
- tensors for more detail.
607
- output_hidden_states (`bool`, *optional*):
608
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
609
- more detail.
610
- return_dict (`bool`, *optional*):
611
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
612
- """
613
-
614
- SIGLIP_INPUTS_DOCSTRING = r"""
615
- Args:
616
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
617
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
618
- it.
619
-
620
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
621
- [`PreTrainedTokenizer.__call__`] for details.
622
-
623
- [What are input IDs?](../glossary#input-ids)
624
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
625
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
626
-
627
- - 1 for tokens that are **not masked**,
628
- - 0 for tokens that are **masked**.
629
-
630
- [What are attention masks?](../glossary#attention-mask)
631
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
632
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
633
- config.max_position_embeddings - 1]`.
634
-
635
- [What are position IDs?](../glossary#position-ids)
636
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
637
- Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
638
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
639
- return_loss (`bool`, *optional*):
640
- Whether or not to return the contrastive loss.
641
- output_attentions (`bool`, *optional*):
642
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
643
- tensors for more detail.
644
- output_hidden_states (`bool`, *optional*):
645
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
646
- more detail.
647
- return_dict (`bool`, *optional*):
648
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
649
- """
650
-
651
-
652
- # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
653
- class SiglipEncoder(nn.Module):
654
- """
655
- Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
656
- [`SiglipEncoderLayer`].
657
-
658
- Args:
659
- config: SiglipConfig
660
- """
661
-
662
- def __init__(self, config: SiglipConfig):
663
- super().__init__()
664
- self.config = config
665
- self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
666
- self.gradient_checkpointing = False
667
-
668
- # Ignore copy
669
- def forward(
670
- self,
671
- inputs_embeds,
672
- attention_mask: Optional[torch.Tensor] = None,
673
- output_attentions: Optional[bool] = None,
674
- output_hidden_states: Optional[bool] = None,
675
- return_dict: Optional[bool] = None,
676
- ) -> Union[Tuple, BaseModelOutput]:
677
- r"""
678
- Args:
679
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
680
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
681
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
682
- than the model's internal embedding lookup matrix.
683
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
684
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
685
-
686
- - 1 for tokens that are **not masked**,
687
- - 0 for tokens that are **masked**.
688
-
689
- [What are attention masks?](../glossary#attention-mask)
690
- output_attentions (`bool`, *optional*):
691
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
692
- returned tensors for more detail.
693
- output_hidden_states (`bool`, *optional*):
694
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
695
- for more detail.
696
- return_dict (`bool`, *optional*):
697
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
698
- """
699
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
700
- output_hidden_states = (
701
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
702
- )
703
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
704
-
705
- encoder_states = () if output_hidden_states else None
706
- all_attentions = () if output_attentions else None
707
-
708
- hidden_states = inputs_embeds
709
- for encoder_layer in self.layers:
710
- if output_hidden_states:
711
- encoder_states = encoder_states + (hidden_states,)
712
- if self.gradient_checkpointing and self.training:
713
- layer_outputs = self._gradient_checkpointing_func(
714
- encoder_layer.__call__,
715
- hidden_states,
716
- attention_mask,
717
- output_attentions,
718
- )
719
- else:
720
- layer_outputs = encoder_layer(
721
- hidden_states,
722
- attention_mask,
723
- output_attentions=output_attentions,
724
- )
725
-
726
- hidden_states = layer_outputs[0]
727
-
728
- if output_attentions:
729
- all_attentions = all_attentions + (layer_outputs[1],)
730
-
731
- if output_hidden_states:
732
- encoder_states = encoder_states + (hidden_states,)
733
-
734
- if not return_dict:
735
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
736
- return BaseModelOutput(
737
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
738
- )
739
-
740
-
741
- class SiglipTextTransformer(nn.Module):
742
- def __init__(self, config: SiglipTextConfig):
743
- super().__init__()
744
- self.config = config
745
- embed_dim = config.hidden_size
746
- self.embeddings = SiglipTextEmbeddings(config)
747
- self.encoder = SiglipEncoder(config)
748
- self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
749
-
750
- self.head = nn.Linear(embed_dim, embed_dim)
751
-
752
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
753
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
754
- def forward(
755
- self,
756
- input_ids: Optional[torch.Tensor] = None,
757
- attention_mask: Optional[torch.Tensor] = None,
758
- position_ids: Optional[torch.Tensor] = None,
759
- output_attentions: Optional[bool] = None,
760
- output_hidden_states: Optional[bool] = None,
761
- return_dict: Optional[bool] = None,
762
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
763
- r"""
764
- Returns:
765
-
766
- """
767
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
768
- output_hidden_states = (
769
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
770
- )
771
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
772
-
773
- if input_ids is None:
774
- raise ValueError("You have to specify input_ids")
775
-
776
- input_shape = input_ids.size()
777
- input_ids = input_ids.view(-1, input_shape[-1])
778
-
779
- hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
780
-
781
- # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
782
- # expand attention_mask
783
- if attention_mask is not None:
784
- # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
785
- attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
786
-
787
- encoder_outputs = self.encoder(
788
- inputs_embeds=hidden_states,
789
- attention_mask=attention_mask,
790
- output_attentions=output_attentions,
791
- output_hidden_states=output_hidden_states,
792
- return_dict=return_dict,
793
- )
794
-
795
- last_hidden_state = encoder_outputs[0]
796
- last_hidden_state = self.final_layer_norm(last_hidden_state)
797
-
798
- # Assuming "sticky" EOS tokenization, last token is always EOS.
799
- pooled_output = last_hidden_state[:, -1, :]
800
- pooled_output = self.head(pooled_output)
801
-
802
- if not return_dict:
803
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
804
-
805
- return BaseModelOutputWithPooling(
806
- last_hidden_state=last_hidden_state,
807
- pooler_output=pooled_output,
808
- hidden_states=encoder_outputs.hidden_states,
809
- attentions=encoder_outputs.attentions,
810
- )
811
-
812
-
813
- @add_start_docstrings(
814
- """The text model from SigLIP without any head or projection on top.""",
815
- SIGLIP_START_DOCSTRING,
816
- )
817
- class SiglipTextModel(SiglipPreTrainedModel):
818
- config_class = SiglipTextConfig
819
-
820
- _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]
821
-
822
- def __init__(self, config: SiglipTextConfig):
823
- super().__init__(config)
824
- self.text_model = SiglipTextTransformer(config)
825
- # Initialize weights and apply final processing
826
- self.post_init()
827
-
828
- def get_input_embeddings(self) -> nn.Module:
829
- return self.text_model.embeddings.token_embedding
830
-
831
- def set_input_embeddings(self, value):
832
- self.text_model.embeddings.token_embedding = value
833
-
834
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
835
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
836
- def forward(
837
- self,
838
- input_ids: Optional[torch.Tensor] = None,
839
- attention_mask: Optional[torch.Tensor] = None,
840
- position_ids: Optional[torch.Tensor] = None,
841
- output_attentions: Optional[bool] = None,
842
- output_hidden_states: Optional[bool] = None,
843
- return_dict: Optional[bool] = None,
844
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
845
- r"""
846
- Returns:
847
-
848
- Examples:
849
-
850
- ```python
851
- >>> from transformers import AutoTokenizer, SiglipTextModel
852
-
853
- >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
854
- >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
855
-
856
- >>> # important: make sure to set padding="max_length" as that's how the model was trained
857
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
858
-
859
- >>> outputs = model(**inputs)
860
- >>> last_hidden_state = outputs.last_hidden_state
861
- >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
862
- ```"""
863
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
864
-
865
- return self.text_model(
866
- input_ids=input_ids,
867
- attention_mask=attention_mask,
868
- position_ids=position_ids,
869
- output_attentions=output_attentions,
870
- output_hidden_states=output_hidden_states,
871
- return_dict=return_dict,
872
- )
873
-
874
-
875
- class SiglipVisionTransformer(nn.Module):
876
- def __init__(self, config: SiglipVisionConfig):
877
- super().__init__()
878
- self.config = config
879
- embed_dim = config.hidden_size
880
-
881
- self.embeddings = SiglipVisionEmbeddings(config)
882
- self.encoder = SiglipEncoder(config)
883
- self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
884
- self.head = SiglipMultiheadAttentionPoolingHead(config)
885
-
886
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
887
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
888
- def forward(
889
- self,
890
- pixel_values,
891
- output_attentions: Optional[bool] = None,
892
- output_hidden_states: Optional[bool] = None,
893
- return_dict: Optional[bool] = None,
894
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
895
- r"""
896
- Returns:
897
-
898
- """
899
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
900
- output_hidden_states = (
901
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
902
- )
903
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
904
-
905
- hidden_states = self.embeddings(pixel_values)
906
-
907
- encoder_outputs = self.encoder(
908
- inputs_embeds=hidden_states,
909
- output_attentions=output_attentions,
910
- output_hidden_states=output_hidden_states,
911
- return_dict=return_dict,
912
- )
913
-
914
- last_hidden_state = encoder_outputs[0]
915
- last_hidden_state = self.post_layernorm(last_hidden_state)
916
-
917
- pooled_output = self.head(last_hidden_state)
918
-
919
- if not return_dict:
920
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
921
-
922
- return BaseModelOutputWithPooling(
923
- last_hidden_state=last_hidden_state,
924
- pooler_output=pooled_output,
925
- hidden_states=encoder_outputs.hidden_states,
926
- attentions=encoder_outputs.attentions,
927
- )
928
-
929
-
930
- class SiglipMultiheadAttentionPoolingHead(nn.Module):
931
- """Multihead Attention Pooling."""
932
-
933
- def __init__(self, config: SiglipVisionConfig):
934
- super().__init__()
935
-
936
- self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
937
- self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
938
- self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
939
- self.mlp = SiglipMLP(config)
940
-
941
- def forward(self, hidden_state):
942
- batch_size = hidden_state.shape[0]
943
- probe = self.probe.repeat(batch_size, 1, 1)
944
-
945
- hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
946
-
947
- residual = hidden_state
948
- hidden_state = self.layernorm(hidden_state)
949
- hidden_state = residual + self.mlp(hidden_state)
950
-
951
- return hidden_state[:, 0]
952
-
953
-
954
- @add_start_docstrings(
955
- """The vision model from SigLIP without any head or projection on top.""",
956
- SIGLIP_START_DOCSTRING,
957
- )
958
- class SiglipVisionModel(SiglipPreTrainedModel):
959
- config_class = SiglipVisionConfig
960
- main_input_name = "pixel_values"
961
- _no_split_modules = [
962
- "SiglipEncoderLayer",
963
- "SiglipVisionEmbeddings",
964
- "SiglipMultiheadAttentionPoolingHead",
965
- ]
966
-
967
- def __init__(self, config: SiglipVisionConfig):
968
- super().__init__(config)
969
-
970
- self.vision_model = SiglipVisionTransformer(config)
971
-
972
- # Initialize weights and apply final processing
973
- self.post_init()
974
-
975
- def get_input_embeddings(self) -> nn.Module:
976
- return self.vision_model.embeddings.patch_embedding
977
-
978
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
979
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
980
- def forward(
981
- self,
982
- pixel_values,
983
- output_attentions: Optional[bool] = None,
984
- output_hidden_states: Optional[bool] = None,
985
- return_dict: Optional[bool] = None,
986
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
987
- r"""
988
- Returns:
989
-
990
- Examples:
991
-
992
- ```python
993
- >>> from PIL import Image
994
- >>> import requests
995
- >>> from transformers import AutoProcessor, SiglipVisionModel
996
-
997
- >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
998
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
999
-
1000
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1001
- >>> image = Image.open(requests.get(url, stream=True).raw)
1002
-
1003
- >>> inputs = processor(images=image, return_tensors="pt")
1004
-
1005
- >>> outputs = model(**inputs)
1006
- >>> last_hidden_state = outputs.last_hidden_state
1007
- >>> pooled_output = outputs.pooler_output # pooled features
1008
- ```"""
1009
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1010
-
1011
- return self.vision_model(
1012
- pixel_values=pixel_values,
1013
- output_attentions=output_attentions,
1014
- output_hidden_states=output_hidden_states,
1015
- return_dict=return_dict,
1016
- )
1017
-
1018
-
1019
- @add_start_docstrings(SIGLIP_START_DOCSTRING)
1020
- class SiglipModel(SiglipPreTrainedModel):
1021
- config_class = SiglipConfig
1022
-
1023
- def __init__(self, config: SiglipConfig):
1024
- super().__init__(config)
1025
-
1026
- if not isinstance(config.text_config, SiglipTextConfig):
1027
- raise ValueError(
1028
- "config.text_config is expected to be of type SiglipTextConfig but is of type"
1029
- f" {type(config.text_config)}."
1030
- )
1031
-
1032
- if not isinstance(config.vision_config, SiglipVisionConfig):
1033
- raise ValueError(
1034
- "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
1035
- f" {type(config.vision_config)}."
1036
- )
1037
-
1038
- text_config = config.text_config
1039
- vision_config = config.vision_config
1040
-
1041
- self.text_model = SiglipTextTransformer(text_config)
1042
- self.vision_model = SiglipVisionTransformer(vision_config)
1043
-
1044
- self.logit_scale = nn.Parameter(torch.randn(1))
1045
- self.logit_bias = nn.Parameter(torch.randn(1))
1046
-
1047
- # Initialize weights and apply final processing
1048
- self.post_init()
1049
-
1050
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1051
- def get_text_features(
1052
- self,
1053
- input_ids: Optional[torch.Tensor] = None,
1054
- attention_mask: Optional[torch.Tensor] = None,
1055
- position_ids: Optional[torch.Tensor] = None,
1056
- output_attentions: Optional[bool] = None,
1057
- output_hidden_states: Optional[bool] = None,
1058
- return_dict: Optional[bool] = None,
1059
- ) -> torch.FloatTensor:
1060
- r"""
1061
- Returns:
1062
- text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1063
- applying the projection layer to the pooled output of [`SiglipTextModel`].
1064
-
1065
- Examples:
1066
-
1067
- ```python
1068
- >>> from transformers import AutoTokenizer, AutoModel
1069
- >>> import torch
1070
-
1071
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1072
- >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1073
-
1074
- >>> # important: make sure to set padding="max_length" as that's how the model was trained
1075
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1076
- >>> with torch.no_grad():
1077
- ...     text_features = model.get_text_features(**inputs)
1078
- ```"""
1079
- # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1080
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1081
- output_hidden_states = (
1082
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1083
- )
1084
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1085
-
1086
- text_outputs = self.text_model(
1087
- input_ids=input_ids,
1088
- attention_mask=attention_mask,
1089
- position_ids=position_ids,
1090
- output_attentions=output_attentions,
1091
- output_hidden_states=output_hidden_states,
1092
- return_dict=return_dict,
1093
- )
1094
-
1095
- pooled_output = text_outputs[1]
1096
-
1097
- return pooled_output
1098
-
1099
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1100
- def get_image_features(
1101
- self,
1102
- pixel_values: Optional[torch.FloatTensor] = None,
1103
- output_attentions: Optional[bool] = None,
1104
- output_hidden_states: Optional[bool] = None,
1105
- return_dict: Optional[bool] = None,
1106
- ) -> torch.FloatTensor:
1107
- r"""
1108
- Returns:
1109
- image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1110
- applying the projection layer to the pooled output of [`SiglipVisionModel`].
1111
-
1112
- Examples:
1113
-
1114
- ```python
1115
- >>> from PIL import Image
1116
- >>> import requests
1117
- >>> from transformers import AutoProcessor, AutoModel
1118
- >>> import torch
1119
-
1120
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1121
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1122
-
1123
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1124
- >>> image = Image.open(requests.get(url, stream=True).raw)
1125
-
1126
- >>> inputs = processor(images=image, return_tensors="pt")
1127
-
1128
- >>> with torch.no_grad():
1129
- ...     image_features = model.get_image_features(**inputs)
1130
- ```"""
1131
- # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
1132
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1133
- output_hidden_states = (
1134
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1135
- )
1136
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1137
-
1138
- vision_outputs = self.vision_model(
1139
- pixel_values=pixel_values,
1140
- output_attentions=output_attentions,
1141
- output_hidden_states=output_hidden_states,
1142
- return_dict=return_dict,
1143
- )
1144
-
1145
- pooled_output = vision_outputs[1]
1146
-
1147
- return pooled_output
1148
-
1149
- @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1150
- @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig)
1151
- def forward(
1152
- self,
1153
- input_ids: Optional[torch.LongTensor] = None,
1154
- pixel_values: Optional[torch.FloatTensor] = None,
1155
- attention_mask: Optional[torch.Tensor] = None,
1156
- position_ids: Optional[torch.LongTensor] = None,
1157
- return_loss: Optional[bool] = None,
1158
- output_attentions: Optional[bool] = None,
1159
- output_hidden_states: Optional[bool] = None,
1160
- return_dict: Optional[bool] = None,
1161
- ) -> Union[Tuple, SiglipOutput]:
1162
- r"""
1163
- Returns:
1164
-
1165
- Examples:
1166
-
1167
- ```python
1168
- >>> from PIL import Image
1169
- >>> import requests
1170
- >>> from transformers import AutoProcessor, AutoModel
1171
- >>> import torch
1172
-
1173
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1174
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1175
-
1176
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1177
- >>> image = Image.open(requests.get(url, stream=True).raw)
1178
-
1179
- >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
1180
- >>> # important: we pass `padding=max_length` since the model was trained with this
1181
- >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
1182
-
1183
- >>> with torch.no_grad():
1184
- ...     outputs = model(**inputs)
1185
-
1186
- >>> logits_per_image = outputs.logits_per_image
1187
- >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
1188
- >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
1189
- 31.9% that image 0 is 'a photo of 2 cats'
1190
- ```"""
1191
- # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1192
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1193
- output_hidden_states = (
1194
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1195
- )
1196
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1197
-
1198
- vision_outputs = self.vision_model(
1199
- pixel_values=pixel_values,
1200
- output_attentions=output_attentions,
1201
- output_hidden_states=output_hidden_states,
1202
- return_dict=return_dict,
1203
- )
1204
-
1205
- text_outputs = self.text_model(
1206
- input_ids=input_ids,
1207
- attention_mask=attention_mask,
1208
- position_ids=position_ids,
1209
- output_attentions=output_attentions,
1210
- output_hidden_states=output_hidden_states,
1211
- return_dict=return_dict,
1212
- )
1213
-
1214
- image_embeds = vision_outputs[1]
1215
- text_embeds = text_outputs[1]
1216
-
1217
- # normalized features
1218
- image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1219
- text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1220
-
1221
- # cosine similarity as logits
1222
- logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp() + self.logit_bias
1223
- logits_per_image = logits_per_text.t()
1224
-
1225
- loss = None
1226
- if return_loss:
1227
- raise NotImplementedError("SigLIP loss to be implemented")
1228
-
1229
- if not return_dict:
1230
- output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1231
- return ((loss,) + output) if loss is not None else output
1232
-
1233
- return SiglipOutput(
1234
- loss=loss,
1235
- logits_per_image=logits_per_image,
1236
- logits_per_text=logits_per_text,
1237
- text_embeds=text_embeds,
1238
- image_embeds=image_embeds,
1239
- text_model_output=text_outputs,
1240
- vision_model_output=vision_outputs,
1241
- )
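For reference, the deleted `forward` above raises `NotImplementedError` when `return_loss=True`. Below is a minimal sketch of the pairwise sigmoid loss described in the SigLIP paper, assuming `logits_per_text` of shape `(num_texts, num_images)` with matching pairs on the diagonal; the helper name is ours and not part of the deleted file.

```python
import torch
import torch.nn.functional as F

def siglip_pairwise_sigmoid_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
    # logits_per_text is assumed to already include logit_scale and logit_bias,
    # exactly as computed in the forward pass above.
    n = logits_per_text.size(0)
    # +1 for the matching (diagonal) text/image pair, -1 for every other pair.
    labels = 2.0 * torch.eye(n, device=logits_per_text.device, dtype=logits_per_text.dtype) - 1.0
    # Negative log-sigmoid of label * logit, summed over all pairs and averaged over texts.
    return -F.logsigmoid(labels * logits_per_text).sum() / n
```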
 
multi_backbone_channel_concatenation_encoder.py DELETED
@@ -1,266 +0,0 @@
1
- # --------------------------------------------------------
2
- # Eagle2
3
- # Copyright (c) 2025 NVIDIA
4
- # Licensed under The Apache License [see LICENSE for details]
5
- # --------------------------------------------------------
6
-
7
- import torch, os
8
- import torch.nn as nn
9
- from torch.utils.checkpoint import checkpoint
10
-
11
- from .siglip_vision_tower import SiglipVisionTower
12
-
13
- import torch.nn.functional as F
14
- from torch.nn.init import trunc_normal_
15
- from copy import deepcopy
16
- import random
17
- import math
18
-
19
- class MultiBackboneChannelConcatenationVisionTower(nn.Module):
20
- def __init__(self,
21
- vision_tower,
22
- args,
23
- grid_size=32,
24
- convnext_img_size=1024,
25
- normalize_type=None, raw_config=None):
26
-
27
- super().__init__()
28
-
29
- self.is_loaded = False
30
- self.grid_size = grid_size
31
- self.num_tokens = self.grid_size ** 2
32
- self.normalize_type = args.normalize_type
33
- self.moe_version_type = args.moe_version_type
34
- self.raw_config = raw_config
35
- print("moe_version_type: ", self.moe_version_type)
36
- assert self.moe_version_type in [None, 'all_tiling', 'seq_concat', 'feat_concat', 'convnext_512_siglip_448'], f"Unknown self.moe_version_type: {self.moe_version_type}"
37
-
38
- vision_tower_name_list = vision_tower.split(";")
39
- self.input_image_size = 1024
40
- self.convnext_img_size = convnext_img_size
41
- self.load_vision_towers(vision_tower_name_list, args)
42
-
43
-
44
- def load_vision_towers(self, vision_tower_name_list, args):
45
- self.vision_towers = nn.ModuleList()
46
-
47
- freeze_backbone_list = args.freeze_backbones # note this is a str
48
- if freeze_backbone_list is not None and len(freeze_backbone_list) > 0:
49
- print("The frozen backbones: ", freeze_backbone_list)
50
- else:
51
- # make it a blank str
52
- freeze_backbone_list = ""
53
-
54
- for name in vision_tower_name_list:
55
-
56
- ## ConvNeXt
57
- if name == 'convnext-1024':
58
- convnext_args = deepcopy(args)
59
-
60
- convnext_args.freeze_vision = False
61
- if 'convnext-1024' in freeze_backbone_list:
62
- convnext_args.freeze_vision = True
63
-
64
- from .convnext_encoder import ConvNextVisionTower
65
- convnext_args.input_image_size = self.convnext_img_size
66
- convnext_vision_tower = args.vision_tower_convnext_path
67
- convnext_vision_tower = ConvNextVisionTower(convnext_vision_tower,
68
- convnext_args, delay_load=args.delay_load, normalize_type=self.normalize_type)
69
- convnext_vision_tower.load_model()
70
- self.vision_towers.append(convnext_vision_tower)
71
-
72
- ## PaliSigLIP
73
- elif name == 'palisiglip':
74
- palisiglip_args = deepcopy(args)
75
- palisiglip_args.input_image_size = 448
76
-
77
- palisiglip_args.freeze_vision = False
78
- if 'palisiglip' in freeze_backbone_list:
79
- palisiglip_args.freeze_vision = True
80
-
81
- palisiglip_vision_tower = SiglipVisionTower(args.vision_tower_siglip_path, palisiglip_args, delay_load=args.delay_load, raw_config=self.raw_config)
82
-
83
- palisiglip_vision_tower.load_model()
84
- self.vision_towers.append(palisiglip_vision_tower)
85
-
86
- # Set the image processor
87
- self.image_processor = None
88
- self.is_loaded = True
89
-
90
- def load_model(self):
91
- assert self.is_loaded, "All the vision encoders should be loaded during initialization!"
92
-
93
- def forward(self, x):
94
- # x is a Tensor if moe_version_type is None or 'all_tiling'
95
- # else is a tuple(Tensor, Tensor)
96
- if self.moe_version_type in [None, 'all_tiling']:
97
- # The default pipeline
98
- features = []
99
- image_input_size = x.shape[2]
100
- assert x.shape[2] == x.shape[3], f"Image should be a square but size ({x.shape[2]} x {x.shape[3]})"
101
- for vision_tower in self.vision_towers:
102
-
103
- if vision_tower.input_image_size != image_input_size:
104
- resized_x = F.interpolate(x.float(),
105
- size=(vision_tower.input_image_size, vision_tower.input_image_size),
106
- mode='bilinear',
107
- align_corners=True).to(dtype=x.dtype)
108
- else:
109
- resized_x = x
110
-
111
- feature = vision_tower(resized_x)
112
-
113
- if len(feature.shape) == 3: # b, n, c
114
- b, n, c = feature.shape
115
- if n == self.num_tokens:
116
- features.append(feature)
117
- continue
118
- w = h = int(n**0.5)
119
- feature = feature.transpose(1,2).reshape(b, c, h, w)
120
- else:
121
- b, c, h, w = feature.shape
122
-
123
- if w != self.grid_size:
124
- feature = F.interpolate(feature.float(), size=(self.grid_size, self.grid_size), mode='bilinear', align_corners=True).to(dtype=x.dtype)
125
- features.append(feature.flatten(2,3).transpose(1,2))
126
-
127
- features = torch.cat(features, dim=-1)
128
- elif self.moe_version_type == 'convnext_512_siglip_448':
129
- features = {}
130
- image_input_size = x.shape[2]
131
- assert x.shape[2] == x.shape[3], f"Image should be a square but size ({x.shape[2]} x {x.shape[3]})"
132
- for vision_tower in self.vision_towers:
133
-
134
- if vision_tower.input_image_size != image_input_size:
135
- resized_x = F.interpolate(x.float(),
136
- size=(vision_tower.input_image_size, vision_tower.input_image_size),
137
- mode='bilinear',
138
- align_corners=True).to(dtype=x.dtype)
139
- else:
140
- resized_x = x
141
-
142
- feature = vision_tower(resized_x)
143
-
144
- # if len(feature.shape) == 3: # b, n, c
145
- # b, n, c = feature.shape
146
- # if n == self.num_tokens:
147
- # features.append(feature)
148
- # continue
149
- # w = h = int(n**0.5)
150
- # feature = feature.transpose(1,2).reshape(b, c, h, w)
151
- # else:
152
- # b, c, h, w = feature.shape
153
- features[vision_tower.name] = feature
154
-
155
- else:
156
- assert isinstance(x, dict), "x is expected to be a dict but got {}".format(type(x))
157
- pixel_values = x['pixel_values']
158
- num_patches = x['num_patches'] # number of patches per sample, as counted by padding tokens in the texts
159
-
160
- # calculate the real image patches
161
- if self.moe_version_type == 'seq_concat':
162
- image_in_num_patches = [i-1 for i in num_patches]
163
- else:
164
- image_in_num_patches = [i for i in num_patches]
165
-
166
-
167
- assert sum(image_in_num_patches) == pixel_values.size(0), "sum(image_in_num_patches) ({}) != pixel_values.size(0) ({})".format(sum(image_in_num_patches), pixel_values.size(0))
168
-
169
- # find the thumbnail image id
170
- thumbnail_image_id = torch.cumsum(torch.tensor(image_in_num_patches).to(pixel_values.device), 0) - 1
171
- image_no_tiling = pixel_values[thumbnail_image_id]
172
-
173
- # By default, we use the 1st vision_tower for x, others for x_nt
174
- features = []
175
- for layer_id, vision_tower in enumerate(self.vision_towers):
176
- if layer_id == 0:
177
- x = pixel_values
178
- else:
179
- x = image_no_tiling
180
-
181
- if vision_tower.input_image_size != self.input_image_size:
182
- resized_x = F.interpolate(x.float(),
183
- size=(vision_tower.input_image_size, vision_tower.input_image_size),
184
- mode='bilinear',
185
- align_corners=True).to(dtype=x.dtype)
186
- else:
187
- resized_x = x
188
-
189
- feature = vision_tower(resized_x)
190
- if len(feature.shape) == 3: # b, n, c
191
- b, n, c = feature.shape
192
- if n == self.num_tokens:
193
- features.append(feature)
194
- continue
195
-
196
- w = h = int(n**0.5)
197
- feature = feature.transpose(1,2).reshape(b, c, h, w)
198
- else:
199
- b, c, h, w = feature.shape
200
-
201
- if w != self.grid_size:
202
- feature = F.interpolate(feature.float(), size=(self.grid_size, self.grid_size), mode='bilinear', align_corners=True).to(dtype=x.dtype)
203
- features.append(feature.flatten(2,3).transpose(1,2))
204
-
205
- clip_embeds = features[0]
206
- if len(features) <= 1:
207
- no_tiling_embeds = None
208
- else:
209
- no_tiling_embeds = torch.cat(features[1:], dim=-1)
210
-
211
- if self.moe_version_type == 'feat_concat':
212
- # concat thumbnail images features together
213
- clip_thumbnail_embeds = clip_embeds[thumbnail_image_id]
214
- if no_tiling_embeds is not None:
215
- no_tiling_embeds = torch.cat([clip_thumbnail_embeds, no_tiling_embeds], dim=-1)
216
- else:
217
- no_tiling_embeds = clip_thumbnail_embeds
218
-
219
- # extra patch features
220
- clip_embeds_mask = ~torch.isin(torch.arange(clip_embeds.shape[0]).to(clip_embeds.device), thumbnail_image_id)
221
- clip_embeds = clip_embeds[clip_embeds_mask]
222
-
223
-
224
- features = {
225
- 'clip_embeds': clip_embeds,
226
- 'no_tiling_embeds': no_tiling_embeds,
227
- 'num_patches': num_patches
228
- }
229
-
230
- # features is a Tensor if not clip_tiling_only
231
-
232
- return features
233
-
234
- @property
235
- def dummy_feature(self):
236
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
237
-
238
- @property
239
- def dtype(self):
240
- return next(self.clip_vision_tower.parameters()).dtype
241
-
242
- @property
243
- def device(self):
244
- return next(self.clip_vision_tower.parameters()).device
245
-
246
- @property
247
- def config(self):
248
- raise NotImplementedError
249
- pass
250
-
251
- @property
252
- def hidden_size(self):
253
- if self.moe_version_type == 'convnext_512_siglip_448':
254
- res = {}
255
- for vision_tower in self.vision_towers:
256
- res[vision_tower.name] = vision_tower.hidden_size
257
- return res
258
- else:
259
- return sum([_.hidden_size for _ in self.vision_towers])
260
-
261
- @property
262
- def num_patches(self):
263
- return self.num_tokens
264
-
265
-
266
-
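For reference, a minimal sketch of what the default path above (`moe_version_type` of `None` or `'all_tiling'`) does with each backbone output: resize every feature map to a common token grid and concatenate along the channel dimension. The helper name and the example shapes are illustrative only.

```python
import torch
import torch.nn.functional as F

def concat_backbone_features(feature_maps, grid_size=32):
    """Resize each (B, C_i, H_i, W_i) map to grid_size x grid_size and concatenate
    along channels, returning tokens of shape (B, grid_size**2, sum(C_i))."""
    tokens = []
    for feat in feature_maps:
        if feat.shape[-1] != grid_size:
            feat = F.interpolate(feat.float(), size=(grid_size, grid_size),
                                 mode='bilinear', align_corners=True)
        tokens.append(feat.flatten(2, 3).transpose(1, 2))  # (B, H*W, C_i)
    return torch.cat(tokens, dim=-1)

# e.g. a 1024-channel map at 32x32 and a 1152-channel map at 16x16 -> (2, 1024, 2176)
out = concat_backbone_features([torch.randn(2, 1024, 32, 32), torch.randn(2, 1152, 16, 16)])
```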
 
multi_backbone_channel_concatentation_model.py DELETED
@@ -1,95 +0,0 @@
1
- # --------------------------------------------------------
2
- # Eagle2
3
- # Copyright (c) 2025 NVIDIA
4
- # Licensed under The Apache License [see LICENSE for details]
5
- # --------------------------------------------------------
6
-
7
- import torch.nn as nn
8
-
9
- from transformers.modeling_outputs import BaseModelOutputWithPooling
10
- from typing import Optional, Tuple, Union
11
-
12
- from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
13
- from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
14
-
15
-
16
- class MultiBackboneChannelConcatenationVisionModel(nn.Module):
17
-
18
- """
19
- A vision model wrapper that concatenates channels from multiple backbones.
20
-
21
- Args:
22
- config (MultiBackboneChannelConcatenationVisionModelConfig): The configuration for the model.
23
-
24
- Attributes:
25
- vision_model (MultiBackboneChannelConcatenationVisionTower): The vision tower that performs the channel concatenation.
26
-
27
- Notes:
28
- **This class does not inherit from PreTrainedModel in transformers**
29
-
30
- """
31
-
32
- config_class = MultiBackboneChannelConcatenationVisionModelConfig
33
- main_input_name = "pixel_values"
34
-
35
- def __init__(self, config: MultiBackboneChannelConcatenationVisionModelConfig, raw_config):
36
- super().__init__()
37
-
38
- self.vision_model = MultiBackboneChannelConcatenationVisionTower(
39
- vision_tower=config.vision_tower,
40
- args=config,
41
- grid_size=config.grid_size,
42
- convnext_img_size=config.convnext_img_size,
43
- normalize_type=config.normalize_type,
44
- raw_config=raw_config
45
- )
46
-
47
-
48
- def get_input_embeddings(self):
49
- # You might need to adjust this depending on how you want to handle input embeddings
50
- return self.vision_model.vision_towers[0].get_input_embeddings()
51
-
52
- def forward(
53
- self,
54
- pixel_values,
55
- return_dict: Optional[bool] = True,
56
- output_hidden_states: Optional[bool] = False,
57
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
58
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
59
-
60
- assert return_dict is True, "We only support return_dict"
61
- assert output_hidden_states is False, "We do not support output_hidden_states"
62
-
63
- features = self.vision_model(pixel_values)
64
-
65
- # We only support features as model outputs
66
- return BaseModelOutputWithPooling(
67
- last_hidden_state=features,
68
- pooler_output=None,
69
- hidden_states=None,
70
- attentions=None,
71
- )
72
-
73
- @property
74
- def dummy_feature(self):
75
- return self.vision_model.dummy_feature
76
-
77
- @property
78
- def dtype(self):
79
- return self.vision_model.dtype
80
-
81
- @property
82
- def device(self):
83
- return self.vision_model.device
84
-
85
- @property
86
- def config(self):
87
- return self.vision_model.config
88
-
89
- @property
90
- def hidden_size(self):
91
- return self.vision_model.hidden_size
92
-
93
- @property
94
- def num_patches(self):
95
- return self.vision_model.num_patches
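For reference, the wrapper above performs no pooling; it simply exposes whatever the tower returns as `last_hidden_state`. A minimal sketch of that contract, with an illustrative tensor shape:

```python
import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling

features = torch.randn(2, 1024, 2176)  # (batch, tokens, concatenated hidden size), illustrative
outputs = BaseModelOutputWithPooling(last_hidden_state=features, pooler_output=None)
assert outputs.last_hidden_state is features and outputs.pooler_output is None
```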
 
siglip_vision_tower.py DELETED
@@ -1,93 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.utils.checkpoint import checkpoint
4
-
5
- from .modeling_siglip import SiglipVisionModel
6
- from .configuration_siglip import SiglipVisionConfig
7
-
8
- import math
9
- import torch
10
- import torch.nn.functional as F
11
- from typing import List, Optional
12
- import os
13
-
14
- class SiglipVisionTower(nn.Module):
15
- # We use the same wrapper as the default clip encoder.
16
- # See `clip_encoder.py` in the same folder
17
- def __init__(self, vision_tower, args, delay_load=False, raw_config=None):
18
- super().__init__()
19
-
20
- self.is_loaded = False
21
- self.freeze_vision=args.freeze_vision
22
- self.input_image_size=args.input_image_size
23
- self.vision_tower_name = vision_tower
24
- self.select_layer = args.mm_vision_select_layer
25
- self.name = 'siglip'
26
- self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
27
- self.delay_load = delay_load
28
- self.raw_config = raw_config
29
- if not delay_load:
30
- self.load_model()
31
- else:
32
- if os.path.isfile(self.vision_tower_name):
33
- self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name, local_files_only=True)
34
- else:
35
- self.cfg_only = SiglipVisionConfig(**self.raw_config.vision_config.siglip_vision_config)
36
-
37
-
38
- def load_model(self):
39
- if self.is_loaded:
40
- print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
41
- return
42
-
43
- # self.image_processor = SiglipImageProcessor(size=1024)
44
- # self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, local_files_only=True, torch_dtype=torch.bfloat16)
45
- if self.delay_load:
46
- # cfg = SiglipVisionConfig.from_pretrained(self.vision_tower_name, local_files_only=True)
47
- self.vision_tower = SiglipVisionModel(self.cfg_only)
48
- else:
49
- self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, local_files_only=True)
50
-
51
- if self.freeze_vision:
52
- self.vision_tower.requires_grad_(False)
53
-
54
- self.vision_tower.vision_model.encoder.gradient_checkpointing = True
55
- self.is_loaded = True
56
-
57
- def forward(self, images):
58
- return self.vision_tower(
59
- pixel_values=images,
60
- output_hidden_states=False,
61
- return_dict=True).last_hidden_state
62
-
63
-
64
- @property
65
- def dummy_feature(self):
66
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
67
-
68
- @property
69
- def dtype(self):
70
- return self.vision_tower.dtype
71
-
72
- @property
73
- def device(self):
74
- return self.vision_tower.device
75
-
76
- @property
77
- def config(self):
78
- if self.is_loaded:
79
- return self.vision_tower.config
80
- else:
81
- return self.cfg_only
82
-
83
- @property
84
- def hidden_size(self):
85
- return self.config.hidden_size
86
-
87
- @property
88
- def num_patches_per_side(self):
89
- return self.config.image_size // self.config.patch_size
90
-
91
- @property
92
- def num_patches(self):
93
- return (self.config.image_size // self.config.patch_size) ** 2
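For reference, the deleted wrapper above boils down to calling `SiglipVisionModel` and keeping `last_hidden_state`. A minimal sketch of the equivalent call against the upstream class, using the checkpoint name that appears in the docstrings earlier in this diff (random pixel values stand in for real preprocessing):

```python
import torch
from transformers import SiglipVisionModel

vision_tower = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
pixel_values = torch.randn(1, 3, 224, 224)  # normally produced by the image processor
with torch.no_grad():
    patch_features = vision_tower(pixel_values=pixel_values, return_dict=True).last_hidden_state
print(patch_features.shape)  # (1, 196, 768) for a 224px image with 16px patches
```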
 
tokenization_qwen2.py DELETED
@@ -1,345 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Tokenization classes for Qwen2."""
16
-
17
- import json
18
- import os
19
- import unicodedata
20
- from functools import lru_cache
21
- from typing import Optional, Tuple
22
-
23
- import regex as re
24
-
25
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
26
- from transformers.utils import logging
27
-
28
-
29
- logger = logging.get_logger(__name__)
30
-
31
- VOCAB_FILES_NAMES = {
32
- "vocab_file": "vocab.json",
33
- "merges_file": "merges.txt",
34
- }
35
-
36
- PRETRAINED_VOCAB_FILES_MAP = {
37
- "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
38
- "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
39
- }
40
-
41
- MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
42
-
43
- PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
44
-
45
-
46
- @lru_cache()
47
- # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
48
- def bytes_to_unicode():
49
- """
50
- Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
51
- characters the bpe code barfs on.
52
-
53
- The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
54
- if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
55
- decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
56
- tables between utf-8 bytes and unicode strings.
57
- """
58
- bs = (
59
- list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
60
- )
61
- cs = bs[:]
62
- n = 0
63
- for b in range(2**8):
64
- if b not in bs:
65
- bs.append(b)
66
- cs.append(2**8 + n)
67
- n += 1
68
- cs = [chr(n) for n in cs]
69
- return dict(zip(bs, cs))
70
-
71
-
72
- # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
73
- def get_pairs(word):
74
- """
75
- Return set of symbol pairs in a word.
76
-
77
- Word is represented as tuple of symbols (symbols being variable-length strings).
78
- """
79
- pairs = set()
80
- prev_char = word[0]
81
- for char in word[1:]:
82
- pairs.add((prev_char, char))
83
- prev_char = char
84
- return pairs
85
-
86
-
87
- class Qwen2Tokenizer(PreTrainedTokenizer):
88
- """
89
- Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
90
-
91
- As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
92
- be encoded differently whether it is at the beginning of the sentence (without space) or not:
93
-
94
- ```python
95
- >>> from transformers import Qwen2Tokenizer
96
-
97
- >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
98
- >>> tokenizer("Hello world")["input_ids"]
99
- [9707, 1879]
100
-
101
- >>> tokenizer(" Hello world")["input_ids"]
102
- [21927, 1879]
103
- ```
104
- This is expected.
105
-
106
- You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
107
-
108
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
109
- this superclass for more information regarding those methods.
110
-
111
- Args:
112
- vocab_file (`str`):
113
- Path to the vocabulary file.
114
- merges_file (`str`):
115
- Path to the merges file.
116
- errors (`str`, *optional*, defaults to `"replace"`):
117
- Paradigm to follow when decoding bytes to UTF-8. See
118
- [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
119
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
120
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
121
- token instead.
122
- bos_token (`str`, *optional*):
123
- The beginning of sequence token. Not applicable for this tokenizer.
124
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
125
- The end of sequence token.
126
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
127
- The token used for padding, for example when batching sequences of different lengths.
128
- clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
129
- Whether or not the model should cleanup the spaces that were added when splitting the input text during the
130
- tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
131
- split_special_tokens (`bool`, *optional*, defaults to `False`):
132
- Whether or not the special tokens should be split during the tokenization process. The default behavior is
133
- to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>")` =
134
- `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
135
- '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
136
- """
137
-
138
- vocab_files_names = VOCAB_FILES_NAMES
139
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140
- max_model_input_sizes = MAX_MODEL_INPUT_SIZES
141
- model_input_names = ["input_ids", "attention_mask"]
142
-
143
- def __init__(
144
- self,
145
- vocab_file,
146
- merges_file,
147
- errors="replace",
148
- unk_token="<|endoftext|>",
149
- bos_token=None,
150
- eos_token="<|endoftext|>",
151
- pad_token="<|endoftext|>",
152
- clean_up_tokenization_spaces=False,
153
- split_special_tokens=False,
154
- **kwargs,
155
- ):
156
- # Qwen vocab does not contain control tokens; added tokens need to be special
157
- bos_token = (
158
- AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
159
- if isinstance(bos_token, str)
160
- else bos_token
161
- )
162
- eos_token = (
163
- AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
164
- if isinstance(eos_token, str)
165
- else eos_token
166
- )
167
- unk_token = (
168
- AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
169
- if isinstance(unk_token, str)
170
- else unk_token
171
- )
172
- pad_token = (
173
- AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
174
- if isinstance(pad_token, str)
175
- else pad_token
176
- )
177
-
178
- with open(vocab_file, encoding="utf-8") as vocab_handle:
179
- self.encoder = json.load(vocab_handle)
180
- self.decoder = {v: k for k, v in self.encoder.items()}
181
- self.errors = errors # how to handle errors in decoding
182
- self.byte_encoder = bytes_to_unicode()
183
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
184
- bpe_merges = []
185
- with open(merges_file, encoding="utf-8") as merges_handle:
186
- for line in merges_handle:
187
- line = line.strip()
188
- if not line or line.startswith("#"):
189
- continue
190
- bpe_merges.append(tuple(line.split()))
191
- self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
192
- # NOTE: the cache can grow without bound and will get really large for long running processes
193
- # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
194
- # not a memory leak but appears as one.
195
- # GPT2Tokenizer has the same problem, so let's be consistent.
196
- self.cache = {}
197
-
198
- self.pat = re.compile(PRETOKENIZE_REGEX)
199
-
200
- if kwargs.get("add_prefix_space", False):
201
- logger.warning_once(
202
- f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
203
- )
204
-
205
- super().__init__(
206
- errors=errors,
207
- bos_token=bos_token,
208
- eos_token=eos_token,
209
- pad_token=pad_token,
210
- unk_token=unk_token,
211
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
212
- split_special_tokens=split_special_tokens,
213
- **kwargs,
214
- )
215
-
216
- @property
217
- def vocab_size(self) -> int:
218
- return len(self.encoder)
219
-
220
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
221
- def get_vocab(self):
222
- return dict(self.encoder, **self.added_tokens_encoder)
223
-
224
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
225
- def bpe(self, token):
226
- if token in self.cache:
227
- return self.cache[token]
228
- word = tuple(token)
229
- pairs = get_pairs(word)
230
-
231
- if not pairs:
232
- return token
233
-
234
- while True:
235
- bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
236
- if bigram not in self.bpe_ranks:
237
- break
238
- first, second = bigram
239
- new_word = []
240
- i = 0
241
- while i < len(word):
242
- try:
243
- j = word.index(first, i)
244
- except ValueError:
245
- new_word.extend(word[i:])
246
- break
247
- else:
248
- new_word.extend(word[i:j])
249
- i = j
250
-
251
- if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
252
- new_word.append(first + second)
253
- i += 2
254
- else:
255
- new_word.append(word[i])
256
- i += 1
257
- new_word = tuple(new_word)
258
- word = new_word
259
- if len(word) == 1:
260
- break
261
- else:
262
- pairs = get_pairs(word)
263
- word = " ".join(word)
264
- self.cache[token] = word
265
- return word
266
-
267
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
268
- def _tokenize(self, text):
269
- """Tokenize a string."""
270
- bpe_tokens = []
271
- for token in re.findall(self.pat, text):
272
- token = "".join(
273
- self.byte_encoder[b] for b in token.encode("utf-8")
274
- ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
275
- bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
276
- return bpe_tokens
277
-
278
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
279
- def _convert_token_to_id(self, token):
280
- """Converts a token (str) in an id using the vocab."""
281
- return self.encoder.get(token, self.encoder.get(self.unk_token))
282
-
283
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
284
- def _convert_id_to_token(self, index):
285
- """Converts an index (integer) in a token (str) using the vocab."""
286
- return self.decoder.get(index)
287
-
288
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
289
- def convert_tokens_to_string(self, tokens):
290
- """Converts a sequence of tokens (string) in a single string."""
291
- text = "".join(tokens)
292
- text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
293
- return text
294
-
295
- def decode(
296
- self,
297
- token_ids,
298
- skip_special_tokens: bool = False,
299
- clean_up_tokenization_spaces: Optional[bool] = False,
300
- spaces_between_special_tokens: bool = False,
301
- **kwargs,
302
- ) -> str:
303
- # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
304
- # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
305
- return super().decode(
306
- token_ids,
307
- skip_special_tokens=skip_special_tokens,
308
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
309
- spaces_between_special_tokens=spaces_between_special_tokens,
310
- **kwargs,
311
- )
312
-
313
- # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
314
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
315
- if not os.path.isdir(save_directory):
316
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
317
- return
318
- vocab_file = os.path.join(
319
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
320
- )
321
- merge_file = os.path.join(
322
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
323
- )
324
-
325
- with open(vocab_file, "w", encoding="utf-8") as f:
326
- f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
327
-
328
- index = 0
329
- with open(merge_file, "w", encoding="utf-8") as writer:
330
- writer.write("#version: 0.2\n")
331
- for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
332
- if index != token_index:
333
- logger.warning(
334
- f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
335
- " Please check that the tokenizer is not corrupted!"
336
- )
337
- index = token_index
338
- writer.write(" ".join(bpe_tokens) + "\n")
339
- index += 1
340
-
341
- return vocab_file, merge_file
342
-
343
- def prepare_for_tokenization(self, text, **kwargs):
344
- text = unicodedata.normalize("NFC", text)
345
- return (text, kwargs)
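For reference, a minimal sketch of the byte-to-unicode round trip that the tokenizer above performs before and after BPE, reusing the upstream GPT-2 helper that the deleted `bytes_to_unicode` is copied from:

```python
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

text = "Hello world"
mapped = "".join(byte_encoder[b] for b in text.encode("utf-8"))  # printable proxy string fed to BPE
restored = bytearray(byte_decoder[c] for c in mapped).decode("utf-8")
assert restored == text
```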
 
tokenization_qwen2_fast.py DELETED
@@ -1,143 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Tokenization classes for Qwen2."""
16
-
17
- from typing import Optional, Tuple
18
-
19
- from transformers.tokenization_utils import AddedToken
20
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
- from transformers.utils import logging
22
- from .tokenization_qwen2 import Qwen2Tokenizer
23
-
24
-
25
- logger = logging.get_logger(__name__)
26
-
27
- VOCAB_FILES_NAMES = {
28
- "vocab_file": "vocab.json",
29
- "merges_file": "merges.txt",
30
- "tokenizer_file": "tokenizer.json",
31
- }
32
-
33
- PRETRAINED_VOCAB_FILES_MAP = {
34
- "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
35
- "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
36
- "tokenizer_file": {
37
- "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json"
38
- },
39
- }
40
-
41
- MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
42
-
43
-
44
- class Qwen2TokenizerFast(PreTrainedTokenizerFast):
45
- """
46
- Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
47
- Byte-Pair-Encoding.
48
-
49
- As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
50
- be encoded differently whether it is at the beginning of the sentence (without space) or not:
51
-
52
- ```python
53
- >>> from transformers import Qwen2TokenizerFast
54
-
55
- >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
56
- >>> tokenizer("Hello world")["input_ids"]
57
- [9707, 1879]
58
-
59
- >>> tokenizer(" Hello world")["input_ids"]
60
- [21927, 1879]
61
- ```
62
- This is expected.
63
-
64
- This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
65
- refer to this superclass for more information regarding those methods.
66
-
67
- Args:
68
- vocab_file (`str`, *optional*):
69
- Path to the vocabulary file.
70
- merges_file (`str`, *optional*):
71
- Path to the merges file.
72
- tokenizer_file (`str`, *optional*):
73
- Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
74
- contains everything needed to load the tokenizer.
75
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
76
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
77
- token instead. Not applicable to this tokenizer.
78
- bos_token (`str`, *optional*):
79
- The beginning of sequence token. Not applicable for this tokenizer.
80
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
81
- The end of sequence token.
82
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
83
- The token used for padding, for example when batching sequences of different lengths.
84
- """
85
-
86
- vocab_files_names = VOCAB_FILES_NAMES
87
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
88
- max_model_input_sizes = MAX_MODEL_INPUT_SIZES
89
- model_input_names = ["input_ids", "attention_mask"]
90
- slow_tokenizer_class = Qwen2Tokenizer
91
-
92
- def __init__(
93
- self,
94
- vocab_file=None,
95
- merges_file=None,
96
- tokenizer_file=None,
97
- unk_token="<|endoftext|>",
98
- bos_token=None,
99
- eos_token="<|endoftext|>",
100
- pad_token="<|endoftext|>",
101
- **kwargs,
102
- ):
103
- # We need to at least pass vocab_file and merges_file to base class
104
- # in case a slow tokenizer needs to be initialized; others can be
105
- # configured through files.
106
- # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token
107
-
108
- bos_token = (
109
- AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
110
- if isinstance(bos_token, str)
111
- else bos_token
112
- )
113
- eos_token = (
114
- AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
115
- if isinstance(eos_token, str)
116
- else eos_token
117
- )
118
- unk_token = (
119
- AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
120
- if isinstance(unk_token, str)
121
- else unk_token
122
- )
123
- pad_token = (
124
- AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
125
- if isinstance(pad_token, str)
126
- else pad_token
127
- )
128
-
129
- super().__init__(
130
- vocab_file,
131
- merges_file,
132
- tokenizer_file=tokenizer_file,
133
- unk_token=unk_token,
134
- bos_token=bos_token,
135
- eos_token=eos_token,
136
- pad_token=pad_token,
137
- **kwargs,
138
- )
139
-
140
- # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
141
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
142
- files = self._tokenizer.model.save(save_directory, name=filename_prefix)
143
- return tuple(files)
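For reference, a minimal usage sketch matching the docstring above; `Qwen/Qwen-tokenizer` is the checkpoint the docstring itself references:

```python
from transformers import Qwen2TokenizerFast

tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
print(tokenizer("Hello world")["input_ids"])   # [9707, 1879], as shown in the docstring
print(tokenizer(" Hello world")["input_ids"])  # [21927, 1879]; the leading space changes the ids
```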