Commit 7a8fd12 by kvaishnavi
Parent(s): 3d69e6d

Fix naming styles

onnx/config.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16fb355ba07bea3ffdf794f297f2005aee4f4ee6aba9742e264ad4471535e966
3
- size 4585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a51e83822b43cce5531974027de76f26710fee3095c065213607bb3557276c9
3
+ size 4629
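The onnx/config.json change above is a Git LFS pointer update: the `oid` is the SHA-256 of the file contents and `size` is its byte count, so only the pointer shows up in the diff. A minimal sketch for checking a locally materialized copy against the new pointer (assumes the repo was cloned with git-lfs so the real JSON, not the pointer text, is on disk):

```python
import hashlib

# Hash the materialized onnx/config.json and compare it against the new LFS pointer.
with open("onnx/config.json", "rb") as f:
    data = f.read()

print(len(data))                         # expect 4629, the pointer's "size"
print(hashlib.sha256(data).hexdigest())  # expect the "oid" sha256 value from the new pointer above
```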
onnx/{modeling_phio.py → modeling_phi4mm.py} RENAMED
@@ -13,7 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- """ PyTorch Phi-O model."""
17
  import os
18
  import math
19
  import warnings
@@ -48,8 +48,8 @@ from transformers.utils import (
48
  )
49
  from transformers import PretrainedConfig
50
 
51
- from .configuration_phio import PhiOConfig
52
- from .processing_phio import InputMode
53
  from .vision_siglip_navit import get_siglip_vision_model
54
  from .speech_conformer_encoder import ConformerEncoder
55
 
@@ -57,7 +57,7 @@ from .speech_conformer_encoder import ConformerEncoder
57
  logger = logging.get_logger(__name__)
58
 
59
  _CHECKPOINT_FOR_DOC = "TBA"
60
- _CONFIG_FOR_DOC = "PhiOConfig"
61
 
62
  # Special token ids
63
  _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
@@ -194,8 +194,8 @@ def select_logic(hidden_states: torch.FloatTensor, features: torch.FloatTensor,
194
  return hidden_states
195
 
196
 
197
- class PhiOEmbedding(nn.Module):
198
- """Phi-O embedding for text-only, vision + text, speech + text, and vision + speech + text"""
199
  def __init__(self, wte):
200
  super().__init__()
201
  self.wte = wte
@@ -234,7 +234,7 @@ class PhiOEmbedding(nn.Module):
234
  return hidden_states
235
 
236
 
237
- class PhiOImageEmbedding(nn.Module):
238
  """Image embedding."""
239
 
240
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -666,7 +666,7 @@ class PhiOImageEmbedding(nn.Module):
666
  return image_features_proj.squeeze()
667
 
668
 
669
- class PhiOAudioEmbedding(nn.Module):
670
  """Audio embedding."""
671
 
672
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -746,7 +746,7 @@ class PhiOAudioEmbedding(nn.Module):
746
  self.audio_embed_sizes = None
747
 
748
  def post_init(self, audio_config):
749
- # execute after the from_pretrained() initialization of the phio model
750
  if audio_config.get('name', None) == "cascades":
751
  init_model_config = audio_config.get("init_model", {})
752
  self.encoder.post_init(init_model_config)
@@ -891,7 +891,7 @@ class PhiOAudioEmbedding(nn.Module):
891
  return audio_features_proj
892
 
893
 
894
- class PhiOImageAudioEmbedding(nn.Module):
895
  """Image-audio embedding."""
896
 
897
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -904,9 +904,9 @@ class PhiOImageAudioEmbedding(nn.Module):
904
  assert self.image_input_id != self.audio_input_id, 'image_input_id and audio_input_id should be different'
905
 
906
  self.image_embd_layer_kwargs = kwargs['image_embd_layer']
907
- self.image_embed = PhiOImageEmbedding(config, **self.image_embd_layer_kwargs)
908
  self.audio_embd_layer_kwargs = kwargs['audio_embd_layer']
909
- self.audio_embed = PhiOAudioEmbedding(config, **self.audio_embd_layer_kwargs)
910
 
911
  self.input_image_embeds = None
912
  self.image_sizes = None
@@ -1035,10 +1035,10 @@ class PhiOImageAudioEmbedding(nn.Module):
1035
 
1036
 
1037
  # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
1038
- class PhiORMSNorm(nn.Module):
1039
  def __init__(self, hidden_size, eps=1e-6):
1040
  """
1041
- PhiORMSNorm is equivalent to T5LayerNorm
1042
  """
1043
  super().__init__()
1044
  self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -1056,7 +1056,7 @@ class PhiORMSNorm(nn.Module):
1056
 
1057
 
1058
  # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
1059
- class PhiORotaryEmbedding(nn.Module):
1060
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
1061
  super().__init__()
1062
 
@@ -1085,11 +1085,11 @@ class PhiORotaryEmbedding(nn.Module):
1085
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1086
 
1087
 
1088
- class PhiOSuScaledRotaryEmbedding(PhiORotaryEmbedding):
1089
  def __init__(self, dim, config, device=None):
1090
  warnings.warn(
1091
- "The class PhiOSuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
1092
- " use PhiOLongRoPEScaledRotaryEmbedding instead.",
1093
  FutureWarning,
1094
  )
1095
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1126,10 +1126,10 @@ class PhiOSuScaledRotaryEmbedding(PhiORotaryEmbedding):
1126
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1127
 
1128
 
1129
- class PhiOYarnScaledRotaryEmbedding(PhiORotaryEmbedding):
1130
  def __init__(self, dim, config, device=None):
1131
  warnings.warn(
1132
- "The class PhiOYarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
1133
  FutureWarning,
1134
  )
1135
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1171,7 +1171,7 @@ class PhiOYarnScaledRotaryEmbedding(PhiORotaryEmbedding):
1171
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1172
 
1173
 
1174
- class PhiOLongRoPEScaledRotaryEmbedding(PhiORotaryEmbedding):
1175
  def __init__(self, dim, config, device=None):
1176
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
1177
 
@@ -1252,7 +1252,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
1252
  return q_embed, k_embed
1253
 
1254
 
1255
- class PhiOMLP(nn.Module):
1256
  def __init__(self, config):
1257
  super().__init__()
1258
 
@@ -1284,10 +1284,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
1284
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1285
 
1286
 
1287
- class PhiOAttention(nn.Module):
1288
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1289
 
1290
- def __init__(self, config: PhiOConfig, layer_idx: Optional[int] = None):
1291
  super().__init__()
1292
  self.config = config
1293
  self.layer_idx = layer_idx
@@ -1324,7 +1324,7 @@ class PhiOAttention(nn.Module):
1324
 
1325
  def _init_rope(self):
1326
  if self.rope_scaling is None:
1327
- self.rotary_emb = PhiORotaryEmbedding(
1328
  self.rotary_ndims,
1329
  max_position_embeddings=self.max_position_embeddings,
1330
  base=self.rope_theta,
@@ -1332,7 +1332,7 @@ class PhiOAttention(nn.Module):
1332
  else:
1333
  scaling_type = self.config.rope_scaling["type"]
1334
  if scaling_type == "longrope":
1335
- self.rotary_emb = PhiOLongRoPEScaledRotaryEmbedding(self.rotary_ndims, self.config)
1336
  else:
1337
  raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
1338
 
@@ -1410,9 +1410,9 @@ class PhiOAttention(nn.Module):
1410
  return attn_output, attn_weights, past_key_value
1411
 
1412
 
1413
- class PhiOFlashAttention2(PhiOAttention):
1414
  """
1415
- Phi-O flash attention module. This module inherits from `PhiOAttention` as the weights of the module stays
1416
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
1417
  flash attention and deal with padding tokens in case the input contains any of them.
1418
  """
@@ -1436,7 +1436,7 @@ class PhiOFlashAttention2(PhiOAttention):
1436
  use_cache: bool = False,
1437
  cache_position: Optional[torch.LongTensor] = None,
1438
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1439
- # PhiOFlashAttention2 attention does not support output_attentions
1440
 
1441
  output_attentions = False
1442
 
@@ -1538,14 +1538,14 @@ class PhiOFlashAttention2(PhiOAttention):
1538
 
1539
  # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi
1540
  # TODO @Arthur no longer copied from LLama after static cache
1541
- class PhiOSdpaAttention(PhiOAttention):
1542
  """
1543
- PhiO attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
1544
- `PhiOAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
1545
  SDPA API.
1546
  """
1547
 
1548
- # Adapted from PhiOAttention.forward
1549
  def forward(
1550
  self,
1551
  hidden_states: torch.Tensor,
@@ -1559,7 +1559,7 @@ class PhiOSdpaAttention(PhiOAttention):
1559
  if output_attentions:
1560
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1561
  logger.warning_once(
1562
- "PhiOModel is using PhiOSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1563
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1564
  )
1565
  return super().forward(
@@ -1630,26 +1630,26 @@ class PhiOSdpaAttention(PhiOAttention):
1630
  return attn_output, None, past_key_value
1631
 
1632
 
1633
- PHIO_ATTENTION_CLASSES = {
1634
- "eager": PhiOAttention,
1635
- "flash_attention_2": PhiOFlashAttention2,
1636
- "sdpa": PhiOSdpaAttention,
1637
  }
1638
 
1639
 
1640
- class PhiODecoderLayer(nn.Module):
1641
- def __init__(self, config: PhiOConfig, layer_idx: int):
1642
  super().__init__()
1643
 
1644
  self.config = config
1645
- self.self_attn = PHIO_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
1646
 
1647
- self.mlp = PhiOMLP(config)
1648
- self.input_layernorm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1649
 
1650
  self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
1651
  self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
1652
- self.post_attention_layernorm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1653
 
1654
  def forward(
1655
  self,
@@ -1718,7 +1718,7 @@ class PhiODecoderLayer(nn.Module):
1718
  return outputs
1719
 
1720
 
1721
- PHIO_START_DOCSTRING = r"""
1722
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1723
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1724
  etc.)
@@ -1728,7 +1728,7 @@ PHIO_START_DOCSTRING = r"""
1728
  and behavior.
1729
 
1730
  Parameters:
1731
- config ([`PhiOConfig`]):
1732
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1733
  load the weights associated with the model, only the configuration. Check out the
1734
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1737,13 +1737,13 @@ PHIO_START_DOCSTRING = r"""
1737
 
1738
  @add_start_docstrings(
1739
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1740
- PHIO_START_DOCSTRING,
1741
  )
1742
- class PhiOPreTrainedModel(PreTrainedModel):
1743
- config_class = PhiOConfig
1744
  base_model_prefix = "model"
1745
  supports_gradient_checkpointing = True
1746
- _no_split_modules = ["PhiODecoderLayer"]
1747
  _skip_keys_device_placement = "past_key_values"
1748
  _supports_flash_attn_2 = True
1749
  _supports_sdpa = True
@@ -1763,7 +1763,7 @@ class PhiOPreTrainedModel(PreTrainedModel):
1763
  module.weight.data[module.padding_idx].zero_()
1764
 
1765
 
1766
- PHIO_INPUTS_DOCSTRING = r"""
1767
  Args:
1768
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1769
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1840,24 +1840,24 @@ PHIO_INPUTS_DOCSTRING = r"""
1840
 
1841
  @add_start_docstrings(
1842
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1843
- PHIO_START_DOCSTRING,
1844
  )
1845
- class PhiOModel(PhiOPreTrainedModel):
1846
  """
1847
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiODecoderLayer`]
1848
 
1849
  Args:
1850
- config: PhiOConfig
1851
  """
1852
 
1853
- def __init__(self, config: PhiOConfig):
1854
  super().__init__(config)
1855
  self.padding_idx = config.pad_token_id
1856
  self.vocab_size = config.vocab_size
1857
 
1858
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1859
  self.embed_dropout = nn.Dropout(config.embd_pdrop)
1860
- self.combined_embed = PhiOEmbedding(self.embed_tokens)
1861
 
1862
  self.embed_tokens_extend = None
1863
  if isinstance(config.embd_layer, dict):
@@ -1865,13 +1865,13 @@ class PhiOModel(PhiOPreTrainedModel):
1865
  'embedding_cls': config.embd_layer['embedding_cls'],
1866
  **config.embd_layer
1867
  }
1868
- self.embed_tokens_extend = PhiOImageAudioEmbedding(config, **embedding_config)
1869
 
1870
  self.layers = nn.ModuleList(
1871
- [PhiODecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1872
  )
1873
  self._attn_implementation = config._attn_implementation
1874
- self.norm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1875
 
1876
  self.gradient_checkpointing = False
1877
  # Initialize weights and apply final processing
@@ -1883,7 +1883,7 @@ class PhiOModel(PhiOPreTrainedModel):
1883
  def set_input_embeddings(self, value):
1884
  self.embed_tokens = value
1885
 
1886
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
1887
  def forward(
1888
  self,
1889
  input_ids: torch.LongTensor = None,
@@ -2109,7 +2109,7 @@ class PhiOModel(PhiOPreTrainedModel):
2109
  device: torch.device,
2110
  cache_position: torch.Tensor,
2111
  batch_size: int,
2112
- config: PhiOConfig,
2113
  past_key_values: Cache,
2114
  ):
2115
  """
@@ -2131,7 +2131,7 @@ class PhiOModel(PhiOPreTrainedModel):
2131
  Indices depicting the position of the input sequence tokens in the sequence.
2132
  batch_size (`torch.Tensor`):
2133
  Batch size.
2134
- config (`PhiOConfig`):
2135
  The model's configuration class
2136
  past_key_values (`Cache`):
2137
  The cache class that is being used currently to generate
@@ -2168,13 +2168,13 @@ class PhiOModel(PhiOPreTrainedModel):
2168
  return causal_mask
2169
 
2170
 
2171
- class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2172
  _tied_weights_keys = ["lm_head.weight"]
2173
 
2174
  # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
2175
  def __init__(self, config):
2176
  super().__init__(config)
2177
- self.model = PhiOModel(config)
2178
  self.vocab_size = config.vocab_size
2179
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2180
 
@@ -2260,7 +2260,7 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2260
  return self.model
2261
 
2262
  # Ignore copy
2263
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2264
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2265
  def forward(
2266
  self,
@@ -2301,9 +2301,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2301
  Example:
2302
 
2303
  ```python
2304
- >>> from transformers import AutoTokenizer, PhiOForCausalLM
2305
 
2306
- >>> model = PhiOForCausalLM.from_pretrained("TBA")
2307
  >>> tokenizer = AutoTokenizer.from_pretrained("TBA")
2308
 
2309
  >>> prompt = "This is an example script ."
@@ -2443,9 +2443,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2443
 
2444
  @add_start_docstrings(
2445
  """
2446
- The [`PhiOModel`] with a sequence classification head on top (linear layer).
2447
 
2448
- [`PhiOForSequenceClassification`] uses the last token in order to do the classification, as other causal models
2449
  (e.g. GPT-2) do.
2450
 
2451
  Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -2454,14 +2454,14 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2454
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
2455
  each row of the batch).
2456
  """,
2457
- PHIO_START_DOCSTRING,
2458
  )
2459
  # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi, LLAMA->PHI, self.transformer->self.model, transformer_outputs->model_outputs
2460
- class PhiOForSequenceClassification(PhiOPreTrainedModel):
2461
  def __init__(self, config):
2462
  super().__init__(config)
2463
  self.num_labels = config.num_labels
2464
- self.model = PhiOModel(config)
2465
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
2466
 
2467
  # Initialize weights and apply final processing
@@ -2473,7 +2473,7 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):
2473
  def set_input_embeddings(self, value):
2474
  self.model.embed_tokens = value
2475
 
2476
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2477
  def forward(
2478
  self,
2479
  input_ids: torch.LongTensor = None,
@@ -2548,18 +2548,18 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):
2548
 
2549
  @add_start_docstrings(
2550
  """
2551
- [`PhiOModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
2552
  Named-Entity-Recognition (NER) tasks.
2553
  """,
2554
- PHIO_START_DOCSTRING,
2555
  )
2556
  # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi,MPT->PHI,self.transformer->self.model,transformer_outputs->model_outputs
2557
- class PhiOForTokenClassification(PhiOPreTrainedModel):
2558
- def __init__(self, config: PhiOConfig):
2559
  super().__init__(config)
2560
  self.num_labels = config.num_labels
2561
 
2562
- self.model = PhiOModel(config)
2563
  if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
2564
  classifier_dropout = config.classifier_dropout
2565
  elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
@@ -2572,7 +2572,7 @@ class PhiOForTokenClassification(PhiOPreTrainedModel):
2572
  # Initialize weights and apply final processing
2573
  self.post_init()
2574
 
2575
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2576
  @add_code_sample_docstrings(
2577
  checkpoint=_CHECKPOINT_FOR_DOC,
2578
  output_type=TokenClassifierOutput,
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
+ """ PyTorch Phi-4-MM model."""
17
  import os
18
  import math
19
  import warnings
 
48
  )
49
  from transformers import PretrainedConfig
50
 
51
+ from .configuration_phi4mm import Phi4MMConfig
52
+ from .processing_phi4mm import InputMode
53
  from .vision_siglip_navit import get_siglip_vision_model
54
  from .speech_conformer_encoder import ConformerEncoder
55
 
 
57
  logger = logging.get_logger(__name__)
58
 
59
  _CHECKPOINT_FOR_DOC = "TBA"
60
+ _CONFIG_FOR_DOC = "Phi4MMConfig"
61
 
62
  # Special token ids
63
  _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
 
194
  return hidden_states
195
 
196
 
197
+ class Phi4MMEmbedding(nn.Module):
198
+ """Phi-4-MM embedding for text-only, vision + text, speech + text, and vision + speech + text"""
199
  def __init__(self, wte):
200
  super().__init__()
201
  self.wte = wte
 
234
  return hidden_states
235
 
236
 
237
+ class Phi4MMImageEmbedding(nn.Module):
238
  """Image embedding."""
239
 
240
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
666
  return image_features_proj.squeeze()
667
 
668
 
669
+ class Phi4MMAudioEmbedding(nn.Module):
670
  """Audio embedding."""
671
 
672
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
746
  self.audio_embed_sizes = None
747
 
748
  def post_init(self, audio_config):
749
+ # execute after the from_pretrained() initialization of the Phi4MM model
750
  if audio_config.get('name', None) == "cascades":
751
  init_model_config = audio_config.get("init_model", {})
752
  self.encoder.post_init(init_model_config)
 
891
  return audio_features_proj
892
 
893
 
894
+ class Phi4MMImageAudioEmbedding(nn.Module):
895
  """Image-audio embedding."""
896
 
897
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
904
  assert self.image_input_id != self.audio_input_id, 'image_input_id and audio_input_id should be different'
905
 
906
  self.image_embd_layer_kwargs = kwargs['image_embd_layer']
907
+ self.image_embed = Phi4MMImageEmbedding(config, **self.image_embd_layer_kwargs)
908
  self.audio_embd_layer_kwargs = kwargs['audio_embd_layer']
909
+ self.audio_embed = Phi4MMAudioEmbedding(config, **self.audio_embd_layer_kwargs)
910
 
911
  self.input_image_embeds = None
912
  self.image_sizes = None
 
1035
 
1036
 
1037
  # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
1038
+ class Phi4MMRMSNorm(nn.Module):
1039
  def __init__(self, hidden_size, eps=1e-6):
1040
  """
1041
+ Phi4MMRMSNorm is equivalent to T5LayerNorm
1042
  """
1043
  super().__init__()
1044
  self.weight = nn.Parameter(torch.ones(hidden_size))
 
1056
 
1057
 
1058
  # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
1059
+ class Phi4MMRotaryEmbedding(nn.Module):
1060
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
1061
  super().__init__()
1062
 
 
1085
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1086
 
1087
 
1088
+ class Phi4MMSuScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1089
  def __init__(self, dim, config, device=None):
1090
  warnings.warn(
1091
+ "The class Phi4MMSuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
1092
+ " use Phi4MMLongRoPEScaledRotaryEmbedding instead.",
1093
  FutureWarning,
1094
  )
1095
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
 
1126
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1127
 
1128
 
1129
+ class Phi4MMYarnScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1130
  def __init__(self, dim, config, device=None):
1131
  warnings.warn(
1132
+ "The class Phi4MMYarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
1133
  FutureWarning,
1134
  )
1135
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
 
1171
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1172
 
1173
 
1174
+ class Phi4MMLongRoPEScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1175
  def __init__(self, dim, config, device=None):
1176
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
1177
 
 
1252
  return q_embed, k_embed
1253
 
1254
 
1255
+ class Phi4MMMLP(nn.Module):
1256
  def __init__(self, config):
1257
  super().__init__()
1258
 
 
1284
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1285
 
1286
 
1287
+ class Phi4MMAttention(nn.Module):
1288
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1289
 
1290
+ def __init__(self, config: Phi4MMConfig, layer_idx: Optional[int] = None):
1291
  super().__init__()
1292
  self.config = config
1293
  self.layer_idx = layer_idx
 
1324
 
1325
  def _init_rope(self):
1326
  if self.rope_scaling is None:
1327
+ self.rotary_emb = Phi4MMRotaryEmbedding(
1328
  self.rotary_ndims,
1329
  max_position_embeddings=self.max_position_embeddings,
1330
  base=self.rope_theta,
 
1332
  else:
1333
  scaling_type = self.config.rope_scaling["type"]
1334
  if scaling_type == "longrope":
1335
+ self.rotary_emb = Phi4MMLongRoPEScaledRotaryEmbedding(self.rotary_ndims, self.config)
1336
  else:
1337
  raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
1338
 
 
1410
  return attn_output, attn_weights, past_key_value
1411
 
1412
 
1413
+ class Phi4MMFlashAttention2(Phi4MMAttention):
1414
  """
1415
+ Phi-4-MM flash attention module. This module inherits from `Phi4MMAttention` as the weights of the module stay
1416
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
1417
  flash attention and deal with padding tokens in case the input contains any of them.
1418
  """
 
1436
  use_cache: bool = False,
1437
  cache_position: Optional[torch.LongTensor] = None,
1438
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1439
+ # Phi4MMFlashAttention2 attention does not support output_attentions
1440
 
1441
  output_attentions = False
1442
 
 
1538
 
1539
  # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi
1540
  # TODO @Arthur no longer copied from LLama after static cache
1541
+ class Phi4MMSdpaAttention(Phi4MMAttention):
1542
  """
1543
+ Phi4MM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
1544
+ `Phi4MMAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
1545
  SDPA API.
1546
  """
1547
 
1548
+ # Adapted from Phi4MMAttention.forward
1549
  def forward(
1550
  self,
1551
  hidden_states: torch.Tensor,
 
1559
  if output_attentions:
1560
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1561
  logger.warning_once(
1562
+ "Phi4MMModel is using Phi4MMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1563
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1564
  )
1565
  return super().forward(
 
1630
  return attn_output, None, past_key_value
1631
 
1632
 
1633
+ PHI4MM_ATTENTION_CLASSES = {
1634
+ "eager": Phi4MMAttention,
1635
+ "flash_attention_2": Phi4MMFlashAttention2,
1636
+ "sdpa": Phi4MMSdpaAttention,
1637
  }
1638
 
1639
 
1640
+ class Phi4MMDecoderLayer(nn.Module):
1641
+ def __init__(self, config: Phi4MMConfig, layer_idx: int):
1642
  super().__init__()
1643
 
1644
  self.config = config
1645
+ self.self_attn = PHI4MM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
1646
 
1647
+ self.mlp = Phi4MMMLP(config)
1648
+ self.input_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1649
 
1650
  self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
1651
  self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
1652
+ self.post_attention_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1653
 
1654
  def forward(
1655
  self,
 
1718
  return outputs
1719
 
1720
 
1721
+ PHI4MM_START_DOCSTRING = r"""
1722
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1723
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1724
  etc.)
 
1728
  and behavior.
1729
 
1730
  Parameters:
1731
+ config ([`Phi4MMConfig`]):
1732
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1733
  load the weights associated with the model, only the configuration. Check out the
1734
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 
1737
 
1738
  @add_start_docstrings(
1739
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1740
+ PHI4MM_START_DOCSTRING,
1741
  )
1742
+ class Phi4MMPreTrainedModel(PreTrainedModel):
1743
+ config_class = Phi4MMConfig
1744
  base_model_prefix = "model"
1745
  supports_gradient_checkpointing = True
1746
+ _no_split_modules = ["Phi4MMDecoderLayer"]
1747
  _skip_keys_device_placement = "past_key_values"
1748
  _supports_flash_attn_2 = True
1749
  _supports_sdpa = True
 
1763
  module.weight.data[module.padding_idx].zero_()
1764
 
1765
 
1766
+ PHI4MM_INPUTS_DOCSTRING = r"""
1767
  Args:
1768
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1769
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
 
1840
 
1841
  @add_start_docstrings(
1842
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1843
+ PHI4MM_START_DOCSTRING,
1844
  )
1845
+ class Phi4MMModel(Phi4MMPreTrainedModel):
1846
  """
1847
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi4MMDecoderLayer`]
1848
 
1849
  Args:
1850
+ config: Phi4MMConfig
1851
  """
1852
 
1853
+ def __init__(self, config: Phi4MMConfig):
1854
  super().__init__(config)
1855
  self.padding_idx = config.pad_token_id
1856
  self.vocab_size = config.vocab_size
1857
 
1858
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1859
  self.embed_dropout = nn.Dropout(config.embd_pdrop)
1860
+ self.combined_embed = Phi4MMEmbedding(self.embed_tokens)
1861
 
1862
  self.embed_tokens_extend = None
1863
  if isinstance(config.embd_layer, dict):
 
1865
  'embedding_cls': config.embd_layer['embedding_cls'],
1866
  **config.embd_layer
1867
  }
1868
+ self.embed_tokens_extend = Phi4MMImageAudioEmbedding(config, **embedding_config)
1869
 
1870
  self.layers = nn.ModuleList(
1871
+ [Phi4MMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1872
  )
1873
  self._attn_implementation = config._attn_implementation
1874
+ self.norm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1875
 
1876
  self.gradient_checkpointing = False
1877
  # Initialize weights and apply final processing
 
1883
  def set_input_embeddings(self, value):
1884
  self.embed_tokens = value
1885
 
1886
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
1887
  def forward(
1888
  self,
1889
  input_ids: torch.LongTensor = None,
 
2109
  device: torch.device,
2110
  cache_position: torch.Tensor,
2111
  batch_size: int,
2112
+ config: Phi4MMConfig,
2113
  past_key_values: Cache,
2114
  ):
2115
  """
 
2131
  Indices depicting the position of the input sequence tokens in the sequence.
2132
  batch_size (`torch.Tensor`):
2133
  Batch size.
2134
+ config (`Phi4MMConfig`):
2135
  The model's configuration class
2136
  past_key_values (`Cache`):
2137
  The cache class that is being used currently to generate
 
2168
  return causal_mask
2169
 
2170
 
2171
+ class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
2172
  _tied_weights_keys = ["lm_head.weight"]
2173
 
2174
  # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
2175
  def __init__(self, config):
2176
  super().__init__(config)
2177
+ self.model = Phi4MMModel(config)
2178
  self.vocab_size = config.vocab_size
2179
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2180
 
 
2260
  return self.model
2261
 
2262
  # Ignore copy
2263
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2264
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2265
  def forward(
2266
  self,
 
2301
  Example:
2302
 
2303
  ```python
2304
+ >>> from transformers import AutoTokenizer, Phi4MMForCausalLM
2305
 
2306
+ >>> model = Phi4MMForCausalLM.from_pretrained("TBA")
2307
  >>> tokenizer = AutoTokenizer.from_pretrained("TBA")
2308
 
2309
  >>> prompt = "This is an example script ."
 
2443
 
2444
  @add_start_docstrings(
2445
  """
2446
+ The [`Phi4MMModel`] with a sequence classification head on top (linear layer).
2447
 
2448
+ [`Phi4MMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
2449
  (e.g. GPT-2) do.
2450
 
2451
  Since it does classification on the last token, it requires to know the position of the last token. If a
 
2454
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
2455
  each row of the batch).
2456
  """,
2457
+ PHI4MM_START_DOCSTRING,
2458
  )
2459
  # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi, LLAMA->PHI, self.transformer->self.model, transformer_outputs->model_outputs
2460
+ class Phi4MMForSequenceClassification(Phi4MMPreTrainedModel):
2461
  def __init__(self, config):
2462
  super().__init__(config)
2463
  self.num_labels = config.num_labels
2464
+ self.model = Phi4MMModel(config)
2465
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
2466
 
2467
  # Initialize weights and apply final processing
 
2473
  def set_input_embeddings(self, value):
2474
  self.model.embed_tokens = value
2475
 
2476
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2477
  def forward(
2478
  self,
2479
  input_ids: torch.LongTensor = None,
 
2548
 
2549
  @add_start_docstrings(
2550
  """
2551
+ [`Phi4MMModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
2552
  Named-Entity-Recognition (NER) tasks.
2553
  """,
2554
+ PHI4MM_START_DOCSTRING,
2555
  )
2556
  # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi,MPT->PHI,self.transformer->self.model,transformer_outputs->model_outputs
2557
+ class Phi4MMForTokenClassification(Phi4MMPreTrainedModel):
2558
+ def __init__(self, config: Phi4MMConfig):
2559
  super().__init__(config)
2560
  self.num_labels = config.num_labels
2561
 
2562
+ self.model = Phi4MMModel(config)
2563
  if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
2564
  classifier_dropout = config.classifier_dropout
2565
  elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
 
2572
  # Initialize weights and apply final processing
2573
  self.post_init()
2574
 
2575
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2576
  @add_code_sample_docstrings(
2577
  checkpoint=_CHECKPOINT_FOR_DOC,
2578
  output_type=TokenClassifierOutput,
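A usage sketch (not part of this commit) of how the renamed classes in modeling_phi4mm.py are typically reached through the Hub's remote-code path. It assumes the repository's config.json `auto_map` now points at `modeling_phi4mm.Phi4MMForCausalLM`; the repo id below is a placeholder.

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "microsoft/Phi-4-multimodal-instruct"  # placeholder repo id

# trust_remote_code=True makes transformers import the hub-hosted modeling_phi4mm.py
# instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype="auto",
)

print(type(config).__name__)  # expected: "Phi4MMConfig"
print(type(model).__name__)   # expected: "Phi4MMForCausalLM"
```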
onnx/{processing_phio.py → processing_phi4mm.py} RENAMED
@@ -13,7 +13,7 @@
13
  # limitations under the License.
14
 
15
  """
16
- Processor class for PhiO
17
  """
18
  import re
19
  from typing import List, Optional, Tuple, Union
@@ -57,9 +57,9 @@ class InputMode(Enum):
57
  VISION_SPEECH = 3
58
 
59
 
60
- class PhiOImageProcessor(BaseImageProcessor):
61
  r"""
62
- Constructs a PhiO image processor.
63
  """
64
  model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
65
 
@@ -317,7 +317,7 @@ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
317
  return matrix
318
 
319
 
320
- class PhiOAudioFeatureExtractor(SequenceFeatureExtractor):
321
  model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
322
 
323
  def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
@@ -489,15 +489,15 @@ class PhiOAudioFeatureExtractor(SequenceFeatureExtractor):
489
  return result
490
 
491
 
492
- class PhiOProcessor(ProcessorMixin):
493
  r"""
494
- Constructs a PhiO processor which raps an image processor, a audio processor, and a GPT tokenizer into a single processor.
495
 
496
- [`PhiOProcessor`] offers all the functionalities of [`PhiOImageProcessor`] and [`GPT2Tokenizer`]. See the
497
- [`~PhiOProcessor.__call__`] and [`~PhiOProcessor.decode`] for more information.
498
 
499
  Args:
500
- image_processor ([`PhiOImageProcessor`], *optional*):
501
  The image processor is a required input.
502
  tokenizer ([`GPT2Tokenizer`], *optional*):
503
  The tokenizer is a required input.
@@ -505,8 +505,8 @@ class PhiOProcessor(ProcessorMixin):
505
 
506
  attributes = ["image_processor", "audio_processor", "tokenizer"]
507
  tokenizer_class = "GPT2TokenizerFast"
508
- image_processor_class = "AutoImageProcessor" # PhiOImageProcessor will be registered later
509
- audio_processor_class = "AutoFeatureExtractor" # PhiOAudioFeatureExtractor will be registered later
510
 
511
  def __init__(self, image_processor, audio_processor, tokenizer):
512
  self.image_processor = image_processor
@@ -527,7 +527,7 @@ class PhiOProcessor(ProcessorMixin):
527
  Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
528
  and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
529
  the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
530
- PhiOImageProcessor's [`~PhiOImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
531
  of the above two methods for more information.
532
 
533
  Args:
@@ -728,5 +728,5 @@ class PhiOProcessor(ProcessorMixin):
728
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
729
 
730
 
731
- AutoImageProcessor.register("PhiOImageProcessor", PhiOImageProcessor)
732
- AutoFeatureExtractor.register("PhiOAudioFeatureExtractor", PhiOAudioFeatureExtractor)
 
13
  # limitations under the License.
14
 
15
  """
16
+ Processor class for Phi-4-MM
17
  """
18
  import re
19
  from typing import List, Optional, Tuple, Union
 
57
  VISION_SPEECH = 3
58
 
59
 
60
+ class Phi4MMImageProcessor(BaseImageProcessor):
61
  r"""
62
+ Constructs a Phi4MM image processor.
63
  """
64
  model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
65
 
 
317
  return matrix
318
 
319
 
320
+ class Phi4MMAudioFeatureExtractor(SequenceFeatureExtractor):
321
  model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
322
 
323
  def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
 
489
  return result
490
 
491
 
492
+ class Phi4MMProcessor(ProcessorMixin):
493
  r"""
494
+ Constructs a Phi4MM processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
495
 
496
+ [`Phi4MMProcessor`] offers all the functionalities of [`Phi4MMImageProcessor`] and [`GPT2Tokenizer`]. See the
497
+ [`~Phi4MMProcessor.__call__`] and [`~Phi4MMProcessor.decode`] for more information.
498
 
499
  Args:
500
+ image_processor ([`Phi4MMImageProcessor`], *optional*):
501
  The image processor is a required input.
502
  tokenizer ([`GPT2Tokenizer`], *optional*):
503
  The tokenizer is a required input.
 
505
 
506
  attributes = ["image_processor", "audio_processor", "tokenizer"]
507
  tokenizer_class = "GPT2TokenizerFast"
508
+ image_processor_class = "AutoImageProcessor" # Phi4MMImageProcessor will be registered later
509
+ audio_processor_class = "AutoFeatureExtractor" # Phi4MMAudioFeatureExtractor will be registered later
510
 
511
  def __init__(self, image_processor, audio_processor, tokenizer):
512
  self.image_processor = image_processor
 
527
  Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
528
  and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
529
  the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
530
+ Phi4MMImageProcessor's [`~Phi4MMImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
531
  of the above two methods for more information.
532
 
533
  Args:
 
728
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
729
 
730
 
731
+ AutoImageProcessor.register("Phi4MMImageProcessor", Phi4MMImageProcessor)
732
+ AutoFeatureExtractor.register("Phi4MMAudioFeatureExtractor", Phi4MMAudioFeatureExtractor)
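
A companion sketch (again an assumption, not part of the commit) showing how the processor classes registered above are normally obtained once the rename is in place. The repo id is a placeholder, and the exact call signature is whatever processing_phi4mm.py defines.

```python
from transformers import AutoProcessor

# trust_remote_code=True pulls processing_phi4mm.py from the hub, which registers
# Phi4MMImageProcessor and Phi4MMAudioFeatureExtractor and returns a Phi4MMProcessor.
processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",  # placeholder repo id
    trust_remote_code=True,
)

print(type(processor).__name__)                     # expected: "Phi4MMProcessor"
print(processor.image_processor.model_input_names)  # ["input_image_embeds", "image_sizes", "image_attention_mask"]
print(processor.audio_processor.model_input_names)  # ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
```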