Commit 7a8fd12 by kvaishnavi
Parent(s): 3d69e6d

Fix naming styles

onnx/config.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16fb355ba07bea3ffdf794f297f2005aee4f4ee6aba9742e264ad4471535e966
3
- size 4585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a51e83822b43cce5531974027de76f26710fee3095c065213607bb3557276c9
3
+ size 4629
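The onnx/config.json change above is a Git LFS pointer update: the `oid` is the SHA-256 of the file contents and `size` is its byte count, so only the pointer shows up in the diff. A minimal sketch for checking a locally materialized copy against the new pointer (assumes the repo was cloned with git-lfs so the real JSON, not the pointer text, is on disk):

```python
import hashlib

# Hash the materialized onnx/config.json and compare it against the new LFS pointer.
with open("onnx/config.json", "rb") as f:
    data = f.read()

print(len(data))                         # expect 4629, the pointer's "size"
print(hashlib.sha256(data).hexdigest())  # expect the "oid" sha256 value from the new pointer above
```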
onnx/{modeling_phio.py → modeling_phi4mm.py} RENAMED
@@ -13,7 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- """ PyTorch Phi-O model."""
17
  import os
18
  import math
19
  import warnings
@@ -48,8 +48,8 @@ from transformers.utils import (
48
  )
49
  from transformers import PretrainedConfig
50
 
51
- from .configuration_phio import PhiOConfig
52
- from .processing_phio import InputMode
53
  from .vision_siglip_navit import get_siglip_vision_model
54
  from .speech_conformer_encoder import ConformerEncoder
55
 
@@ -57,7 +57,7 @@ from .speech_conformer_encoder import ConformerEncoder
57
  logger = logging.get_logger(__name__)
58
 
59
  _CHECKPOINT_FOR_DOC = "TBA"
60
- _CONFIG_FOR_DOC = "PhiOConfig"
61
 
62
  # Special token ids
63
  _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
@@ -194,8 +194,8 @@ def select_logic(hidden_states: torch.FloatTensor, features: torch.FloatTensor,
194
  return hidden_states
195
 
196
 
197
- class PhiOEmbedding(nn.Module):
198
- """Phi-O embedding for text-only, vision + text, speech + text, and vision + speech + text"""
199
  def __init__(self, wte):
200
  super().__init__()
201
  self.wte = wte
@@ -234,7 +234,7 @@ class PhiOEmbedding(nn.Module):
234
  return hidden_states
235
 
236
 
237
- class PhiOImageEmbedding(nn.Module):
238
  """Image embedding."""
239
 
240
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -666,7 +666,7 @@ class PhiOImageEmbedding(nn.Module):
666
  return image_features_proj.squeeze()
667
 
668
 
669
- class PhiOAudioEmbedding(nn.Module):
670
  """Audio embedding."""
671
 
672
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -746,7 +746,7 @@ class PhiOAudioEmbedding(nn.Module):
746
  self.audio_embed_sizes = None
747
 
748
  def post_init(self, audio_config):
749
- # execute after the from_pretrained() initialization of the phio model
750
  if audio_config.get('name', None) == "cascades":
751
  init_model_config = audio_config.get("init_model", {})
752
  self.encoder.post_init(init_model_config)
@@ -891,7 +891,7 @@ class PhiOAudioEmbedding(nn.Module):
891
  return audio_features_proj
892
 
893
 
894
- class PhiOImageAudioEmbedding(nn.Module):
895
  """Image-audio embedding."""
896
 
897
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -904,9 +904,9 @@ class PhiOImageAudioEmbedding(nn.Module):
904
  assert self.image_input_id != self.audio_input_id, 'image_input_id and audio_input_id should be different'
905
 
906
  self.image_embd_layer_kwargs = kwargs['image_embd_layer']
907
- self.image_embed = PhiOImageEmbedding(config, **self.image_embd_layer_kwargs)
908
  self.audio_embd_layer_kwargs = kwargs['audio_embd_layer']
909
- self.audio_embed = PhiOAudioEmbedding(config, **self.audio_embd_layer_kwargs)
910
 
911
  self.input_image_embeds = None
912
  self.image_sizes = None
@@ -1035,10 +1035,10 @@ class PhiOImageAudioEmbedding(nn.Module):
1035
 
1036
 
1037
  # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
1038
- class PhiORMSNorm(nn.Module):
1039
  def __init__(self, hidden_size, eps=1e-6):
1040
  """
1041
- PhiORMSNorm is equivalent to T5LayerNorm
1042
  """
1043
  super().__init__()
1044
  self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -1056,7 +1056,7 @@ class PhiORMSNorm(nn.Module):
1056
 
1057
 
1058
  # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
1059
- class PhiORotaryEmbedding(nn.Module):
1060
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
1061
  super().__init__()
1062
 
@@ -1085,11 +1085,11 @@ class PhiORotaryEmbedding(nn.Module):
1085
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1086
 
1087
 
1088
- class PhiOSuScaledRotaryEmbedding(PhiORotaryEmbedding):
1089
  def __init__(self, dim, config, device=None):
1090
  warnings.warn(
1091
- "The class PhiOSuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
1092
- " use PhiOLongRoPEScaledRotaryEmbedding instead.",
1093
  FutureWarning,
1094
  )
1095
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1126,10 +1126,10 @@ class PhiOSuScaledRotaryEmbedding(PhiORotaryEmbedding):
1126
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1127
 
1128
 
1129
- class PhiOYarnScaledRotaryEmbedding(PhiORotaryEmbedding):
1130
  def __init__(self, dim, config, device=None):
1131
  warnings.warn(
1132
- "The class PhiOYarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
1133
  FutureWarning,
1134
  )
1135
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1171,7 +1171,7 @@ class PhiOYarnScaledRotaryEmbedding(PhiORotaryEmbedding):
1171
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1172
 
1173
 
1174
- class PhiOLongRoPEScaledRotaryEmbedding(PhiORotaryEmbedding):
1175
  def __init__(self, dim, config, device=None):
1176
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
1177
 
@@ -1252,7 +1252,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
1252
  return q_embed, k_embed
1253
 
1254
 
1255
- class PhiOMLP(nn.Module):
1256
  def __init__(self, config):
1257
  super().__init__()
1258
 
@@ -1284,10 +1284,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
1284
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1285
 
1286
 
1287
- class PhiOAttention(nn.Module):
1288
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1289
 
1290
- def __init__(self, config: PhiOConfig, layer_idx: Optional[int] = None):
1291
  super().__init__()
1292
  self.config = config
1293
  self.layer_idx = layer_idx
@@ -1324,7 +1324,7 @@ class PhiOAttention(nn.Module):
1324
 
1325
  def _init_rope(self):
1326
  if self.rope_scaling is None:
1327
- self.rotary_emb = PhiORotaryEmbedding(
1328
  self.rotary_ndims,
1329
  max_position_embeddings=self.max_position_embeddings,
1330
  base=self.rope_theta,
@@ -1332,7 +1332,7 @@ class PhiOAttention(nn.Module):
1332
  else:
1333
  scaling_type = self.config.rope_scaling["type"]
1334
  if scaling_type == "longrope":
1335
- self.rotary_emb = PhiOLongRoPEScaledRotaryEmbedding(self.rotary_ndims, self.config)
1336
  else:
1337
  raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
1338
 
@@ -1410,9 +1410,9 @@ class PhiOAttention(nn.Module):
1410
  return attn_output, attn_weights, past_key_value
1411
 
1412
 
1413
- class PhiOFlashAttention2(PhiOAttention):
1414
  """
1415
- Phi-O flash attention module. This module inherits from `PhiOAttention` as the weights of the module stays
1416
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
1417
  flash attention and deal with padding tokens in case the input contains any of them.
1418
  """
@@ -1436,7 +1436,7 @@ class PhiOFlashAttention2(PhiOAttention):
1436
  use_cache: bool = False,
1437
  cache_position: Optional[torch.LongTensor] = None,
1438
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1439
- # PhiOFlashAttention2 attention does not support output_attentions
1440
 
1441
  output_attentions = False
1442
 
@@ -1538,14 +1538,14 @@ class PhiOFlashAttention2(PhiOAttention):
1538
 
1539
  # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi
1540
  # TODO @Arthur no longer copied from LLama after static cache
1541
- class PhiOSdpaAttention(PhiOAttention):
1542
  """
1543
- PhiO attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
1544
- `PhiOAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
1545
  SDPA API.
1546
  """
1547
 
1548
- # Adapted from PhiOAttention.forward
1549
  def forward(
1550
  self,
1551
  hidden_states: torch.Tensor,
@@ -1559,7 +1559,7 @@ class PhiOSdpaAttention(PhiOAttention):
1559
  if output_attentions:
1560
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1561
  logger.warning_once(
1562
- "PhiOModel is using PhiOSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1563
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1564
  )
1565
  return super().forward(
@@ -1630,26 +1630,26 @@ class PhiOSdpaAttention(PhiOAttention):
1630
  return attn_output, None, past_key_value
1631
 
1632
 
1633
- PHIO_ATTENTION_CLASSES = {
1634
- "eager": PhiOAttention,
1635
- "flash_attention_2": PhiOFlashAttention2,
1636
- "sdpa": PhiOSdpaAttention,
1637
  }
1638
 
1639
 
1640
- class PhiODecoderLayer(nn.Module):
1641
- def __init__(self, config: PhiOConfig, layer_idx: int):
1642
  super().__init__()
1643
 
1644
  self.config = config
1645
- self.self_attn = PHIO_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
1646
 
1647
- self.mlp = PhiOMLP(config)
1648
- self.input_layernorm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1649
 
1650
  self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
1651
  self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
1652
- self.post_attention_layernorm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1653
 
1654
  def forward(
1655
  self,
@@ -1718,7 +1718,7 @@ class PhiODecoderLayer(nn.Module):
1718
  return outputs
1719
 
1720
 
1721
- PHIO_START_DOCSTRING = r"""
1722
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1723
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1724
  etc.)
@@ -1728,7 +1728,7 @@ PHIO_START_DOCSTRING = r"""
1728
  and behavior.
1729
 
1730
  Parameters:
1731
- config ([`PhiOConfig`]):
1732
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1733
  load the weights associated with the model, only the configuration. Check out the
1734
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1737,13 +1737,13 @@ PHIO_START_DOCSTRING = r"""
1737
 
1738
  @add_start_docstrings(
1739
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1740
- PHIO_START_DOCSTRING,
1741
  )
1742
- class PhiOPreTrainedModel(PreTrainedModel):
1743
- config_class = PhiOConfig
1744
  base_model_prefix = "model"
1745
  supports_gradient_checkpointing = True
1746
- _no_split_modules = ["PhiODecoderLayer"]
1747
  _skip_keys_device_placement = "past_key_values"
1748
  _supports_flash_attn_2 = True
1749
  _supports_sdpa = True
@@ -1763,7 +1763,7 @@ class PhiOPreTrainedModel(PreTrainedModel):
1763
  module.weight.data[module.padding_idx].zero_()
1764
 
1765
 
1766
- PHIO_INPUTS_DOCSTRING = r"""
1767
  Args:
1768
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1769
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1840,24 +1840,24 @@ PHIO_INPUTS_DOCSTRING = r"""
1840
 
1841
  @add_start_docstrings(
1842
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1843
- PHIO_START_DOCSTRING,
1844
  )
1845
- class PhiOModel(PhiOPreTrainedModel):
1846
  """
1847
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiODecoderLayer`]
1848
 
1849
  Args:
1850
- config: PhiOConfig
1851
  """
1852
 
1853
- def __init__(self, config: PhiOConfig):
1854
  super().__init__(config)
1855
  self.padding_idx = config.pad_token_id
1856
  self.vocab_size = config.vocab_size
1857
 
1858
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1859
  self.embed_dropout = nn.Dropout(config.embd_pdrop)
1860
- self.combined_embed = PhiOEmbedding(self.embed_tokens)
1861
 
1862
  self.embed_tokens_extend = None
1863
  if isinstance(config.embd_layer, dict):
@@ -1865,13 +1865,13 @@ class PhiOModel(PhiOPreTrainedModel):
1865
  'embedding_cls': config.embd_layer['embedding_cls'],
1866
  **config.embd_layer
1867
  }
1868
- self.embed_tokens_extend = PhiOImageAudioEmbedding(config, **embedding_config)
1869
 
1870
  self.layers = nn.ModuleList(
1871
- [PhiODecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1872
  )
1873
  self._attn_implementation = config._attn_implementation
1874
- self.norm = PhiORMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1875
 
1876
  self.gradient_checkpointing = False
1877
  # Initialize weights and apply final processing
@@ -1883,7 +1883,7 @@ class PhiOModel(PhiOPreTrainedModel):
1883
  def set_input_embeddings(self, value):
1884
  self.embed_tokens = value
1885
 
1886
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
1887
  def forward(
1888
  self,
1889
  input_ids: torch.LongTensor = None,
@@ -2109,7 +2109,7 @@ class PhiOModel(PhiOPreTrainedModel):
2109
  device: torch.device,
2110
  cache_position: torch.Tensor,
2111
  batch_size: int,
2112
- config: PhiOConfig,
2113
  past_key_values: Cache,
2114
  ):
2115
  """
@@ -2131,7 +2131,7 @@ class PhiOModel(PhiOPreTrainedModel):
2131
  Indices depicting the position of the input sequence tokens in the sequence.
2132
  batch_size (`torch.Tensor`):
2133
  Batch size.
2134
- config (`PhiOConfig`):
2135
  The model's configuration class
2136
  past_key_values (`Cache`):
2137
  The cache class that is being used currently to generate
@@ -2168,13 +2168,13 @@ class PhiOModel(PhiOPreTrainedModel):
2168
  return causal_mask
2169
 
2170
 
2171
- class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2172
  _tied_weights_keys = ["lm_head.weight"]
2173
 
2174
  # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
2175
  def __init__(self, config):
2176
  super().__init__(config)
2177
- self.model = PhiOModel(config)
2178
  self.vocab_size = config.vocab_size
2179
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2180
 
@@ -2260,7 +2260,7 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2260
  return self.model
2261
 
2262
  # Ignore copy
2263
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2264
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2265
  def forward(
2266
  self,
@@ -2301,9 +2301,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2301
  Example:
2302
 
2303
  ```python
2304
- >>> from transformers import AutoTokenizer, PhiOForCausalLM
2305
 
2306
- >>> model = PhiOForCausalLM.from_pretrained("TBA")
2307
  >>> tokenizer = AutoTokenizer.from_pretrained("TBA")
2308
 
2309
  >>> prompt = "This is an example script ."
@@ -2443,9 +2443,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2443
 
2444
  @add_start_docstrings(
2445
  """
2446
- The [`PhiOModel`] with a sequence classification head on top (linear layer).
2447
 
2448
- [`PhiOForSequenceClassification`] uses the last token in order to do the classification, as other causal models
2449
  (e.g. GPT-2) do.
2450
 
2451
  Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -2454,14 +2454,14 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
2454
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
2455
  each row of the batch).
2456
  """,
2457
- PHIO_START_DOCSTRING,
2458
  )
2459
  # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi, LLAMA->PHI, self.transformer->self.model, transformer_outputs->model_outputs
2460
- class PhiOForSequenceClassification(PhiOPreTrainedModel):
2461
  def __init__(self, config):
2462
  super().__init__(config)
2463
  self.num_labels = config.num_labels
2464
- self.model = PhiOModel(config)
2465
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
2466
 
2467
  # Initialize weights and apply final processing
@@ -2473,7 +2473,7 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):
2473
  def set_input_embeddings(self, value):
2474
  self.model.embed_tokens = value
2475
 
2476
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2477
  def forward(
2478
  self,
2479
  input_ids: torch.LongTensor = None,
@@ -2548,18 +2548,18 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):
2548
 
2549
  @add_start_docstrings(
2550
  """
2551
- [`PhiOModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
2552
  Named-Entity-Recognition (NER) tasks.
2553
  """,
2554
- PHIO_START_DOCSTRING,
2555
  )
2556
  # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi,MPT->PHI,self.transformer->self.model,transformer_outputs->model_outputs
2557
- class PhiOForTokenClassification(PhiOPreTrainedModel):
2558
- def __init__(self, config: PhiOConfig):
2559
  super().__init__(config)
2560
  self.num_labels = config.num_labels
2561
 
2562
- self.model = PhiOModel(config)
2563
  if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
2564
  classifier_dropout = config.classifier_dropout
2565
  elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
@@ -2572,7 +2572,7 @@ class PhiOForTokenClassification(PhiOPreTrainedModel):
2572
  # Initialize weights and apply final processing
2573
  self.post_init()
2574
 
2575
- @add_start_docstrings_to_model_forward(PHIO_INPUTS_DOCSTRING)
2576
  @add_code_sample_docstrings(
2577
  checkpoint=_CHECKPOINT_FOR_DOC,
2578
  output_type=TokenClassifierOutput,
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
+ """ PyTorch Phi-4-MM model."""
17
  import os
18
  import math
19
  import warnings
 
48
  )
49
  from transformers import PretrainedConfig
50
 
51
+ from .configuration_phi4mm import Phi4MMConfig
52
+ from .processing_phi4mm import InputMode
53
  from .vision_siglip_navit import get_siglip_vision_model
54
  from .speech_conformer_encoder import ConformerEncoder
55
 
 
57
  logger = logging.get_logger(__name__)
58
 
59
  _CHECKPOINT_FOR_DOC = "TBA"
60
+ _CONFIG_FOR_DOC = "Phi4MMConfig"
61
 
62
  # Special token ids
63
  _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
 
194
  return hidden_states
195
 
196
 
197
+ class Phi4MMEmbedding(nn.Module):
198
+ """Phi-4-MM embedding for text-only, vision + text, speech + text, and vision + speech + text"""
199
  def __init__(self, wte):
200
  super().__init__()
201
  self.wte = wte
 
234
  return hidden_states
235
 
236
 
237
+ class Phi4MMImageEmbedding(nn.Module):
238
  """Image embedding."""
239
 
240
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
666
  return image_features_proj.squeeze()
667
 
668
 
669
+ class Phi4MMAudioEmbedding(nn.Module):
670
  """Audio embedding."""
671
 
672
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
746
  self.audio_embed_sizes = None
747
 
748
  def post_init(self, audio_config):
749
+ # execute after the from_pretrained() initialization of the Phi4MM model
750
  if audio_config.get('name', None) == "cascades":
751
  init_model_config = audio_config.get("init_model", {})
752
  self.encoder.post_init(init_model_config)
 
891
  return audio_features_proj
892
 
893
 
894
+ class Phi4MMImageAudioEmbedding(nn.Module):
895
  """Image-audio embedding."""
896
 
897
  def __init__(self, config: PretrainedConfig, **kwargs) -> None:
 
904
  assert self.image_input_id != self.audio_input_id, 'image_input_id and audio_input_id should be different'
905
 
906
  self.image_embd_layer_kwargs = kwargs['image_embd_layer']
907
+ self.image_embed = Phi4MMImageEmbedding(config, **self.image_embd_layer_kwargs)
908
  self.audio_embd_layer_kwargs = kwargs['audio_embd_layer']
909
+ self.audio_embed = Phi4MMAudioEmbedding(config, **self.audio_embd_layer_kwargs)
910
 
911
  self.input_image_embeds = None
912
  self.image_sizes = None
 
1035
 
1036
 
1037
  # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
1038
+ class Phi4MMRMSNorm(nn.Module):
1039
  def __init__(self, hidden_size, eps=1e-6):
1040
  """
1041
+ Phi4MMRMSNorm is equivalent to T5LayerNorm
1042
  """
1043
  super().__init__()
1044
  self.weight = nn.Parameter(torch.ones(hidden_size))
 
1056
 
1057
 
1058
  # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
1059
+ class Phi4MMRotaryEmbedding(nn.Module):
1060
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
1061
  super().__init__()
1062
 
 
1085
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1086
 
1087
 
1088
+ class Phi4MMSuScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1089
  def __init__(self, dim, config, device=None):
1090
  warnings.warn(
1091
+ "The class Phi4MMSuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
1092
+ " use Phi4MMLongRoPEScaledRotaryEmbedding instead.",
1093
  FutureWarning,
1094
  )
1095
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
 
1126
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1127
 
1128
 
1129
+ class Phi4MMYarnScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1130
  def __init__(self, dim, config, device=None):
1131
  warnings.warn(
1132
+ "The class Phi4MMYarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
1133
  FutureWarning,
1134
  )
1135
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
 
1171
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1172
 
1173
 
1174
+ class Phi4MMLongRoPEScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
1175
  def __init__(self, dim, config, device=None):
1176
  super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
1177
 
 
1252
  return q_embed, k_embed
1253
 
1254
 
1255
+ class Phi4MMMLP(nn.Module):
1256
  def __init__(self, config):
1257
  super().__init__()
1258
 
 
1284
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1285
 
1286
 
1287
+ class Phi4MMAttention(nn.Module):
1288
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1289
 
1290
+ def __init__(self, config: Phi4MMConfig, layer_idx: Optional[int] = None):
1291
  super().__init__()
1292
  self.config = config
1293
  self.layer_idx = layer_idx
 
1324
 
1325
  def _init_rope(self):
1326
  if self.rope_scaling is None:
1327
+ self.rotary_emb = Phi4MMRotaryEmbedding(
1328
  self.rotary_ndims,
1329
  max_position_embeddings=self.max_position_embeddings,
1330
  base=self.rope_theta,
 
1332
  else:
1333
  scaling_type = self.config.rope_scaling["type"]
1334
  if scaling_type == "longrope":
1335
+ self.rotary_emb = Phi4MMLongRoPEScaledRotaryEmbedding(self.rotary_ndims, self.config)
1336
  else:
1337
  raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
1338
 
 
1410
  return attn_output, attn_weights, past_key_value
1411
 
1412
 
1413
+ class Phi4MMFlashAttention2(Phi4MMAttention):
1414
  """
1415
+ Phi-4-MM flash attention module. This module inherits from `Phi4MMAttention` as the weights of the module stay
1416
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
1417
  flash attention and deal with padding tokens in case the input contains any of them.
1418
  """
 
1436
  use_cache: bool = False,
1437
  cache_position: Optional[torch.LongTensor] = None,
1438
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1439
+ # Phi4MMFlashAttention2 attention does not support output_attentions
1440
 
1441
  output_attentions = False
1442
 
 
1538
 
1539
  # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi
1540
  # TODO @Arthur no longer copied from LLama after static cache
1541
+ class Phi4MMSdpaAttention(Phi4MMAttention):
1542
  """
1543
+ Phi4MM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
1544
+ `Phi4MMAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
1545
  SDPA API.
1546
  """
1547
 
1548
+ # Adapted from Phi4MMAttention.forward
1549
  def forward(
1550
  self,
1551
  hidden_states: torch.Tensor,
 
1559
  if output_attentions:
1560
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1561
  logger.warning_once(
1562
+ "Phi4MMModel is using Phi4MMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1563
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1564
  )
1565
  return super().forward(
 
1630
  return attn_output, None, past_key_value
1631
 
1632
 
1633
+ PHI4MM_ATTENTION_CLASSES = {
1634
+ "eager": Phi4MMAttention,
1635
+ "flash_attention_2": Phi4MMFlashAttention2,
1636
+ "sdpa": Phi4MMSdpaAttention,
1637
  }
1638
 
1639
 
1640
+ class Phi4MMDecoderLayer(nn.Module):
1641
+ def __init__(self, config: Phi4MMConfig, layer_idx: int):
1642
  super().__init__()
1643
 
1644
  self.config = config
1645
+ self.self_attn = PHI4MM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
1646
 
1647
+ self.mlp = Phi4MMMLP(config)
1648
+ self.input_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1649
 
1650
  self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
1651
  self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
1652
+ self.post_attention_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1653
 
1654
  def forward(
1655
  self,
 
1718
  return outputs
1719
 
1720
 
1721
+ PHI4MM_START_DOCSTRING = r"""
1722
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1723
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1724
  etc.)
 
1728
  and behavior.
1729
 
1730
  Parameters:
1731
+ config ([`Phi4MMConfig`]):
1732
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1733
  load the weights associated with the model, only the configuration. Check out the
1734
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 
1737
 
1738
  @add_start_docstrings(
1739
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1740
+ PHI4MM_START_DOCSTRING,
1741
  )
1742
+ class Phi4MMPreTrainedModel(PreTrainedModel):
1743
+ config_class = Phi4MMConfig
1744
  base_model_prefix = "model"
1745
  supports_gradient_checkpointing = True
1746
+ _no_split_modules = ["Phi4MMDecoderLayer"]
1747
  _skip_keys_device_placement = "past_key_values"
1748
  _supports_flash_attn_2 = True
1749
  _supports_sdpa = True
 
1763
  module.weight.data[module.padding_idx].zero_()
1764
 
1765
 
1766
+ PHI4MM_INPUTS_DOCSTRING = r"""
1767
  Args:
1768
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1769
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
 
1840
 
1841
  @add_start_docstrings(
1842
  "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
1843
+ PHI4MM_START_DOCSTRING,
1844
  )
1845
+ class Phi4MMModel(Phi4MMPreTrainedModel):
1846
  """
1847
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi4MMDecoderLayer`]
1848
 
1849
  Args:
1850
+ config: Phi4MMConfig
1851
  """
1852
 
1853
+ def __init__(self, config: Phi4MMConfig):
1854
  super().__init__(config)
1855
  self.padding_idx = config.pad_token_id
1856
  self.vocab_size = config.vocab_size
1857
 
1858
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1859
  self.embed_dropout = nn.Dropout(config.embd_pdrop)
1860
+ self.combined_embed = Phi4MMEmbedding(self.embed_tokens)
1861
 
1862
  self.embed_tokens_extend = None
1863
  if isinstance(config.embd_layer, dict):
 
1865
  'embedding_cls': config.embd_layer['embedding_cls'],
1866
  **config.embd_layer
1867
  }
1868
+ self.embed_tokens_extend = Phi4MMImageAudioEmbedding(config, **embedding_config)
1869
 
1870
  self.layers = nn.ModuleList(
1871
+ [Phi4MMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1872
  )
1873
  self._attn_implementation = config._attn_implementation
1874
+ self.norm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1875
 
1876
  self.gradient_checkpointing = False
1877
  # Initialize weights and apply final processing
 
1883
  def set_input_embeddings(self, value):
1884
  self.embed_tokens = value
1885
 
1886
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
1887
  def forward(
1888
  self,
1889
  input_ids: torch.LongTensor = None,
 
2109
  device: torch.device,
2110
  cache_position: torch.Tensor,
2111
  batch_size: int,
2112
+ config: Phi4MMConfig,
2113
  past_key_values: Cache,
2114
  ):
2115
  """
 
2131
  Indices depicting the position of the input sequence tokens in the sequence.
2132
  batch_size (`torch.Tensor`):
2133
  Batch size.
2134
+ config (`Phi4MMConfig`):
2135
  The model's configuration class
2136
  past_key_values (`Cache`):
2137
  The cache class that is being used currently to generate
 
2168
  return causal_mask
2169
 
2170
 
2171
+ class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
2172
  _tied_weights_keys = ["lm_head.weight"]
2173
 
2174
  # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
2175
  def __init__(self, config):
2176
  super().__init__(config)
2177
+ self.model = Phi4MMModel(config)
2178
  self.vocab_size = config.vocab_size
2179
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2180
 
 
2260
  return self.model
2261
 
2262
  # Ignore copy
2263
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2264
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
2265
  def forward(
2266
  self,
 
2301
  Example:
2302
 
2303
  ```python
2304
+ >>> from transformers import AutoTokenizer, Phi4MMForCausalLM
2305
 
2306
+ >>> model = Phi4MMForCausalLM.from_pretrained("TBA")
2307
  >>> tokenizer = AutoTokenizer.from_pretrained("TBA")
2308
 
2309
  >>> prompt = "This is an example script ."
 
2443
 
2444
  @add_start_docstrings(
2445
  """
2446
+ The [`Phi4MMModel`] with a sequence classification head on top (linear layer).
2447
 
2448
+ [`Phi4MMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
2449
  (e.g. GPT-2) do.
2450
 
2451
  Since it does classification on the last token, it requires to know the position of the last token. If a
 
2454
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
2455
  each row of the batch).
2456
  """,
2457
+ PHI4MM_START_DOCSTRING,
2458
  )
2459
  # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi, LLAMA->PHI, self.transformer->self.model, transformer_outputs->model_outputs
2460
+ class Phi4MMForSequenceClassification(Phi4MMPreTrainedModel):
2461
  def __init__(self, config):
2462
  super().__init__(config)
2463
  self.num_labels = config.num_labels
2464
+ self.model = Phi4MMModel(config)
2465
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
2466
 
2467
  # Initialize weights and apply final processing
 
2473
  def set_input_embeddings(self, value):
2474
  self.model.embed_tokens = value
2475
 
2476
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2477
  def forward(
2478
  self,
2479
  input_ids: torch.LongTensor = None,
 
2548
 
2549
  @add_start_docstrings(
2550
  """
2551
+ [`Phi4MMModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
2552
  Named-Entity-Recognition (NER) tasks.
2553
  """,
2554
+ PHI4MM_START_DOCSTRING,
2555
  )
2556
  # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi,MPT->PHI,self.transformer->self.model,transformer_outputs->model_outputs
2557
+ class Phi4MMForTokenClassification(Phi4MMPreTrainedModel):
2558
+ def __init__(self, config: Phi4MMConfig):
2559
  super().__init__(config)
2560
  self.num_labels = config.num_labels
2561
 
2562
+ self.model = Phi4MMModel(config)
2563
  if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
2564
  classifier_dropout = config.classifier_dropout
2565
  elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
 
2572
  # Initialize weights and apply final processing
2573
  self.post_init()
2574
 
2575
+ @add_start_docstrings_to_model_forward(PHI4MM_INPUTS_DOCSTRING)
2576
  @add_code_sample_docstrings(
2577
  checkpoint=_CHECKPOINT_FOR_DOC,
2578
  output_type=TokenClassifierOutput,
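A usage sketch (not part of this commit) of how the renamed classes in modeling_phi4mm.py are typically reached through the Hub's remote-code path. It assumes the repository's config.json `auto_map` now points at `modeling_phi4mm.Phi4MMForCausalLM`; the repo id below is a placeholder.

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "microsoft/Phi-4-multimodal-instruct"  # placeholder repo id

# trust_remote_code=True makes transformers import the hub-hosted modeling_phi4mm.py
# instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype="auto",
)

print(type(config).__name__)  # expected: "Phi4MMConfig"
print(type(model).__name__)   # expected: "Phi4MMForCausalLM"
```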
onnx/{processing_phio.py → processing_phi4mm.py} RENAMED
@@ -13,7 +13,7 @@
13
  # limitations under the License.
14
 
15
  """
16
- Processor class for PhiO
17
  """
18
  import re
19
  from typing import List, Optional, Tuple, Union
@@ -57,9 +57,9 @@ class InputMode(Enum):
57
  VISION_SPEECH = 3
58
 
59
 
60
- class PhiOImageProcessor(BaseImageProcessor):
61
  r"""
62
- Constructs a PhiO image processor.
63
  """
64
  model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
65
 
@@ -317,7 +317,7 @@ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
317
  return matrix
318
 
319
 
320
- class PhiOAudioFeatureExtractor(SequenceFeatureExtractor):
321
  model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
322
 
323
  def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
@@ -489,15 +489,15 @@ class PhiOAudioFeatureExtractor(SequenceFeatureExtractor):
489
  return result
490
 
491
 
492
- class PhiOProcessor(ProcessorMixin):
493
  r"""
494
- Constructs a PhiO processor which raps an image processor, a audio processor, and a GPT tokenizer into a single processor.
495
 
496
- [`PhiOProcessor`] offers all the functionalities of [`PhiOImageProcessor`] and [`GPT2Tokenizer`]. See the
497
- [`~PhiOProcessor.__call__`] and [`~PhiOProcessor.decode`] for more information.
498
 
499
  Args:
500
- image_processor ([`PhiOImageProcessor`], *optional*):
501
  The image processor is a required input.
502
  tokenizer ([`GPT2Tokenizer`], *optional*):
503
  The tokenizer is a required input.
@@ -505,8 +505,8 @@ class PhiOProcessor(ProcessorMixin):
505
 
506
  attributes = ["image_processor", "audio_processor", "tokenizer"]
507
  tokenizer_class = "GPT2TokenizerFast"
508
- image_processor_class = "AutoImageProcessor" # PhiOImageProcessor will be registered later
509
- audio_processor_class = "AutoFeatureExtractor" # PhiOAudioFeatureExtractor will be registered later
510
 
511
  def __init__(self, image_processor, audio_processor, tokenizer):
512
  self.image_processor = image_processor
@@ -527,7 +527,7 @@ class PhiOProcessor(ProcessorMixin):
527
  Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
528
  and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
529
  the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
530
- PhiOImageProcessor's [`~PhiOImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
531
  of the above two methods for more information.
532
 
533
  Args:
@@ -728,5 +728,5 @@ class PhiOProcessor(ProcessorMixin):
728
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
729
 
730
 
731
- AutoImageProcessor.register("PhiOImageProcessor", PhiOImageProcessor)
732
- AutoFeatureExtractor.register("PhiOAudioFeatureExtractor", PhiOAudioFeatureExtractor)
 
13
  # limitations under the License.
14
 
15
  """
16
+ Processor class for Phi-4-MM
17
  """
18
  import re
19
  from typing import List, Optional, Tuple, Union
 
57
  VISION_SPEECH = 3
58
 
59
 
60
+ class Phi4MMImageProcessor(BaseImageProcessor):
61
  r"""
62
+ Constructs a Phi4MM image processor.
63
  """
64
  model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]
65
 
 
317
  return matrix
318
 
319
 
320
+ class Phi4MMAudioFeatureExtractor(SequenceFeatureExtractor):
321
  model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
322
 
323
  def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
 
489
  return result
490
 
491
 
492
+ class Phi4MMProcessor(ProcessorMixin):
493
  r"""
494
+ Constructs a Phi4MM processor which wraps an image processor, an audio processor, and a GPT tokenizer into a single processor.
495
 
496
+ [`Phi4MMProcessor`] offers all the functionalities of [`Phi4MMImageProcessor`] and [`GPT2Tokenizer`]. See the
497
+ [`~Phi4MMProcessor.__call__`] and [`~Phi4MMProcessor.decode`] for more information.
498
 
499
  Args:
500
+ image_processor ([`Phi4MMImageProcessor`], *optional*):
501
  The image processor is a required input.
502
  tokenizer ([`GPT2Tokenizer`], *optional*):
503
  The tokenizer is a required input.
 
505
 
506
  attributes = ["image_processor", "audio_processor", "tokenizer"]
507
  tokenizer_class = "GPT2TokenizerFast"
508
+ image_processor_class = "AutoImageProcessor" # Phi4MMImageProcessor will be registered later
509
+ audio_processor_class = "AutoFeatureExtractor" # Phi4MMAudioFeatureExtractor will be registered later
510
 
511
  def __init__(self, image_processor, audio_processor, tokenizer):
512
  self.image_processor = image_processor
 
527
  Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
528
  and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
529
  the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
530
+ Phi4MMImageProcessor's [`~Phi4MMImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
531
  of the above two methods for more information.
532
 
533
  Args:
 
728
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))
729
 
730
 
731
+ AutoImageProcessor.register("Phi4MMImageProcessor", Phi4MMImageProcessor)
732
+ AutoFeatureExtractor.register("Phi4MMAudioFeatureExtractor", Phi4MMAudioFeatureExtractor)
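
A companion sketch (again an assumption, not part of the commit) showing how the processor classes registered above are normally obtained once the rename is in place. The repo id is a placeholder, and the exact call signature is whatever processing_phi4mm.py defines.

```python
from transformers import AutoProcessor

# trust_remote_code=True pulls processing_phi4mm.py from the hub, which registers
# Phi4MMImageProcessor and Phi4MMAudioFeatureExtractor and returns a Phi4MMProcessor.
processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",  # placeholder repo id
    trust_remote_code=True,
)

print(type(processor).__name__)                     # expected: "Phi4MMProcessor"
print(processor.image_processor.model_input_names)  # ["input_image_embeds", "image_sizes", "image_attention_mask"]
print(processor.audio_processor.model_input_names)  # ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]
```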