Commit 7a8fd12
1 Parent(s): 3d69e6d
Fix naming styles
onnx/config.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6a51e83822b43cce5531974027de76f26710fee3095c065213607bb3557276c9
+size 4629
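onnx/config.json is tracked with Git LFS, so the commit only rewrites the three-line pointer file: the spec version line, the sha256 oid of the new content, and its size in bytes (4629); the file's actual contents live in LFS storage. As an illustrative aside that is not part of the commit, a pointer in this format can be parsed with a few lines of Python:

```python
# Minimal sketch (not from the repository): parse a Git LFS pointer file into its fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:6a51e83822b43cce5531974027de76f26710fee3095c065213607bb3557276c9
size 4629"""

info = parse_lfs_pointer(pointer)
print(info["oid"], info["size"])  # sha256:6a51e8... 4629
```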
onnx/{modeling_phio.py → modeling_phi4mm.py}
RENAMED
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-""" PyTorch Phi-
+""" PyTorch Phi-4-MM model."""
 import os
 import math
 import warnings
@@ -48,8 +48,8 @@ from transformers.utils import (
 )
 from transformers import PretrainedConfig

-from .
-from .
+from .configuration_phi4mm import Phi4MMConfig
+from .processing_phi4mm import InputMode
 from .vision_siglip_navit import get_siglip_vision_model
 from .speech_conformer_encoder import ConformerEncoder

@@ -57,7 +57,7 @@ from .speech_conformer_encoder import ConformerEncoder
 logger = logging.get_logger(__name__)

 _CHECKPOINT_FOR_DOC = "TBA"
-_CONFIG_FOR_DOC = "
+_CONFIG_FOR_DOC = "Phi4MMConfig"

 # Special token ids
 _IMAGE_SPECIAL_TOKEN_ID = 200010 # '<|endoftext10|>', or we can better name it (in `tokenizer_config.json`)
@@ -194,8 +194,8 @@ def select_logic(hidden_states: torch.FloatTensor, features: torch.FloatTensor,
     return hidden_states


-class
-    """Phi-
+class Phi4MMEmbedding(nn.Module):
+    """Phi-4-MM embedding for text-only, vision + text, speech + text, and vision + speech + text"""
     def __init__(self, wte):
         super().__init__()
         self.wte = wte
@@ -234,7 +234,7 @@ class PhiOEmbedding(nn.Module):
         return hidden_states


-class
+class Phi4MMImageEmbedding(nn.Module):
     """Image embedding."""

     def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -666,7 +666,7 @@ class PhiOImageEmbedding(nn.Module):
         return image_features_proj.squeeze()


-class
+class Phi4MMAudioEmbedding(nn.Module):
     """Audio embedding."""

     def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -746,7 +746,7 @@ class PhiOAudioEmbedding(nn.Module):
         self.audio_embed_sizes = None

     def post_init(self, audio_config):
-        # execute after the from_pretrained() initialization of the
+        # execute after the from_pretrained() initialization of the Phi4MM model
        if audio_config.get('name', None) == "cascades":
            init_model_config = audio_config.get("init_model", {})
            self.encoder.post_init(init_model_config)
@@ -891,7 +891,7 @@ class PhiOAudioEmbedding(nn.Module):
         return audio_features_proj


-class
+class Phi4MMImageAudioEmbedding(nn.Module):
     """Image-audio embedding."""

     def __init__(self, config: PretrainedConfig, **kwargs) -> None:
@@ -904,9 +904,9 @@ class PhiOImageAudioEmbedding(nn.Module):
         assert self.image_input_id != self.audio_input_id, 'image_input_id and audio_input_id should be different'

         self.image_embd_layer_kwargs = kwargs['image_embd_layer']
-        self.image_embed =
+        self.image_embed = Phi4MMImageEmbedding(config, **self.image_embd_layer_kwargs)
         self.audio_embd_layer_kwargs = kwargs['audio_embd_layer']
-        self.audio_embed =
+        self.audio_embed = Phi4MMAudioEmbedding(config, **self.audio_embd_layer_kwargs)

         self.input_image_embeds = None
         self.image_sizes = None
@@ -1035,10 +1035,10 @@ class PhiOImageAudioEmbedding(nn.Module):


 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
-class
+class Phi4MMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-
+        Phi4MMRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -1056,7 +1056,7 @@ class PhiORMSNorm(nn.Module):


 # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
-class
+class Phi4MMRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()

@@ -1085,11 +1085,11 @@ class PhiORotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


-class
+class Phi4MMSuScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
     def __init__(self, dim, config, device=None):
         warnings.warn(
-            "The class
-            " use
+            "The class Phi4MMSuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. Please"
+            " use Phi4MMLongRoPEScaledRotaryEmbedding instead.",
             FutureWarning,
         )
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1126,10 +1126,10 @@ class PhiOSuScaledRotaryEmbedding(PhiORotaryEmbedding):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


-class
+class Phi4MMYarnScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
     def __init__(self, dim, config, device=None):
         warnings.warn(
-            "The class
+            "The class Phi4MMYarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers",
             FutureWarning,
         )
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -1171,7 +1171,7 @@ class PhiOYarnScaledRotaryEmbedding(PhiORotaryEmbedding):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


-class
+class Phi4MMLongRoPEScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
     def __init__(self, dim, config, device=None):
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)

@@ -1252,7 +1252,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed


-class
+class Phi4MMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()

@@ -1284,10 +1284,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


-class
+class Phi4MMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

-    def __init__(self, config:
+    def __init__(self, config: Phi4MMConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -1324,7 +1324,7 @@ class PhiOAttention(nn.Module):

     def _init_rope(self):
         if self.rope_scaling is None:
-            self.rotary_emb =
+            self.rotary_emb = Phi4MMRotaryEmbedding(
                 self.rotary_ndims,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -1332,7 +1332,7 @@ class PhiOAttention(nn.Module):
         else:
             scaling_type = self.config.rope_scaling["type"]
             if scaling_type == "longrope":
-                self.rotary_emb =
+                self.rotary_emb = Phi4MMLongRoPEScaledRotaryEmbedding(self.rotary_ndims, self.config)
             else:
                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

@@ -1410,9 +1410,9 @@ class PhiOAttention(nn.Module):
         return attn_output, attn_weights, past_key_value


-class
+class Phi4MMFlashAttention2(Phi4MMAttention):
     """
-    Phi-O flash attention module. This module inherits from `
+    Phi-O flash attention module. This module inherits from `Phi4MMAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -1436,7 +1436,7 @@ class PhiOFlashAttention2(PhiOAttention):
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        #
+        # Phi4MMFlashAttention2 attention does not support output_attentions

         output_attentions = False

@@ -1538,14 +1538,14 @@ class PhiOFlashAttention2(PhiOAttention):

 # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi
 # TODO @Arthur no longer copied from LLama after static cache
-class
+class Phi4MMSdpaAttention(Phi4MMAttention):
     """
-
-    `
+    Phi4MM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `Phi4MMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """

-    # Adapted from
+    # Adapted from Phi4MMAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -1559,7 +1559,7 @@ class PhiOSdpaAttention(PhiOAttention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "
+                "Phi4MMModel is using Phi4MMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -1630,26 +1630,26 @@ class PhiOSdpaAttention(PhiOAttention):
         return attn_output, None, past_key_value


-
-    "eager":
-    "flash_attention_2":
-    "sdpa":
+PHI4MM_ATTENTION_CLASSES = {
+    "eager": Phi4MMAttention,
+    "flash_attention_2": Phi4MMFlashAttention2,
+    "sdpa": Phi4MMSdpaAttention,
 }


-class
-    def __init__(self, config:
+class Phi4MMDecoderLayer(nn.Module):
+    def __init__(self, config: Phi4MMConfig, layer_idx: int):
         super().__init__()

         self.config = config
-        self.self_attn =
+        self.self_attn = PHI4MM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)

-        self.mlp =
-        self.input_layernorm =
+        self.mlp = Phi4MMMLP(config)
+        self.input_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm =
+        self.post_attention_layernorm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
         self,
@@ -1718,7 +1718,7 @@ class PhiODecoderLayer(nn.Module):
         return outputs


-
+PHI4MM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1728,7 +1728,7 @@ PHIO_START_DOCSTRING = r"""
     and behavior.

     Parameters:
-        config ([`
+        config ([`Phi4MMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1737,13 +1737,13 @@ PHIO_START_DOCSTRING = r"""

 @add_start_docstrings(
     "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
-
+    PHI4MM_START_DOCSTRING,
 )
-class
-    config_class =
+class Phi4MMPreTrainedModel(PreTrainedModel):
+    config_class = Phi4MMConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["
+    _no_split_modules = ["Phi4MMDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1763,7 +1763,7 @@ class PhiOPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()


-
+PHI4MM_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1840,24 +1840,24 @@ PHIO_INPUTS_DOCSTRING = r"""

 @add_start_docstrings(
     "The bare Phi-O model outputting raw hidden-states without any specific head on top.",
-
+    PHI4MM_START_DOCSTRING,
 )
-class
+class Phi4MMModel(Phi4MMPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi4MMDecoderLayer`]

     Args:
-        config:
+        config: Phi4MMConfig
     """

-    def __init__(self, config:
+    def __init__(self, config: Phi4MMConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size

         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.embed_dropout = nn.Dropout(config.embd_pdrop)
-        self.combined_embed =
+        self.combined_embed = Phi4MMEmbedding(self.embed_tokens)

         self.embed_tokens_extend = None
         if isinstance(config.embd_layer, dict):
@@ -1865,13 +1865,13 @@ class PhiOModel(PhiOPreTrainedModel):
                 'embedding_cls': config.embd_layer['embedding_cls'],
                 **config.embd_layer
             }
-            self.embed_tokens_extend =
+            self.embed_tokens_extend = Phi4MMImageAudioEmbedding(config, **embedding_config)

         self.layers = nn.ModuleList(
-            [
+            [Phi4MMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm =
+        self.norm = Phi4MMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1883,7 +1883,7 @@ class PhiOModel(PhiOPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value

-    @add_start_docstrings_to_model_forward(
+    @add_start_docstrings_to_model_forward(Phi4MM_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -2109,7 +2109,7 @@ class PhiOModel(PhiOPreTrainedModel):
         device: torch.device,
         cache_position: torch.Tensor,
         batch_size: int,
-        config:
+        config: Phi4MMConfig,
         past_key_values: Cache,
     ):
         """
@@ -2131,7 +2131,7 @@ class PhiOModel(PhiOPreTrainedModel):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
-            config (`
+            config (`Phi4MMConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
@@ -2168,13 +2168,13 @@ class PhiOModel(PhiOPreTrainedModel):
         return causal_mask


-class
+class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
     def __init__(self, config):
         super().__init__(config)
-        self.model =
+        self.model = Phi4MMModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

@@ -2260,7 +2260,7 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
         return self.model

     # Ignore copy
-    @add_start_docstrings_to_model_forward(
+    @add_start_docstrings_to_model_forward(Phi4MM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -2301,9 +2301,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
        Example:

        ```python
-        >>> from transformers import AutoTokenizer,
+        >>> from transformers import AutoTokenizer, Phi4MMForCausalLM

-        >>> model =
+        >>> model = Phi4MMForCausalLM.from_pretrained("TBA")
        >>> tokenizer = AutoTokenizer.from_pretrained("TBA")

        >>> prompt = "This is an example script ."
@@ -2443,9 +2443,9 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):

 @add_start_docstrings(
     """
-    The [`
+    The [`Phi4MMModel`] with a sequence classification head on top (linear layer).

-    [`
+    [`Phi4MMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.

     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -2454,14 +2454,14 @@ class PhiOForCausalLM(PhiOPreTrainedModel, GenerationMixin):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-
+    PHI4MM_START_DOCSTRING,
 )
 # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi, LLAMA->PHI, self.transformer->self.model, transformer_outputs->model_outputs
-class
+class Phi4MMForSequenceClassification(Phi4MMPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model =
+        self.model = Phi4MMModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

         # Initialize weights and apply final processing
@@ -2473,7 +2473,7 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value

-    @add_start_docstrings_to_model_forward(
+    @add_start_docstrings_to_model_forward(Phi4MM_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -2548,18 +2548,18 @@ class PhiOForSequenceClassification(PhiOPreTrainedModel):

 @add_start_docstrings(
     """
-    [`
+    [`Phi4MMModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
     Named-Entity-Recognition (NER) tasks.
     """,
-
+    PHI4MM_START_DOCSTRING,
 )
 # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi,MPT->PHI,self.transformer->self.model,transformer_outputs->model_outputs
-class
-    def __init__(self, config:
+class Phi4MMForTokenClassification(Phi4MMPreTrainedModel):
+    def __init__(self, config: Phi4MMConfig):
         super().__init__(config)
         self.num_labels = config.num_labels

-        self.model =
+        self.model = Phi4MMModel(config)
         if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
             classifier_dropout = config.classifier_dropout
         elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
@@ -2572,7 +2572,7 @@ class PhiOForTokenClassification(PhiOPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()

-    @add_start_docstrings_to_model_forward(
+    @add_start_docstrings_to_model_forward(Phi4MM_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,
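The rename above only touches class and constant names (PhiO* → Phi4MM*); the way the module is consumed as custom "remote code" is unchanged. Below is a minimal usage sketch, not part of the commit, assuming a checkpoint whose config.json maps the auto classes to modeling_phi4mm.Phi4MMForCausalLM; the diff lists the documentation checkpoint as "TBA", so the path here is a placeholder.

```python
# Minimal sketch: load the renamed Phi4MM causal LM as trust_remote_code custom code.
# "path/to/phi-4-mm-checkpoint" is a placeholder, not a real repository id.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "path/to/phi-4-mm-checkpoint"  # placeholder
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

inputs = tokenizer("This is an example script .", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```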
onnx/{processing_phio.py → processing_phi4mm.py}
RENAMED
@@ -13,7 +13,7 @@
 # limitations under the License.

 """
-Processor class for
+Processor class for Phi-4-MM
 """
 import re
 from typing import List, Optional, Tuple, Union
@@ -57,9 +57,9 @@ class InputMode(Enum):
     VISION_SPEECH = 3


-class
+class Phi4MMImageProcessor(BaseImageProcessor):
     r"""
-    Constructs a
+    Constructs a Phi4MM image processor.
     """
     model_input_names = ["input_image_embeds", "image_sizes", "image_attention_mask"]

@@ -317,7 +317,7 @@ def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
     return matrix


-class
+class Phi4MMAudioFeatureExtractor(SequenceFeatureExtractor):
     model_input_names = ["input_audio_embeds", "audio_embed_sizes", "audio_attention_mask"]

     def __init__(self, audio_compression_rate, audio_downsample_rate, audio_feat_stride, **kwargs):
@@ -489,15 +489,15 @@ class PhiOAudioFeatureExtractor(SequenceFeatureExtractor):
         return result


-class
+class Phi4MMProcessor(ProcessorMixin):
     r"""
-    Constructs a
+    Constructs a Phi4MM processor which raps an image processor, a audio processor, and a GPT tokenizer into a single processor.

-    [`
-    [`~
+    [`Phi4MMProcessor`] offers all the functionalities of [`Phi4MMImageProcessor`] and [`GPT2Tokenizer`]. See the
+    [`~Phi4MMProcessor.__call__`] and [`~Phi4MMProcessor.decode`] for more information.

     Args:
-        image_processor ([`
+        image_processor ([`Phi4MMImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`GPT2Tokenizer`], *optional*):
            The tokenizer is a required input.
@@ -505,8 +505,8 @@ class PhiOProcessor(ProcessorMixin):

     attributes = ["image_processor", "audio_processor", "tokenizer"]
     tokenizer_class = "GPT2TokenizerFast"
-    image_processor_class = "AutoImageProcessor" #
-    audio_processor_class = "AutoFeatureExtractor" #
+    image_processor_class = "AutoImageProcessor" # Phi4MMImageProcessor will be registered later
+    audio_processor_class = "AutoFeatureExtractor" # Phi4MMAudioFeatureExtractor will be registered later

     def __init__(self, image_processor, audio_processor, tokenizer):
         self.image_processor = image_processor
@@ -527,7 +527,7 @@ class PhiOProcessor(ProcessorMixin):
         Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text`
         and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode
         the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-
+        Phi4MMImageProcessor's [`~Phi4MMImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
         of the above two methods for more information.

         Args:
@@ -728,5 +728,5 @@ class PhiOProcessor(ProcessorMixin):
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + audio_processor_input_names))


-AutoImageProcessor.register("
-AutoFeatureExtractor.register("
+AutoImageProcessor.register("Phi4MMImageProcessor", Phi4MMImageProcessor)
+AutoFeatureExtractor.register("Phi4MMAudioFeatureExtractor", Phi4MMAudioFeatureExtractor)
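The two register() calls at the end of processing_phi4mm.py are what allow the string class attributes on Phi4MMProcessor (image_processor_class = "AutoImageProcessor", audio_processor_class = "AutoFeatureExtractor") to resolve to the renamed concrete classes. A minimal sketch of how the composite processor would typically be loaded, assuming the checkpoint ships this file as remote code; the path is a placeholder:

```python
# Minimal sketch: load the composite Phi4MM processor via the Auto classes.
# "path/to/phi-4-mm-checkpoint" is a placeholder, not a real repository id.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/phi-4-mm-checkpoint", trust_remote_code=True)

# Per the `attributes` list in the diff, the processor wraps three components:
print(type(processor.image_processor).__name__)  # expected: Phi4MMImageProcessor
print(type(processor.audio_processor).__name__)  # expected: Phi4MMAudioFeatureExtractor
print(type(processor.tokenizer).__name__)        # GPT2TokenizerFast
```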