commit files to HF hub

Browse files

Files changed (3) hide show

config.json +3 -3
configuration_phi3.py → configuration.py +8 -8
modeling_phi3.py → modeling.py +69 -69

config.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "Phi-3-mini-4k-instruct",
   "architectures": [
     "Phi3ForCausalLM"
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_phi3.Phi3Config",
-    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
   },
   "bos_token_id": 1,
   "embd_pdrop": 0.0,

 {
+  "_name_or_path": "PersianStories-4k",
   "architectures": [
     "Phi3ForCausalLM"
   ],
   "attention_dropout": 0.0,
   "auto_map": {
+    "AutoConfig": "configuration.Phi3Config",
+    "AutoModelForCausalLM": "modeling.Phi3ForCausalLM"
   },
   "bos_token_id": 1,
   "embd_pdrop": 0.0,

configuration_phi3.py → configuration.py RENAMED Viewed

@@ -22,15 +22,15 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
-PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
     "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
 }
-class Phi3Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the
     [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
@@ -41,7 +41,7 @@ class Phi3Config(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 32064):
             Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Phi3Model`].
         hidden_size (`int`, *optional*, defaults to 3072):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 8192):
@@ -99,19 +99,19 @@ class Phi3Config(PretrainedConfig):
     Example:
     ```python
-    >>> from transformers import Phi3Model, Phi3Config
     >>> # Initializing a Phi-3 style configuration
-    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
     >>> # Initializing a model from the configuration
-    >>> model = Phi3Model(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
-    model_type = "phi3"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(

 logger = logging.get_logger(__name__)
+PersianStories_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
     "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
 }
+class PersianStoriesConfig(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`PersianStoriesModel`]. It is used to instantiate a Phi-3
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the
     [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
     Args:
         vocab_size (`int`, *optional*, defaults to 32064):
             Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`PersianStoriesModel`].
         hidden_size (`int`, *optional*, defaults to 3072):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 8192):
     Example:
     ```python
+    >>> from transformers import PersianStoriesModel, PersianStoriesConfig
     >>> # Initializing a Phi-3 style configuration
+    >>> configuration = PersianStoriesConfig.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
     >>> # Initializing a model from the configuration
+    >>> model = PersianStoriesModel(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+    model_type = "PersianStories"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(

modeling_phi3.py → modeling.py RENAMED Viewed

@@ -45,7 +45,7 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from .configuration_phi3 import Phi3Config
 logger = logging.get_logger(__name__)
@@ -68,20 +68,20 @@ except ImportError as error:
         )
 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
-_CONFIG_FOR_DOC = "Phi3Config"
-PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "microsoft/Phi-3-mini-4k-instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
 ]
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
-class Phi3RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        Phi3RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -108,8 +108,8 @@ def _get_unpad_data(attention_mask):
     )
-# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
-class Phi3RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -139,7 +139,7 @@ class Phi3RotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding):
     def __init__(self, dim, config, device=None):
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
@@ -216,7 +216,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
-class Phi3MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -248,10 +248,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-class Phi3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
-    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -287,7 +287,7 @@ class Phi3Attention(nn.Module):
     def _init_rope(self):
         if self.rope_scaling is None:
-            self.rotary_emb = Phi3RotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -295,7 +295,7 @@ class Phi3Attention(nn.Module):
         else:
             scaling_type = self.config.rope_scaling["type"]
             if scaling_type == "longrope":
-                self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config)
             else:
                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
@@ -381,9 +381,9 @@ class Phi3Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
-class Phi3FlashAttention2(Phi3Attention):
     """
-    Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -407,7 +407,7 @@ class Phi3FlashAttention2(Phi3Attention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # Phi3FlashAttention2 attention does not support output_attentions
         if not _flash_supports_window_size:
             logger.warning_once(
@@ -690,16 +690,16 @@ class Phi3FlashAttention2(Phi3Attention):
         )
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
 # TODO @Arthur no longer copied from LLama after static cache
-class Phi3SdpaAttention(Phi3Attention):
     """
-    Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
-    # Adapted from Phi3Attention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -712,7 +712,7 @@ class Phi3SdpaAttention(Phi3Attention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -781,26 +781,26 @@ class Phi3SdpaAttention(Phi3Attention):
         return attn_output, None, past_key_value
-PHI3_ATTENTION_CLASSES = {
-    "eager": Phi3Attention,
-    "flash_attention_2": Phi3FlashAttention2,
-    "sdpa": Phi3SdpaAttention,
 }
-class Phi3DecoderLayer(nn.Module):
-    def __init__(self, config: Phi3Config, layer_idx: int):
         super().__init__()
         self.config = config
-        self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
-        self.mlp = Phi3MLP(config)
-        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
@@ -866,7 +866,7 @@ class Phi3DecoderLayer(nn.Module):
         return outputs
-PHI3_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -876,7 +876,7 @@ PHI3_START_DOCSTRING = r"""
     and behavior.
     Parameters:
-        config ([`Phi3Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -885,13 +885,13 @@ PHI3_START_DOCSTRING = r"""
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
-    PHI3_START_DOCSTRING,
 )
-class Phi3PreTrainedModel(PreTrainedModel):
-    config_class = Phi3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Phi3DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = False
@@ -911,7 +911,7 @@ class Phi3PreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
-PHI3_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -983,17 +983,17 @@ PHI3_INPUTS_DOCSTRING = r"""
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
-    PHI3_START_DOCSTRING,
 )
-class Phi3Model(Phi3PreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
     Args:
-        config: Phi3Config
     """
-    def __init__(self, config: Phi3Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1001,10 +1001,10 @@ class Phi3Model(Phi3PreTrainedModel):
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.embed_dropout = nn.Dropout(config.embd_pdrop)
         self.layers = nn.ModuleList(
-            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1016,7 +1016,7 @@ class Phi3Model(Phi3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1079,7 +1079,7 @@ class Phi3Model(Phi3PreTrainedModel):
             if is_padding_right:
                 raise ValueError(
                     "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
                     " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                 )
@@ -1154,13 +1154,13 @@ class Phi3Model(Phi3PreTrainedModel):
         )
-class Phi3ForCausalLM(Phi3PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
     def __init__(self, config):
         super().__init__(config)
-        self.model = Phi3Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -1192,7 +1192,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         return self.model
     # Ignore copy
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1219,9 +1219,9 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         Example:
         ```python
-        >>> from transformers import AutoTokenizer, Phi3ForCausalLM
-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
         >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
         >>> prompt = "This is an example script ."
@@ -1351,9 +1351,9 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
 @add_start_docstrings(
     """
-    The [`Phi3Model`] with a sequence classification head on top (linear layer).
-    [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1362,14 +1362,14 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    PHI3_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
-class Phi3ForSequenceClassification(Phi3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = Phi3Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
@@ -1381,7 +1381,7 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1475,18 +1475,18 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
 @add_start_docstrings(
     """
-    [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
     Named-Entity-Recognition (NER) tasks.
     """,
-    PHI3_START_DOCSTRING,
 )
-# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
-class Phi3ForTokenClassification(Phi3PreTrainedModel):
-    def __init__(self, config: Phi3Config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = Phi3Model(config)
         if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
             classifier_dropout = config.classifier_dropout
         elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
@@ -1499,7 +1499,7 @@ class Phi3ForTokenClassification(Phi3PreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,

     logging,
     replace_return_docstrings,
 )
+from .configuration import PersianStoriesConfig
 logger = logging.get_logger(__name__)
         )
 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
+_CONFIG_FOR_DOC = "PersianStoriesConfig"
+PersianStories_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "microsoft/Phi-3-mini-4k-instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
 ]
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PersianStories
+class PersianStoriesRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
+        PersianStoriesRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
     )
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->PersianStories, Gemma->PersianStories
+class PersianStoriesRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class PersianStoriesLongRoPEScaledRotaryEmbedding(PersianStoriesRotaryEmbedding):
     def __init__(self, dim, config, device=None):
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
     return q_embed, k_embed
+class PersianStoriesMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class PersianStoriesAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: PersianStoriesConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
     def _init_rope(self):
         if self.rope_scaling is None:
+            self.rotary_emb = PersianStoriesRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
         else:
             scaling_type = self.config.rope_scaling["type"]
             if scaling_type == "longrope":
+                self.rotary_emb = PersianStoriesLongRoPEScaledRotaryEmbedding(self.head_dim, self.config)
             else:
                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
         return attn_output, attn_weights, past_key_value
+class PersianStoriesFlashAttention2(PersianStoriesAttention):
     """
+    Phi-3 flash attention module. This module inherits from `PersianStoriesAttention` as the weights of the module stay
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # PersianStoriesFlashAttention2 attention does not support output_attentions
         if not _flash_supports_window_size:
             logger.warning_once(
         )
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->PersianStories
 # TODO @Arthur no longer copied from LLama after static cache
+class PersianStoriesSdpaAttention(PersianStoriesAttention):
     """
+    PersianStories attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `PersianStoriesAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
+    # Adapted from PersianStoriesAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
+                "PersianStoriesModel is using PersianStoriesSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
         return attn_output, None, past_key_value
+PersianStories_ATTENTION_CLASSES = {
+    "eager": PersianStoriesAttention,
+    "flash_attention_2": PersianStoriesFlashAttention2,
+    "sdpa": PersianStoriesSdpaAttention,
 }
+class PersianStoriesDecoderLayer(nn.Module):
+    def __init__(self, config: PersianStoriesConfig, layer_idx: int):
         super().__init__()
         self.config = config
+        self.self_attn = PersianStories_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+        self.mlp = PersianStoriesMLP(config)
+        self.input_layernorm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+        self.post_attention_layernorm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
         return outputs
+PersianStories_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
     and behavior.
     Parameters:
+        config ([`PersianStoriesConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
+    PersianStories_START_DOCSTRING,
 )
+class PersianStoriesPreTrainedModel(PreTrainedModel):
+    config_class = PersianStoriesConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["PersianStoriesDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = False
                 module.weight.data[module.padding_idx].zero_()
+PersianStories_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
+    PersianStories_START_DOCSTRING,
 )
+class PersianStoriesModel(PersianStoriesPreTrainedModel):
     """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersianStoriesDecoderLayer`]
     Args:
+        config: PersianStoriesConfig
     """
+    def __init__(self, config: PersianStoriesConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.embed_dropout = nn.Dropout(config.embd_pdrop)
         self.layers = nn.ModuleList(
+            [PersianStoriesDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
+        self.norm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
     def set_input_embeddings(self, value):
         self.embed_tokens = value
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
             if is_padding_right:
                 raise ValueError(
                     "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of PersianStories. Make sure to "
                     " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                 )
         )
+class PersianStoriesForCausalLM(PersianStoriesPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->PersianStories
     def __init__(self, config):
         super().__init__(config)
+        self.model = PersianStoriesModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         return self.model
     # Ignore copy
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         Example:
         ```python
+        >>> from transformers import AutoTokenizer, PersianStoriesForCausalLM
+        >>> model = PersianStoriesForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
         >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
         >>> prompt = "This is an example script ."
 @add_start_docstrings(
     """
+    The [`PersianStoriesModel`] with a sequence classification head on top (linear layer).
+    [`PersianStoriesForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
+    PersianStories_START_DOCSTRING,
 )
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->PersianStories, LLAMA->PersianStories, self.transformer->self.model, transformer_outputs->model_outputs
+class PersianStoriesForSequenceClassification(PersianStoriesPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = PersianStoriesModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
         # Initialize weights and apply final processing
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
 @add_start_docstrings(
     """
+    [`PersianStoriesModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
     Named-Entity-Recognition (NER) tasks.
     """,
+    PersianStories_START_DOCSTRING,
 )
+# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->PersianStories,MPT->PersianStories,self.transformer->self.model,transformer_outputs->model_outputs
+class PersianStoriesForTokenClassification(PersianStoriesPreTrainedModel):
+    def __init__(self, config: PersianStoriesConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
+        self.model = PersianStoriesModel(config)
         if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
             classifier_dropout = config.classifier_dropout
         elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
         # Initialize weights and apply final processing
         self.post_init()
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,