borna committed on
Commit cd2a5d3 · 1 Parent(s): 64fdbdd

commit files to HF hub
config.json CHANGED
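The key change in this file is the `auto_map` block in the new version below, which points `AutoConfig` and `AutoModelForCausalLM` at Python modules shipped inside the repository. A checkpoint laid out this way is meant to be loaded with `trust_remote_code=True`; a minimal sketch follows (the repo id is an assumption, not something stated in the diff):

```python
# Minimal sketch of loading a checkpoint whose config.json declares an `auto_map`.
# The repo id below is hypothetical; replace it with the actual Hub path.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "borna/PersianStories-4k"  # assumed repo id

# trust_remote_code=True lets transformers import configuration.py / modeling.py
# from the repository, as referenced by the "auto_map" keys in config.json.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
```

Note that this resolution only succeeds when the `auto_map` targets name classes that actually exist in the committed `configuration.py` and `modeling.py` modules.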
@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "Phi-3-mini-4k-instruct",
+  "_name_or_path": "PersianStories-4k",
   "architectures": [
     "Phi3ForCausalLM"
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_phi3.Phi3Config",
-    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+    "AutoConfig": "configuration.Phi3Config",
+    "AutoModelForCausalLM": "modeling.Phi3ForCausalLM"
   },
   "bos_token_id": 1,
   "embd_pdrop": 0.0,
configuration_phi3.py → configuration.py RENAMED
@@ -22,15 +22,15 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+PersianStories_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json",
     "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json",
 }
 
 
-class Phi3Config(PretrainedConfig):
+class PersianStoriesConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
+    This is the configuration class to store the configuration of a [`PersianStoriesModel`]. It is used to instantiate a Phi-3
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the
     [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
@@ -41,7 +41,7 @@ class Phi3Config(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 32064):
             Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Phi3Model`].
+            `inputs_ids` passed when calling [`PersianStoriesModel`].
         hidden_size (`int`, *optional*, defaults to 3072):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 8192):
@@ -99,19 +99,19 @@ class Phi3Config(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import Phi3Model, Phi3Config
+    >>> from transformers import PersianStoriesModel, PersianStoriesConfig
 
     >>> # Initializing a Phi-3 style configuration
-    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+    >>> configuration = PersianStoriesConfig.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
 
     >>> # Initializing a model from the configuration
-    >>> model = Phi3Model(configuration)
+    >>> model = PersianStoriesModel(configuration)
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
-    model_type = "phi3"
+    model_type = "PersianStories"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
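Because `model_type` becomes `"PersianStories"`, the stock `transformers` Auto classes no longer recognize the type by name. A hedged sketch of one way to use the renamed classes locally, assuming this repo's `configuration.py` and `modeling.py` are importable from the working directory:

```python
# Hedged sketch: local registration of the renamed classes so that AutoConfig /
# AutoModelForCausalLM can resolve model_type "PersianStories" without remote code.
# Assumes this script sits next to the repo's configuration.py and modeling.py.
from transformers import AutoConfig, AutoModelForCausalLM

from configuration import PersianStoriesConfig
from modeling import PersianStoriesForCausalLM

AutoConfig.register("PersianStories", PersianStoriesConfig)
AutoModelForCausalLM.register(PersianStoriesConfig, PersianStoriesForCausalLM)

config = PersianStoriesConfig()               # Phi-3-mini-4k-style defaults
model = AutoModelForCausalLM.from_config(config)  # dispatches to PersianStoriesForCausalLM
```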
modeling_phi3.py → modeling.py RENAMED
@@ -45,7 +45,7 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from .configuration_phi3 import Phi3Config
+from .configuration import PersianStoriesConfig
 
 
 logger = logging.get_logger(__name__)
@@ -68,20 +68,20 @@ except ImportError as error:
     )
 
 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
-_CONFIG_FOR_DOC = "Phi3Config"
+_CONFIG_FOR_DOC = "PersianStoriesConfig"
 
-PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
+PersianStories_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "microsoft/Phi-3-mini-4k-instruct",
     "microsoft/Phi-3-mini-128k-instruct",
     # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
 ]
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
-class Phi3RMSNorm(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PersianStories
+class PersianStoriesRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        Phi3RMSNorm is equivalent to T5LayerNorm
+        PersianStoriesRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -108,8 +108,8 @@ def _get_unpad_data(attention_mask):
     )
 
 
-# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
-class Phi3RotaryEmbedding(nn.Module):
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->PersianStories, Gemma->PersianStories
+class PersianStoriesRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -139,7 +139,7 @@ class Phi3RotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding):
+class PersianStoriesLongRoPEScaledRotaryEmbedding(PersianStoriesRotaryEmbedding):
     def __init__(self, dim, config, device=None):
         super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
 
@@ -216,7 +216,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class Phi3MLP(nn.Module):
+class PersianStoriesMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
 
@@ -248,10 +248,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-class Phi3Attention(nn.Module):
+class PersianStoriesAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: PersianStoriesConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -287,7 +287,7 @@ class Phi3Attention(nn.Module):
 
     def _init_rope(self):
         if self.rope_scaling is None:
-            self.rotary_emb = Phi3RotaryEmbedding(
+            self.rotary_emb = PersianStoriesRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -295,7 +295,7 @@ class Phi3Attention(nn.Module):
         else:
             scaling_type = self.config.rope_scaling["type"]
             if scaling_type == "longrope":
-                self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config)
+                self.rotary_emb = PersianStoriesLongRoPEScaledRotaryEmbedding(self.head_dim, self.config)
             else:
                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
@@ -381,9 +381,9 @@ class Phi3Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class Phi3FlashAttention2(Phi3Attention):
+class PersianStoriesFlashAttention2(PersianStoriesAttention):
     """
-    Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays
+    Phi-3 flash attention module. This module inherits from `PersianStoriesAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -407,7 +407,7 @@ class Phi3FlashAttention2(Phi3Attention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # Phi3FlashAttention2 attention does not support output_attentions
+        # PersianStoriesFlashAttention2 attention does not support output_attentions
 
         if not _flash_supports_window_size:
             logger.warning_once(
@@ -690,16 +690,16 @@ class Phi3FlashAttention2(Phi3Attention):
             )
 
 
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->PersianStories
 # TODO @Arthur no longer copied from LLama after static cache
-class Phi3SdpaAttention(Phi3Attention):
+class PersianStoriesSdpaAttention(PersianStoriesAttention):
     """
-    Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    PersianStories attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `PersianStoriesAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
 
-    # Adapted from Phi3Attention.forward
+    # Adapted from PersianStoriesAttention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -712,7 +712,7 @@ class Phi3SdpaAttention(Phi3Attention):
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
-                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "PersianStoriesModel is using PersianStoriesSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
@@ -781,26 +781,26 @@ class Phi3SdpaAttention(Phi3Attention):
         return attn_output, None, past_key_value
 
 
-PHI3_ATTENTION_CLASSES = {
-    "eager": Phi3Attention,
-    "flash_attention_2": Phi3FlashAttention2,
-    "sdpa": Phi3SdpaAttention,
+PersianStories_ATTENTION_CLASSES = {
+    "eager": PersianStoriesAttention,
+    "flash_attention_2": PersianStoriesFlashAttention2,
+    "sdpa": PersianStoriesSdpaAttention,
 }
 
 
-class Phi3DecoderLayer(nn.Module):
-    def __init__(self, config: Phi3Config, layer_idx: int):
+class PersianStoriesDecoderLayer(nn.Module):
+    def __init__(self, config: PersianStoriesConfig, layer_idx: int):
         super().__init__()
 
         self.config = config
-        self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+        self.self_attn = PersianStories_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
 
-        self.mlp = Phi3MLP(config)
-        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = PersianStoriesMLP(config)
+        self.input_layernorm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -866,7 +866,7 @@ class Phi3DecoderLayer(nn.Module):
         return outputs
 
 
-PHI3_START_DOCSTRING = r"""
+PersianStories_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -876,7 +876,7 @@ PHI3_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`Phi3Config`]):
+        config ([`PersianStoriesConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -885,13 +885,13 @@ PHI3_START_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
-    PHI3_START_DOCSTRING,
+    PersianStories_START_DOCSTRING,
 )
-class Phi3PreTrainedModel(PreTrainedModel):
-    config_class = Phi3Config
+class PersianStoriesPreTrainedModel(PreTrainedModel):
+    config_class = PersianStoriesConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Phi3DecoderLayer"]
+    _no_split_modules = ["PersianStoriesDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = False
@@ -911,7 +911,7 @@ class Phi3PreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-PHI3_INPUTS_DOCSTRING = r"""
+PersianStories_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -983,17 +983,17 @@ PHI3_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare Phi-3 model outputting raw hidden-states without any specific head on top.",
-    PHI3_START_DOCSTRING,
+    PersianStories_START_DOCSTRING,
 )
-class Phi3Model(Phi3PreTrainedModel):
+class PersianStoriesModel(PersianStoriesPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersianStoriesDecoderLayer`]
 
     Args:
-        config: Phi3Config
+        config: PersianStoriesConfig
     """
 
-    def __init__(self, config: Phi3Config):
+    def __init__(self, config: PersianStoriesConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1001,10 +1001,10 @@ class Phi3Model(Phi3PreTrainedModel):
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.embed_dropout = nn.Dropout(config.embd_pdrop)
         self.layers = nn.ModuleList(
-            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [PersianStoriesDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = PersianStoriesRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1016,7 +1016,7 @@ class Phi3Model(Phi3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1079,7 +1079,7 @@ class Phi3Model(Phi3PreTrainedModel):
             if is_padding_right:
                 raise ValueError(
                     "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
+                    " this may lead to unexpected behaviour for Flash Attention version of PersianStories. Make sure to "
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
 
@@ -1154,13 +1154,13 @@ class Phi3Model(Phi3PreTrainedModel):
         )
 
 
-class Phi3ForCausalLM(Phi3PreTrainedModel):
+class PersianStoriesForCausalLM(PersianStoriesPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->PersianStories
     def __init__(self, config):
         super().__init__(config)
-        self.model = Phi3Model(config)
+        self.model = PersianStoriesModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1192,7 +1192,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         return self.model
 
     # Ignore copy
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1219,9 +1219,9 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, Phi3ForCausalLM
+        >>> from transformers import AutoTokenizer, PersianStoriesForCausalLM
 
-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+        >>> model = PersianStoriesForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
         >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
 
         >>> prompt = "This is an example script ."
@@ -1351,9 +1351,9 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
 
 @add_start_docstrings(
     """
-    The [`Phi3Model`] with a sequence classification head on top (linear layer).
+    The [`PersianStoriesModel`] with a sequence classification head on top (linear layer).
 
-    [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`PersianStoriesForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1362,14 +1362,14 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    PHI3_START_DOCSTRING,
+    PersianStories_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
-class Phi3ForSequenceClassification(Phi3PreTrainedModel):
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->PersianStories, LLAMA->PersianStories, self.transformer->self.model, transformer_outputs->model_outputs
+class PersianStoriesForSequenceClassification(PersianStoriesPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = Phi3Model(config)
+        self.model = PersianStoriesModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1381,7 +1381,7 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1475,18 +1475,18 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
 
 @add_start_docstrings(
     """
-    [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    [`PersianStoriesModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
     Named-Entity-Recognition (NER) tasks.
     """,
-    PHI3_START_DOCSTRING,
+    PersianStories_START_DOCSTRING,
 )
-# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
-class Phi3ForTokenClassification(Phi3PreTrainedModel):
-    def __init__(self, config: Phi3Config):
+# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->PersianStories,MPT->PersianStories,self.transformer->self.model,transformer_outputs->model_outputs
+class PersianStoriesForTokenClassification(PersianStoriesPreTrainedModel):
+    def __init__(self, config: PersianStoriesConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
 
-        self.model = Phi3Model(config)
+        self.model = PersianStoriesModel(config)
         if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
             classifier_dropout = config.classifier_dropout
         elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
@@ -1499,7 +1499,7 @@ class Phi3ForTokenClassification(Phi3PreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PersianStories_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TokenClassifierOutput,
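The generation example in the docstring above carries over directly to the renamed class. A short sketch, assuming the renamed `modeling.py` is importable from the working directory; the checkpoint id is the upstream Phi-3 one used in the docstring and may not match this repository's weights:

```python
# Sketch mirroring the docstring's generation example with the renamed class.
# The checkpoint id is taken from the docstring and is an assumption here; a
# model-type mismatch warning is expected since config.model_type differs.
import torch
from transformers import AutoTokenizer

from modeling import PersianStoriesForCausalLM

model = PersianStoriesForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

prompt = "This is an example script ."
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```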