PyTorch · English · Chinese · plm · custom_code
UCASLuoyang committed (verified) · commit c033303 · 1 parent: bc17ed7

Rename modeling_edgellm.py to modeling_plm.py
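
Because the repository ships its modeling code as `custom_code`, this rename (together with the Edgellm→PLM class renames in the diff below) changes the remote module that `trust_remote_code` loading has to resolve; presumably the config's `auto_map` entries are updated to point at `modeling_plm` in a companion change not shown here. Below is a minimal loading sketch under that assumption; the repo id is borrowed from the commented-out example at the bottom of the file and is itself an assumption, not something this commit pins down.

```python
# Minimal sketch, not part of this commit: load a checkpoint through the
# renamed remote module modeling_plm.py. The repo id is an assumption taken
# from the commented-out example at the end of the file.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "PLM-Team/PLM-1.8B-Base"  # assumed; substitute the actual Hub repo

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # required so transformers imports modeling_plm.PLMForCausalLM
    # attn_implementation="flash_attention_2" would select PLMFlashAttention2 (needs flash-attn + CUDA)
)

prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```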

modeling_edgellm.py → modeling_plm.py RENAMED
--- modeling_edgellm.py
+++ modeling_plm.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2024 The EdgeLLM team and The HuggingFace Inc. All rights reserved.
+# Copyright 2024 The PLM team and The HuggingFace Inc. All rights reserved.
 #
 # This code is based on Alibaba's Qwen2 library, DeepSeek-AI's deepseekv2
 # libraryEleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch EdgeLLM model."""
+"""PyTorch PLM model."""
 
 import inspect
 import math
@@ -53,7 +53,7 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from .configuration_edgellm import EdgellmConfig
+from .configuration_plm import PLMConfig
 
 
 if is_flash_attn_2_available():
@@ -66,8 +66,8 @@ if is_flash_attn_2_available():
 logger = logging.get_logger(__name__)
 
 
-_CHECKPOINT_FOR_DOC = "Edgellm/Edgellm-7B-beta"
-_CONFIG_FOR_DOC = "EdgellmConfig"
+_CHECKPOINT_FOR_DOC = "PLM/PLM-1.8B-base"
+_CONFIG_FOR_DOC = "PLMConfig"
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -82,17 +82,12 @@ def _get_unpad_data(attention_mask):
         max_seqlen_in_batch,
     )
 
-class IdentityOperation(nn.Module):
-    def __init__(self):
-        super(IdentityOperation, self).__init__()
 
-    def forward(self, x):
-        return x
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Edgellm
-class EdgellmRMSNorm(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PLM
+class PLMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        EdgellmRMSNorm is equivalent to T5LayerNorm
+        PLMRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -107,8 +102,8 @@ class EdgellmRMSNorm(nn.Module):
         return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Edgellm
-class EdgellmRotaryEmbedding(nn.Module):
+# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->PLM
+class PLMRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=4096, base=100000, device=None):
         super().__init__()
         self.dim = dim
@@ -150,8 +145,8 @@ class EdgellmRotaryEmbedding(nn.Module):
         )
 
 
-class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class PLMLinearScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -178,9 +173,9 @@ class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Edgellm
-class EdgellmDynamicNTKScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->PLM
+class PLMDynamicNTKScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -254,7 +249,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class EdgellmYarnRotaryEmbedding(EdgellmRotaryEmbedding):
+class PLMYarnRotaryEmbedding(PLMRotaryEmbedding):
 
     def __init__(
         self,
@@ -366,7 +361,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class EdgellmMLP(nn.Module):
+class PLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -396,9 +391,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py
-# DeepseekV2Attention with DeepseekV2->Edgellm
+# DeepseekV2Attention with DeepseekV2->PLM
 
-class EdgellmAttention(nn.Module):
+class PLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config, layer_idx: Optional[int] = None):
@@ -424,8 +419,6 @@ class EdgellmAttention(nn.Module):
         self.v_head_dim = config.v_head_dim
         self.qk_nope_head_dim = config.qk_nope_head_dim
         self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
-        self.attn_in = IdentityOperation()
-        self.attn_out = IdentityOperation()
 
         self.is_causal = True
 
@@ -437,7 +430,7 @@ class EdgellmAttention(nn.Module):
         self.q_a_proj = nn.Linear(
             self.hidden_size, config.q_lora_rank, bias=config.attention_bias
         )
-        self.q_a_layernorm = EdgellmRMSNorm(config.q_lora_rank)
+        self.q_a_layernorm = PLMRMSNorm(config.q_lora_rank)
         self.q_b_proj = nn.Linear(
             config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
         )
@@ -447,27 +440,27 @@ class EdgellmAttention(nn.Module):
             config.kv_lora_rank + config.qk_rope_head_dim,
             bias=config.attention_bias,
         ) # 2048 512 64
-        self.kv_a_layernorm = EdgellmRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = PLMRMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
             self.num_heads
             * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
             bias=False,
-        ) #512
-        # breakpoint()
+        )
+
         self.o_proj = nn.Linear(
             self.num_heads * self.v_head_dim,
             self.hidden_size,
             bias=config.attention_bias,
-        ) # 16 128 2048
+        )
         self._init_rope()
 
-        self.softmax_scale = self.q_head_dim ** (-0.5) # sqrt 1/192
+        self.softmax_scale = self.q_head_dim ** (-0.5)
 
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = EdgellmRotaryEmbedding(
+            self.rotary_emb = PLMRotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -623,7 +616,7 @@ class EdgellmAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class EdgellmFlashAttention2(EdgellmAttention):
+class PLMFlashAttention2(PLMAttention):
     """
     DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
@@ -747,7 +740,7 @@ class EdgellmFlashAttention2(EdgellmAttention):
             query_states = query_states.to(target_dtype)
             key_states = key_states.to(target_dtype)
             value_states = value_states.to(target_dtype)
-        # breakpoint()
+
         attn_output = self._flash_attention_forward(
             query_states,
             key_states,
@@ -763,11 +756,9 @@ class EdgellmFlashAttention2(EdgellmAttention):
         attn_output = attn_output.reshape(
             bsz, q_len, self.num_heads * self.v_head_dim
         ).contiguous()
-        # torch.save(attn_output, "./hf-attn_output_b_821.pt")
-        # breakpoint()
+
         attn_output = self.o_proj(attn_output)
-        # torch.save(attn_output, "./hf-attn_output_821.pt")
-        # breakpoint()
+
         if not output_attentions:
             attn_weights = None
 
@@ -898,14 +889,14 @@ class EdgellmFlashAttention2(EdgellmAttention):
             (cu_seqlens_q, cu_seqlens_k),
             (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
         )
-Edgellm_ATTENTION_CLASSES = {
-    "eager": EdgellmAttention,
-    "flash_attention_2": EdgellmFlashAttention2,
+PLM_ATTENTION_CLASSES = {
+    "eager": PLMAttention,
+    "flash_attention_2": PLMFlashAttention2,
 }
 
 
-class EdgellmDecoderLayer(nn.Module):
-    def __init__(self, config: EdgellmConfig, layer_idx: int):
+class PLMDecoderLayer(nn.Module):
+    def __init__(self, config: PLMConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -914,10 +905,10 @@ class EdgellmDecoderLayer(nn.Module):
                 f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                 "unexpected results may be encountered."
             )
-        self.self_attn = Edgellm_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        self.mlp = EdgellmMLP(config)
-        self.input_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = PLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = PLMMLP(config)
+        self.input_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -982,7 +973,7 @@ class EdgellmDecoderLayer(nn.Module):
         return outputs
 
 
-Edgellm_START_DOCSTRING = r"""
+PLM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -992,7 +983,7 @@ Edgellm_START_DOCSTRING = r"""
    and behavior.
 
    Parameters:
-        config ([`EdgellmConfig`]):
+        config ([`PLMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1000,14 +991,14 @@ Edgellm_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmPreTrainedModel(PreTrainedModel):
-    config_class = EdgellmConfig
+class PLMPreTrainedModel(PreTrainedModel):
+    config_class = PLMConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["EdgellmDecoderLayer"]
+    _no_split_modules = ["PLMDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_cache_class = True
@@ -1024,7 +1015,7 @@ class EdgellmPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
 
 
-Edgellm_INPUTS_DOCSTRING = r"""
+PLM_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1099,28 +1090,28 @@ Edgellm_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmModel(EdgellmPreTrainedModel):
+class PLMModel(PLMPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EdgellmDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PLMDecoderLayer`]
 
     Args:
-        config: EdgellmConfig
+        config: PLMConfig
     """
 
-    def __init__(self, config: EdgellmConfig):
+    def __init__(self, config: PLMConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [EdgellmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [PLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
         self._attn_implementation = config._attn_implementation
-        self.norm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1132,7 +1123,7 @@ class EdgellmModel(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1267,12 +1258,12 @@ class EdgellmModel(EdgellmPreTrainedModel):
         )
 
 
-class EdgellmForCausalLM(EdgellmPreTrainedModel):
+class PLMForCausalLM(PLMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1297,7 +1288,7 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1325,9 +1316,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, EdgellmForCausalLM
+        >>> from transformers import AutoTokenizer, PLMForCausalLM
 
-        >>> model = EdgellmForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = PLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1473,9 +1464,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
 
 
 @add_start_docstrings(
     """
-    The Edgellm Model transformer with a sequence classification head on top (linear layer).
+    The PLM Model transformer with a sequence classification head on top (linear layer).
 
-    [`EdgellmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`PLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
@@ -1484,13 +1475,13 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
+class PLMForSequenceClassification(PLMPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1502,7 +1493,7 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1596,17 +1587,17 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
 
 
 @add_start_docstrings(
     """
-    The Edgellm Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    The PLM Model transformer with a token classification head on top (a linear layer on top of the hidden-states
     output) e.g. for Named-Entity-Recognition (NER) tasks.
     """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Edgellm, LLAMA->Edgellm
-class EdgellmForTokenClassification(EdgellmPreTrainedModel):
+# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->PLM, LLAMA->PLM
+class PLMForTokenClassification(PLMPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
@@ -1625,7 +1616,7 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1683,9 +1674,9 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
 # from IPython import embed
 # from transformers import Qwen2Tokenizer
 # import light_hf_proxy
-# tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-1.5B")
-# config = EdgellmConfig.from_pretrained("/data/daven/edge/edgellm/edgellm/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-# model = EdgellmForCausalLM(config).to(torch.bfloat16).to("cuda:7")
+# tokenizer = Qwen2Tokenizer.from_pretrained("PLM-Team/PLM-1.8B-Base")
+# config = PLMConfig.from_pretrained("PLM-Team/PLM-1.8B-Base/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+# model = PLMForCausalLM(config).to(torch.bfloat16).to("cuda:7")
 # input_ids = tokenizer(
 #     "Thanks to the generous support from SIGMOD EC, we will provide scholarship awards to selected students attending the WSDM 2024 conference. For awardees attending in-person, the grant will cover the cost of registration + some travel expenses. The awards will be competitive in the sense that not every student will receive a Travel Award. Each awardee will receive a bursary to partially cover the expense to attend the conference in-person. Awardees are expected to register for the main conference using a free-registration code provided with the award notification email and will have to make their own arrangements for travel and accommodation.Awardees are expected to register for the main conference and will have to make their own arrangements for travel and accommodation."
 # )