Rename modeling_edgellm.py to modeling_plm.py
modeling_edgellm.py → modeling_plm.py
RENAMED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2024 The
+# Copyright 2024 The PLM team and The HuggingFace Inc. All rights reserved.
 #
 # This code is based on Alibaba's Qwen2 library, DeepSeek-AI's deepseekv2
 # libraryEleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch Edgellm model."""
+"""PyTorch PLM model."""
 
 import inspect
 import math
@@ -53,7 +53,7 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from .configuration_edgellm import EdgellmConfig
+from .configuration_plm import PLMConfig
 
 
 if is_flash_attn_2_available():
@@ -66,8 +66,8 @@ if is_flash_attn_2_available():
 logger = logging.get_logger(__name__)
 
 
-_CHECKPOINT_FOR_DOC = "
-_CONFIG_FOR_DOC = "EdgellmConfig"
+_CHECKPOINT_FOR_DOC = "PLM/PLM-1.8B-base"
+_CONFIG_FOR_DOC = "PLMConfig"
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -82,17 +82,12 @@ def _get_unpad_data(attention_mask):
         max_seqlen_in_batch,
     )
 
-class IdentityOperation(nn.Module):
-    def __init__(self):
-        super(IdentityOperation, self).__init__()
 
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Edgellm
-class EdgellmRMSNorm(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PLM
+class PLMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        EdgellmRMSNorm is equivalent to T5LayerNorm
+        PLMRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -107,8 +102,8 @@ class EdgellmRMSNorm(nn.Module):
         return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Edgellm
-class EdgellmRotaryEmbedding(nn.Module):
+# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->PLM
+class PLMRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=4096, base=100000, device=None):
         super().__init__()
         self.dim = dim
@@ -150,8 +145,8 @@ class EdgellmRotaryEmbedding(nn.Module):
         )
 
 
-class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class PLMLinearScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -178,9 +173,9 @@ class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Edgellm
-class EdgellmDynamicNTKScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->PLM
+class PLMDynamicNTKScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -254,7 +249,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class EdgellmYarnRotaryEmbedding(EdgellmRotaryEmbedding):
+class PLMYarnRotaryEmbedding(PLMRotaryEmbedding):
 
     def __init__(
         self,
@@ -366,7 +361,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class EdgellmMLP(nn.Module):
+class PLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -396,9 +391,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py
-# DeepseekV2Attention with DeepseekV2->Edgellm
+# DeepseekV2Attention with DeepseekV2->PLM
 
-class EdgellmAttention(nn.Module):
+class PLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config, layer_idx: Optional[int] = None):
@@ -424,8 +419,6 @@ class EdgellmAttention(nn.Module):
         self.v_head_dim = config.v_head_dim
         self.qk_nope_head_dim = config.qk_nope_head_dim
         self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
-        self.attn_in = IdentityOperation()
-        self.attn_out = IdentityOperation()
 
         self.is_causal = True
 
@@ -437,7 +430,7 @@ class EdgellmAttention(nn.Module):
         self.q_a_proj = nn.Linear(
            self.hidden_size, config.q_lora_rank, bias=config.attention_bias
        )
-        self.q_a_layernorm = EdgellmRMSNorm(config.q_lora_rank)
+        self.q_a_layernorm = PLMRMSNorm(config.q_lora_rank)
        self.q_b_proj = nn.Linear(
            config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
        )
@@ -447,27 +440,27 @@ class EdgellmAttention(nn.Module):
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        ) # 2048 512 64
-        self.kv_a_layernorm = EdgellmRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = PLMRMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
-        )
-
+        )
+
        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
-        )
+        )
        self._init_rope()
 
-        self.softmax_scale = self.q_head_dim ** (-0.5)
+        self.softmax_scale = self.q_head_dim ** (-0.5)
 
 
    def _init_rope(self):
        if self.config.rope_scaling is None:
-            self.rotary_emb = EdgellmRotaryEmbedding(
+            self.rotary_emb = PLMRotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
@@ -623,7 +616,7 @@ class EdgellmAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class EdgellmFlashAttention2(EdgellmAttention):
+class PLMFlashAttention2(PLMAttention):
    """
    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
@@ -747,7 +740,7 @@ class EdgellmFlashAttention2(EdgellmAttention):
         query_states = query_states.to(target_dtype)
         key_states = key_states.to(target_dtype)
         value_states = value_states.to(target_dtype)
-
+
        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
@@ -763,11 +756,9 @@ class EdgellmFlashAttention2(EdgellmAttention):
         attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
-
-        # breakpoint()
+
        attn_output = self.o_proj(attn_output)
-
-        # breakpoint()
+
        if not output_attentions:
            attn_weights = None
 
@@ -898,14 +889,14 @@ class EdgellmFlashAttention2(EdgellmAttention):
         (cu_seqlens_q, cu_seqlens_k),
         (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
     )
-Edgellm_ATTENTION_CLASSES = {
-    "eager": EdgellmAttention,
-    "flash_attention_2": EdgellmFlashAttention2,
+PLM_ATTENTION_CLASSES = {
+    "eager": PLMAttention,
+    "flash_attention_2": PLMFlashAttention2,
 }
 
 
-class EdgellmDecoderLayer(nn.Module):
-    def __init__(self, config: EdgellmConfig, layer_idx: int):
+class PLMDecoderLayer(nn.Module):
+    def __init__(self, config: PLMConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
 
@@ -914,10 +905,10 @@ class EdgellmDecoderLayer(nn.Module):
             f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
             "unexpected results may be encountered."
         )
-        self.self_attn = Edgellm_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        self.mlp = EdgellmMLP(config)
-        self.input_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = PLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = PLMMLP(config)
+        self.input_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
    def forward(
        self,
@@ -982,7 +973,7 @@ class EdgellmDecoderLayer(nn.Module):
         return outputs
 
 
-Edgellm_START_DOCSTRING = r"""
+PLM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
@@ -992,7 +983,7 @@ Edgellm_START_DOCSTRING = r"""
     and behavior.
 
    Parameters:
-        config ([`EdgellmConfig`]):
+        config ([`PLMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1000,14 +991,14 @@ Edgellm_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmPreTrainedModel(PreTrainedModel):
-    config_class = EdgellmConfig
+class PLMPreTrainedModel(PreTrainedModel):
+    config_class = PLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
-    _no_split_modules = ["EdgellmDecoderLayer"]
+    _no_split_modules = ["PLMDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True
@@ -1024,7 +1015,7 @@ class EdgellmPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-Edgellm_INPUTS_DOCSTRING = r"""
+PLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1099,28 +1090,28 @@ Edgellm_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmModel(EdgellmPreTrainedModel):
+class PLMModel(PLMPreTrainedModel):
    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EdgellmDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PLMDecoderLayer`]
 
    Args:
-        config: EdgellmConfig
+        config: PLMConfig
    """
 
-    def __init__(self, config: EdgellmConfig):
+    def __init__(self, config: PLMConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
 
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
-            [EdgellmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [PLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
-        self.norm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
@@ -1132,7 +1123,7 @@ class EdgellmModel(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
@@ -1267,12 +1258,12 @@ class EdgellmModel(EdgellmPreTrainedModel):
         )
 
 
-class EdgellmForCausalLM(EdgellmPreTrainedModel):
+class PLMForCausalLM(PLMPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
 
    def __init__(self, config):
        super().__init__(config)
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1297,7 +1288,7 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
@@ -1325,9 +1316,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
         Example:
 
        ```python
-        >>> from transformers import AutoTokenizer, EdgellmForCausalLM
+        >>> from transformers import AutoTokenizer, PLMForCausalLM
 
-        >>> model = EdgellmForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = PLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1473,9 +1464,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
 
 @add_start_docstrings(
    """
-    The Edgellm Model transformer with a sequence classification head on top (linear layer).
+    The PLM Model transformer with a sequence classification head on top (linear layer).
 
-    [`EdgellmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`PLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.
 
    Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1484,13 +1475,13 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
+class PLMForSequenceClassification(PLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
        # Initialize weights and apply final processing
@@ -1502,7 +1493,7 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
@@ -1596,17 +1587,17 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
 
 @add_start_docstrings(
    """
-    The Edgellm Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    The PLM Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Edgellm
-class EdgellmForTokenClassification(EdgellmPreTrainedModel):
+# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->PLM, LLAMA->PLM
+class PLMForTokenClassification(PLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
@@ -1625,7 +1616,7 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
@@ -1683,9 +1674,9 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
 # from IPython import embed
 # from transformers import Qwen2Tokenizer
 # import light_hf_proxy
-# tokenizer = Qwen2Tokenizer.from_pretrained("
-# config =
-# model =
+# tokenizer = Qwen2Tokenizer.from_pretrained("PLM-Team/PLM-1.8B-Base")
+# config = PLMConfig.from_pretrained("PLM-Team/PLM-1.8B-Base/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+# model = PLMForCausalLM(config).to(torch.bfloat16).to("cuda:7")
 # input_ids = tokenizer(
 #     "Thanks to the generous support from SIGMOD EC, we will provide scholarship awards to selected students attending the WSDM 2024 conference. For awardees attending in-person, the grant will cover the cost of registration + some travel expenses. The awards will be competitive in the sense that not every student will receive a Travel Award. Each awardee will receive a bursary to partially cover the expense to attend the conference in-person. Awardees are expected to register for the main conference using a free-registration code provided with the award notification email and will have to make their own arrangements for travel and accommodation.Awardees are expected to register for the main conference and will have to make their own arrangements for travel and accommodation."
 # )