Rename modeling_edgellm.py to modeling_plm.py
modeling_edgellm.py → modeling_plm.py
RENAMED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2024 The
+# Copyright 2024 The PLM team and The HuggingFace Inc. All rights reserved.
 #
 # This code is based on Alibaba's Qwen2 library, DeepSeek-AI's deepseekv2
 # libraryEleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch Edgellm model."""
+"""PyTorch PLM model."""
 
 import inspect
 import math
@@ -53,7 +53,7 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from .configuration_edgellm import EdgellmConfig
+from .configuration_plm import PLMConfig
 
 
 if is_flash_attn_2_available():
@@ -66,8 +66,8 @@ if is_flash_attn_2_available():
 logger = logging.get_logger(__name__)
 
 
-_CHECKPOINT_FOR_DOC = "
-_CONFIG_FOR_DOC = "EdgellmConfig"
+_CHECKPOINT_FOR_DOC = "PLM/PLM-1.8B-base"
+_CONFIG_FOR_DOC = "PLMConfig"
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -82,17 +82,12 @@ def _get_unpad_data(attention_mask):
         max_seqlen_in_batch,
     )
 
-class IdentityOperation(nn.Module):
-    def __init__(self):
-        super(IdentityOperation, self).__init__()
 
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Edgellm
-class EdgellmRMSNorm(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->PLM
+class PLMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        EdgellmRMSNorm is equivalent to T5LayerNorm
+        PLMRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -107,8 +102,8 @@ class EdgellmRMSNorm(nn.Module):
         return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Edgellm
-class EdgellmRotaryEmbedding(nn.Module):
+# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->PLM
+class PLMRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=4096, base=100000, device=None):
         super().__init__()
         self.dim = dim
@@ -150,8 +145,8 @@ class EdgellmRotaryEmbedding(nn.Module):
         )
 
 
-class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+class PLMLinearScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -178,9 +173,9 @@ class EdgellmLinearScalingRotaryEmbedding(EdgellmRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Edgellm
-class EdgellmDynamicNTKScalingRotaryEmbedding(EdgellmRotaryEmbedding):
-    """EdgellmRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->PLM
+class PLMDynamicNTKScalingRotaryEmbedding(PLMRotaryEmbedding):
+    """PLMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -254,7 +249,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class EdgellmYarnRotaryEmbedding(EdgellmRotaryEmbedding):
+class PLMYarnRotaryEmbedding(PLMRotaryEmbedding):
 
     def __init__(
         self,
@@ -366,7 +361,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class EdgellmMLP(nn.Module):
+class PLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -396,9 +391,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py
-# DeepseekV2Attention with DeepseekV2->Edgellm
+# DeepseekV2Attention with DeepseekV2->PLM
 
-class EdgellmAttention(nn.Module):
+class PLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config, layer_idx: Optional[int] = None):
@@ -424,8 +419,6 @@ class EdgellmAttention(nn.Module):
         self.v_head_dim = config.v_head_dim
         self.qk_nope_head_dim = config.qk_nope_head_dim
         self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
-        self.attn_in = IdentityOperation()
-        self.attn_out = IdentityOperation()
 
         self.is_causal = True
 
@@ -437,7 +430,7 @@ class EdgellmAttention(nn.Module):
         self.q_a_proj = nn.Linear(
            self.hidden_size, config.q_lora_rank, bias=config.attention_bias
        )
-        self.q_a_layernorm = EdgellmRMSNorm(config.q_lora_rank)
+        self.q_a_layernorm = PLMRMSNorm(config.q_lora_rank)
        self.q_b_proj = nn.Linear(
            config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
        )
@@ -447,27 +440,27 @@ class EdgellmAttention(nn.Module):
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        ) # 2048 512 64
-        self.kv_a_layernorm = EdgellmRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = PLMRMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
-        )
-
+        )
+
        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
-        )
+        )
        self._init_rope()
 
-        self.softmax_scale = self.q_head_dim ** (-0.5)
+        self.softmax_scale = self.q_head_dim ** (-0.5)
 
 
    def _init_rope(self):
        if self.config.rope_scaling is None:
-            self.rotary_emb = EdgellmRotaryEmbedding(
+            self.rotary_emb = PLMRotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
@@ -623,7 +616,7 @@ class EdgellmAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class EdgellmFlashAttention2(EdgellmAttention):
+class PLMFlashAttention2(PLMAttention):
    """
    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
@@ -747,7 +740,7 @@ class EdgellmFlashAttention2(EdgellmAttention):
         query_states = query_states.to(target_dtype)
         key_states = key_states.to(target_dtype)
         value_states = value_states.to(target_dtype)
-
+
        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
@@ -763,11 +756,9 @@ class EdgellmFlashAttention2(EdgellmAttention):
         attn_output = attn_output.reshape(
            bsz, q_len, self.num_heads * self.v_head_dim
        ).contiguous()
-
-        # breakpoint()
+
        attn_output = self.o_proj(attn_output)
-
-        # breakpoint()
+
        if not output_attentions:
            attn_weights = None
 
@@ -898,14 +889,14 @@ class EdgellmFlashAttention2(EdgellmAttention):
         (cu_seqlens_q, cu_seqlens_k),
         (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
     )
-Edgellm_ATTENTION_CLASSES = {
-    "eager": EdgellmAttention,
-    "flash_attention_2": EdgellmFlashAttention2,
+PLM_ATTENTION_CLASSES = {
+    "eager": PLMAttention,
+    "flash_attention_2": PLMFlashAttention2,
 }
 
 
-class EdgellmDecoderLayer(nn.Module):
-    def __init__(self, config: EdgellmConfig, layer_idx: int):
+class PLMDecoderLayer(nn.Module):
+    def __init__(self, config: PLMConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
 
@@ -914,10 +905,10 @@ class EdgellmDecoderLayer(nn.Module):
             f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
             "unexpected results may be encountered."
         )
-        self.self_attn = Edgellm_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        self.mlp = EdgellmMLP(config)
-        self.input_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = PLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = PLMMLP(config)
+        self.input_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
    def forward(
        self,
@@ -982,7 +973,7 @@ class EdgellmDecoderLayer(nn.Module):
         return outputs
 
 
-Edgellm_START_DOCSTRING = r"""
+PLM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
@@ -992,7 +983,7 @@ Edgellm_START_DOCSTRING = r"""
     and behavior.
 
    Parameters:
-        config ([`EdgellmConfig`]):
+        config ([`PLMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1000,14 +991,14 @@ Edgellm_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmPreTrainedModel(PreTrainedModel):
-    config_class = EdgellmConfig
+class PLMPreTrainedModel(PreTrainedModel):
+    config_class = PLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
-    _no_split_modules = ["EdgellmDecoderLayer"]
+    _no_split_modules = ["PLMDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True
@@ -1024,7 +1015,7 @@ class EdgellmPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-Edgellm_INPUTS_DOCSTRING = r"""
+PLM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1099,28 +1090,28 @@ Edgellm_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Edgellm Model outputting raw hidden-states without any specific head on top.",
-    Edgellm_START_DOCSTRING,
+    "The bare PLM Model outputting raw hidden-states without any specific head on top.",
+    PLM_START_DOCSTRING,
 )
-class EdgellmModel(EdgellmPreTrainedModel):
+class PLMModel(PLMPreTrainedModel):
    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EdgellmDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PLMDecoderLayer`]
 
    Args:
-        config: EdgellmConfig
+        config: PLMConfig
    """
 
-    def __init__(self, config: EdgellmConfig):
+    def __init__(self, config: PLMConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
 
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
-            [EdgellmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [PLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
-        self.norm = EdgellmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = PLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
@@ -1132,7 +1123,7 @@ class EdgellmModel(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
@@ -1267,12 +1258,12 @@ class EdgellmModel(EdgellmPreTrainedModel):
         )
 
 
-class EdgellmForCausalLM(EdgellmPreTrainedModel):
+class PLMForCausalLM(PLMPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
 
    def __init__(self, config):
        super().__init__(config)
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1297,7 +1288,7 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
@@ -1325,9 +1316,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
         Example:
 
        ```python
-        >>> from transformers import AutoTokenizer, EdgellmForCausalLM
+        >>> from transformers import AutoTokenizer, PLMForCausalLM
 
-        >>> model = EdgellmForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = PLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1473,9 +1464,9 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
 
 @add_start_docstrings(
    """
-    The Edgellm Model transformer with a sequence classification head on top (linear layer).
+    The PLM Model transformer with a sequence classification head on top (linear layer).
 
-    [`EdgellmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`PLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.
 
    Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1484,13 +1475,13 @@ class EdgellmForCausalLM(EdgellmPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
+class PLMForSequenceClassification(PLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
        # Initialize weights and apply final processing
@@ -1502,7 +1493,7 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
@@ -1596,17 +1587,17 @@ class EdgellmForSequenceClassification(EdgellmPreTrainedModel):
 
 @add_start_docstrings(
    """
-    The Edgellm Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    The PLM Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
-    Edgellm_START_DOCSTRING,
+    PLM_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Edgellm
-class EdgellmForTokenClassification(EdgellmPreTrainedModel):
+# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->PLM, LLAMA->PLM
+class PLMForTokenClassification(PLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
-        self.model = EdgellmModel(config)
+        self.model = PLMModel(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
@@ -1625,7 +1616,7 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Edgellm_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(PLM_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
@@ -1683,9 +1674,9 @@ class EdgellmForTokenClassification(EdgellmPreTrainedModel):
 # from IPython import embed
 # from transformers import Qwen2Tokenizer
 # import light_hf_proxy
-# tokenizer = Qwen2Tokenizer.from_pretrained("
-# config =
-# model =
+# tokenizer = Qwen2Tokenizer.from_pretrained("PLM-Team/PLM-1.8B-Base")
+# config = PLMConfig.from_pretrained("PLM-Team/PLM-1.8B-Base/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+# model = PLMForCausalLM(config).to(torch.bfloat16).to("cuda:7")
 # input_ids = tokenizer(
 #     "Thanks to the generous support from SIGMOD EC, we will provide scholarship awards to selected students attending the WSDM 2024 conference. For awardees attending in-person, the grant will cover the cost of registration + some travel expenses. The awards will be competitive in the sense that not every student will receive a Travel Award. Each awardee will receive a bursary to partially cover the expense to attend the conference in-person. Awardees are expected to register for the main conference using a free-registration code provided with the award notification email and will have to make their own arrangements for travel and accommodation.Awardees are expected to register for the main conference and will have to make their own arrangements for travel and accommodation."
 # )