PyTorch · English · Chinese · plm · custom_code
jjw0126 committed (verified) · Commit be0bde5 · 1 Parent(s): 3707598

Upload folder using huggingface_hub
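For context, the commit message above is the default one produced by `huggingface_hub`'s folder upload. A minimal sketch of how such a commit is typically created is shown below; the local folder path and repo id are placeholders for illustration, not taken from this commit.

```python
# Hedged sketch: producing a commit titled "Upload folder using huggingface_hub".
# The folder path and repo id below are placeholders for illustration only.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./plm_model_files",   # placeholder local folder with the model/config files
    repo_id="PLM-Team/PLM-1.8B-Base",  # placeholder repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```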
Files changed (4):
  1. .gitattributes +36 -0
  2. README.md +3 -0
  3. configuration_plm.py +27 -36
  4. modeling_plm.py +20 -41
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: mit
+ ---
configuration_plm.py CHANGED
@@ -13,7 +13,6 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  """PLM model configuration"""
- # Test test
  from transformers.configuration_utils import PretrainedConfig
  from transformers.utils import logging

@@ -23,28 +22,27 @@ logger = logging.get_logger(__name__)

  class PLMConfig(PretrainedConfig):
  r"""
- This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
- Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
- with the defaults will yield a similar configuration to that of
- Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+ This is the configuration class to store the configuration of a [`PLMModel`]. It is used to instantiate a
+ PLM model according to the specified arguments, defining the model architecture.

  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
- documentation from [`PretrainedConfig`] for more information.
+ documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the PLM model.


  Args:
  vocab_size (`int`, *optional*, defaults to 151936):
- Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
- `inputs_ids` passed when calling [`Qwen2Model`]
+ Vocabulary size of the PLM model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`PLMModel`]
  hidden_size (`int`, *optional*, defaults to 4096):
  Dimension of the hidden representations.
- intermediate_size (`int`, *optional*, defaults to 22016):
+ intermediate_size (`int`, *optional*, defaults to 8192):
  Dimension of the MLP representations.
  num_hidden_layers (`int`, *optional*, defaults to 32):
  Number of hidden layers in the Transformer encoder.
- num_attention_heads (`int`, *optional*, defaults to 32):
+ num_attention_heads (`int`, *optional*, defaults to 16):
  Number of attention heads for each attention layer in the Transformer encoder.
- num_key_value_heads (`int`, *optional*, defaults to 32):
+ num_key_value_heads (`int`, *optional*, defaults to 16):
  This is the number of key_value heads that should be used to implement Grouped Query Attention. If
  `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
  `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -53,7 +51,12 @@ class PLMConfig(PretrainedConfig):
  paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
  hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
  The non-linear activation function (function or string) in the decoder.
- max_position_embeddings (`int`, *optional*, defaults to 32768):
+ pretraining_tp (`int`, *optional*, defaults to 1):
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+ issue](https://github.com/pytorch/pytorch/issues/76232).
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
  The maximum sequence length that this model might ever be used with.
  initializer_range (`float`, *optional*, defaults to 0.02):
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -62,27 +65,25 @@ class PLMConfig(PretrainedConfig):
  use_cache (`bool`, *optional*, defaults to `True`):
  Whether or not the model should return the last key/values attentions (not used by all models). Only
  relevant if `config.is_decoder=True`.
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
  Whether the model's input and output word embeddings should be tied.
- rope_theta (`float`, *optional*, defaults to 10000.0):
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports normal rope.
+ rope_theta (`float`, *optional*, defaults to 100000.0):
  The base period of the RoPE embeddings.
- use_sliding_window (`bool`, *optional*, defaults to `False`):
- Whether to use sliding window attention.
- sliding_window (`int`, *optional*, defaults to 4096):
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
- max_window_layers (`int`, *optional*, defaults to 28):
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
  attention_dropout (`float`, *optional*, defaults to 0.0):
  The dropout ratio for the attention probabilities.

  ```python
- >>> from transformers import Qwen2Model, Qwen2Config
+ >>> from transformers import PLMModel, PLMConfig

- >>> # Initializing a Qwen2 style configuration
- >>> configuration = Qwen2Config()
+ >>> # Initializing a PLM style configuration
+ >>> configuration = PLMConfig()

- >>> # Initializing a model from the Qwen2-7B style configuration
- >>> model = Qwen2Model(configuration)
+ >>> # Initializing a model from the PLM style configuration
+ >>> model = PLMModel(configuration)

  >>> # Accessing the model configuration
  >>> configuration = model.config
@@ -111,12 +112,10 @@ class PLMConfig(PretrainedConfig):
  use_cache=True,
  pretraining_tp=1,
  tie_word_embeddings=True,
- rope_theta=10000.0,
+ rope_theta=100000.0,
  rope_scaling=None,
  attention_bias=False,
  attention_dropout=0.0,
- use_sliding_window=False,
- sliding_window=4096,
  **kwargs,
  ):
  self.vocab_size = vocab_size
@@ -145,14 +144,6 @@ class PLMConfig(PretrainedConfig):
  self.attention_bias = attention_bias
  self.attention_dropout = attention_dropout

- self.use_sliding_window = use_sliding_window
- self.sliding_window = sliding_window
-
- # for backward compatibility
- if num_key_value_heads is None:
- num_key_value_heads = num_attention_heads
- self.attn_implementation = "flash_attention_2"
-
  super().__init__(
  tie_word_embeddings=tie_word_embeddings,
  **kwargs,
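Since `configuration_plm.py` ships as custom code in this repository (note the `custom_code` tag), a hedged sketch of loading and inspecting the configuration is shown below. The repo id is the one that appears in the test snippet removed from `modeling_plm.py`, and `trust_remote_code=True` is assumed to be required because `PLMConfig` is not part of `transformers` itself.

```python
# Minimal sketch, not part of the commit: load the custom PLMConfig and check the
# values this commit documents (rope_theta=100000.0, tie_word_embeddings=True, ...).
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "PLM-Team/PLM-1.8B-Base",  # repo id taken from the removed test code below
    trust_remote_code=True,    # assumed necessary: PLMConfig lives in this repo
)

print(config.rope_theta)           # expected 100000.0 per the updated default
print(config.tie_word_embeddings)  # expected True
print(config.num_key_value_heads)  # < num_attention_heads would mean GQA is used
```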
modeling_plm.py CHANGED
@@ -2,7 +2,7 @@
  # Copyright 2024 The PLM team and The HuggingFace Inc. All rights reserved.
  #
  # This code is based on Alibaba's Qwen2 library, DeepSeek-AI's deepseekv2
- # libraryEleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
+ # library, EleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
  # in this library. It has been modified from its original forms to accommodate
  # minor architectural differences compared to GPT-NeoX and OPT used by the Meta
  # AI team that trained the model.
@@ -253,7 +253,7 @@ class PLMAttention(nn.Module):
  if self.q_lora_rank is None:
  self.q_proj = nn.Linear(
  self.hidden_size, self.num_heads * self.q_head_dim, bias=False
- ) # 2048 16 192
+ )
  else:
  self.q_a_proj = nn.Linear(
  self.hidden_size, config.q_lora_rank, bias=config.attention_bias
@@ -267,7 +267,7 @@ class PLMAttention(nn.Module):
  self.hidden_size,
  config.kv_lora_rank + config.qk_rope_head_dim,
  bias=config.attention_bias,
- ) # 2048 512 64
+ )
  self.kv_a_layernorm = PLMRMSNorm(config.kv_lora_rank)
  self.kv_b_proj = nn.Linear(
  config.kv_lora_rank,
@@ -275,7 +275,6 @@ class PLMAttention(nn.Module):
  * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
  bias=False,
  )
-
  self.o_proj = nn.Linear(
  self.num_heads * self.v_head_dim,
  self.hidden_size,
@@ -287,11 +286,14 @@ class PLMAttention(nn.Module):


  def _init_rope(self):
- self.rotary_emb = PLMRotaryEmbedding(
- self.qk_rope_head_dim,
- max_position_embeddings=self.max_position_embeddings,
- base=self.rope_theta,
- )
+ if self.config.rope_scaling is None:
+ self.rotary_emb = PLMRotaryEmbedding(
+ self.qk_rope_head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Currently do not support other RoPE scaling type")


  def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
@@ -318,28 +320,27 @@ class PLMAttention(nn.Module):
  bsz, q_len, _ = hidden_states.size()

  if self.q_lora_rank is None:
- q = self.q_proj(hidden_states) # 9,2048 -> 3072, 16 * 192
+ q = self.q_proj(hidden_states)
  else:
  q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
- q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)#[1, 16, 9, 192])
+ q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
  q_nope, q_pe = torch.split(
  q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
- )# [1, 16, 9, 128] [1, 16, 9, 64]
+ )

  compressed_kv = self.kv_a_proj_with_mqa(hidden_states) # 1 9 576
  compressed_kv, k_pe = torch.split(
  compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
- )# 512 64
- k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)# [1, 1, 9, 64])
+ )
+ k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
  kv = (
  self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
  .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
  .transpose(1, 2)
  )
- # 1 16 9 256
  k_nope, value_states = torch.split(
  kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
- ) # 1 16 9 128, 1 16 9 128
+ )
  kv_seq_len = value_states.shape[-2]
  if past_key_value is not None:
  if self.layer_idx is None:
@@ -353,7 +354,7 @@ class PLMAttention(nn.Module):

  q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

- query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)# ([1, 16, 9, 192])
+ query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
  query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
  query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

@@ -361,7 +362,7 @@ class PLMAttention(nn.Module):
  key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
  key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
  if past_key_value is not None:
- cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
+ cache_kwargs = {"sin": sin, "cos": cos}
  key_states, value_states = past_key_value.update(
  key_states, value_states, self.layer_idx, cache_kwargs
  )
@@ -1457,26 +1458,4 @@ class PLMForTokenClassification(PLMPreTrainedModel):
  logits=logits,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
- )
-
-
- # if __name__=="__main__":
- # from IPython import embed
- # from transformers import Qwen2Tokenizer
- # import light_hf_proxy
- # tokenizer = Qwen2Tokenizer.from_pretrained("PLM-Team/PLM-1.8B-Base")
- # config = PLMConfig.from_pretrained("PLM-Team/PLM-1.8B-Base/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
- # model = PLMForCausalLM(config).to(torch.bfloat16).to("cuda:7")
- # input_ids = tokenizer(
- # "Thanks to the generous support from SIGMOD EC, we will provide scholarship awards to selected students attending the WSDM 2024 conference. For awardees attending in-person, the grant will cover the cost of registration + some travel expenses. The awards will be competitive in the sense that not every student will receive a Travel Award. Each awardee will receive a bursary to partially cover the expense to attend the conference in-person. Awardees are expected to register for the main conference using a free-registration code provided with the award notification email and will have to make their own arrangements for travel and accommodation.Awardees are expected to register for the main conference and will have to make their own arrangements for travel and accommodation."
- # )
- # sample = torch.tensor([input_ids["input_ids"]]).to("cuda:7") # (1,L)
-
- # # Step 4: Forward pass through the model
- # with torch.no_grad():
- # outputs = model(sample)
-
- # # Optionally, inspect the outputs
- # print(outputs)
-
- # embed()
+ )
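The shape comments stripped from `PLMAttention.forward` above (e.g. `[1, 16, 9, 192]`, `[1, 16, 9, 128] [1, 16, 9, 64]`) describe how the query is split into a position-free part and a RoPE part and then reassembled into `q_head_dim = qk_nope_head_dim + qk_rope_head_dim`. A standalone sketch of that bookkeeping, using those same illustrative sizes (they are not read from the checkpoint), is:

```python
# Standalone sketch of the decoupled-RoPE query handling in PLMAttention.forward.
# Sizes mirror the shape comments removed by this commit and are illustrative only.
import torch

bsz, q_len, num_heads = 1, 9, 16
qk_nope_head_dim, qk_rope_head_dim = 128, 64
q_head_dim = qk_nope_head_dim + qk_rope_head_dim  # 192

# q_proj output, reshaped to (bsz, num_heads, q_len, q_head_dim)
q = torch.randn(bsz, q_len, num_heads, q_head_dim).transpose(1, 2)

# Split into the part that carries no positional signal and the part RoPE rotates.
q_nope, q_pe = torch.split(q, [qk_nope_head_dim, qk_rope_head_dim], dim=-1)
assert q_nope.shape == (bsz, num_heads, q_len, qk_nope_head_dim)  # [1, 16, 9, 128]
assert q_pe.shape == (bsz, num_heads, q_len, qk_rope_head_dim)    # [1, 16, 9, 64]

# ... apply_rotary_pos_emb would rotate q_pe (and the shared single-head k_pe) here ...

# Reassemble exactly as the forward pass does: nope features first, rotated part last.
query_states = q.new_empty(bsz, num_heads, q_len, q_head_dim)
query_states[:, :, :, :qk_nope_head_dim] = q_nope
query_states[:, :, :, qk_nope_head_dim:] = q_pe
print(query_states.shape)  # torch.Size([1, 16, 9, 192])
```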