Upload folder using huggingface_hub

Files changed:
- .gitattributes +36 -0
- README.md +3 -0
- configuration_plm.py +27 -36
- modeling_plm.py +20 -41
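The commit title indicates the files were pushed with `huggingface_hub`. For reference, a minimal sketch of how such a commit is typically created; the local folder path is hypothetical and the repo id is only inferred from the test snippet removed in modeling_plm.py below, so both may differ from this repository.

```python
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./plm-upload",            # hypothetical local directory with the files above
    repo_id="PLM-Team/PLM-1.8B-Base",      # inferred from the removed test code; replace as needed
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```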
.gitattributes
ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,3 @@
+---
+license: mit
+---
configuration_plm.py
CHANGED
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """PLM model configuration"""
-# Test test
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -23,28 +22,27 @@ logger = logging.get_logger(__name__)
 
 class PLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`
-
-    with the defaults will yield a similar configuration to that of
-    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+    This is the configuration class to store the configuration of a [`PLMModel`]. It is used to instantiate a
+    PLM model according to the specified arguments, defining the model architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the PLM model.
 
 
     Args:
         vocab_size (`int`, *optional*, defaults to 151936):
-            Vocabulary size of the
-            `inputs_ids` passed when calling [`
+            Vocabulary size of the PLM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`PLMModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to
+        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to
+        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*, defaults to
+        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -53,7 +51,12 @@ class PLMConfig(PretrainedConfig):
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
-
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -62,27 +65,25 @@ class PLMConfig(PretrainedConfig):
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied.
-
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports normal rope.
+        rope_theta (`float`, *optional*, defaults to 100000.0):
            The base period of the RoPE embeddings.
-
-            Whether to use
-        sliding_window (`int`, *optional*, defaults to 4096):
-            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-        max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import
+    >>> from transformers import PLMModel, PLMConfig
 
-    >>> # Initializing a
-    >>> configuration =
+    >>> # Initializing a PLM style configuration
+    >>> configuration = PLMConfig()
 
-    >>> # Initializing a model from the
-    >>> model =
+    >>> # Initializing a model from the PLM style configuration
+    >>> model = PLMModel(configuration)
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
@@ -111,12 +112,10 @@ class PLMConfig(PretrainedConfig):
        use_cache=True,
        pretraining_tp=1,
        tie_word_embeddings=True,
-        rope_theta=
+        rope_theta=100000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
-        use_sliding_window=False,
-        sliding_window=4096,
        **kwargs,
    ):
        self.vocab_size = vocab_size
@@ -145,14 +144,6 @@ class PLMConfig(PretrainedConfig):
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
 
-        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.attn_implementation = "flash_attention_2"
-
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
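For context, a minimal sketch of exercising the updated configuration; it assumes `configuration_plm.py` from this commit is importable (for example, sitting in the working directory), and the values in the comments are the defaults set in this diff.

```python
# Minimal sketch, assuming configuration_plm.py from this commit is on the Python path.
from configuration_plm import PLMConfig

config = PLMConfig()               # uses the defaults set in this diff,
                                   # e.g. rope_theta=100000.0, intermediate_size=8192
print(config.tie_word_embeddings)  # True
print(config.attention_dropout)    # 0.0
```

When the files are consumed directly from the Hub, the same configuration would normally be loaded with `AutoConfig.from_pretrained(repo_id, trust_remote_code=True)`, provided the repo's config.json registers the custom classes.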
modeling_plm.py
CHANGED
@@ -2,7 +2,7 @@
 # Copyright 2024 The PLM team and The HuggingFace Inc. All rights reserved.
 #
 # This code is based on Alibaba's Qwen2 library, DeepSeek-AI's deepseekv2
-#
+# library, EleutherAI's GPT-NeoX library and the GPT-NeoX and OPT implementations
 # in this library. It has been modified from its original forms to accommodate
 # minor architectural differences compared to GPT-NeoX and OPT used by the Meta
 # AI team that trained the model.
@@ -253,7 +253,7 @@ class PLMAttention(nn.Module):
         if self.q_lora_rank is None:
             self.q_proj = nn.Linear(
                 self.hidden_size, self.num_heads * self.q_head_dim, bias=False
-            )
+            )
         else:
             self.q_a_proj = nn.Linear(
                 self.hidden_size, config.q_lora_rank, bias=config.attention_bias
@@ -267,7 +267,7 @@ class PLMAttention(nn.Module):
             self.hidden_size,
             config.kv_lora_rank + config.qk_rope_head_dim,
             bias=config.attention_bias,
-        )
+        )
         self.kv_a_layernorm = PLMRMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
@@ -275,7 +275,6 @@ class PLMAttention(nn.Module):
             * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
             bias=False,
         )
-
         self.o_proj = nn.Linear(
             self.num_heads * self.v_head_dim,
             self.hidden_size,
@@ -287,11 +286,14 @@ class PLMAttention(nn.Module):
 
 
     def _init_rope(self):
-        self.
-        self.
-
-
-
+        if self.config.rope_scaling is None:
+            self.rotary_emb = PLMRotaryEmbedding(
+                self.qk_rope_head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            raise ValueError(f"Currently do not support other RoPE scaling type")
 
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
@@ -318,28 +320,27 @@ class PLMAttention(nn.Module):
         bsz, q_len, _ = hidden_states.size()
 
         if self.q_lora_rank is None:
-            q = self.q_proj(hidden_states)
+            q = self.q_proj(hidden_states)
         else:
             q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
         q_nope, q_pe = torch.split(
             q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
-        )
+        )
 
         compressed_kv = self.kv_a_proj_with_mqa(hidden_states)  # 1 9 576
         compressed_kv, k_pe = torch.split(
             compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
-        )
-        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+        )
+        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
         kv = (
             self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
             .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
             .transpose(1, 2)
         )
-        # 1 16 9 256
         k_nope, value_states = torch.split(
             kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
-        )
+        )
         kv_seq_len = value_states.shape[-2]
         if past_key_value is not None:
             if self.layer_idx is None:
@@ -353,7 +354,7 @@ class PLMAttention(nn.Module):
 
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
 
-        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
         query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
         query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
 
@@ -361,7 +362,7 @@ class PLMAttention(nn.Module):
         key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
         key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
         if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}
+            cache_kwargs = {"sin": sin, "cos": cos}
             key_states, value_states = past_key_value.update(
                 key_states, value_states, self.layer_idx, cache_kwargs
             )
@@ -1457,26 +1458,4 @@ class PLMForTokenClassification(PLMPreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
-
-
-# if __name__=="__main__":
-#     from IPython import embed
-#     from transformers import Qwen2Tokenizer
-#     import light_hf_proxy
-#     tokenizer = Qwen2Tokenizer.from_pretrained("PLM-Team/PLM-1.8B-Base")
-#     config = PLMConfig.from_pretrained("PLM-Team/PLM-1.8B-Base/config.json" ,attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-#     model = PLMForCausalLM(config).to(torch.bfloat16).to("cuda:7")
-#     input_ids = tokenizer(
-#         "Thanks to the generous support from SIGMOD EC, we will provide scholarship awards to selected students attending the WSDM 2024 conference. For awardees attending in-person, the grant will cover the cost of registration + some travel expenses. The awards will be competitive in the sense that not every student will receive a Travel Award. Each awardee will receive a bursary to partially cover the expense to attend the conference in-person. Awardees are expected to register for the main conference using a free-registration code provided with the award notification email and will have to make their own arrangements for travel and accommodation.Awardees are expected to register for the main conference and will have to make their own arrangements for travel and accommodation."
-#     )
-#     sample = torch.tensor([input_ids["input_ids"]]).to("cuda:7")  # (1,L)
-
-#     # Step 4: Forward pass through the model
-#     with torch.no_grad():
-#         outputs = model(sample)
-
-#     # Optionally, inspect the outputs
-#     print(outputs)
-
-#     embed()
+        )
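As an aside, a small self-contained sketch of the query assembly pattern used in `PLMAttention.forward` above: the projected query is split into a non-rotary ("nope") part and a RoPE part, and both halves are then packed back into one tensor. The head-dimension values are assumptions chosen only to match the removed `# 1 16 9 256` shape comment, not values read from the model.

```python
import torch

# Hypothetical sizes: 16 heads and a 192 + 64 head-dim split reproduce the
# 1 x 16 x 9 x 256 shape mentioned in the removed "# 1 16 9 256" comment.
bsz, q_len, num_heads = 1, 9, 16
qk_nope_head_dim, qk_rope_head_dim = 192, 64
q_head_dim = qk_nope_head_dim + qk_rope_head_dim

# Stand-in for self.q_proj(hidden_states), reshaped as in the diff.
q = torch.randn(bsz, q_len, num_heads * q_head_dim)
q = q.view(bsz, q_len, num_heads, q_head_dim).transpose(1, 2)

# Split into the part left untouched and the part that receives rotary embeddings.
q_nope, q_pe = torch.split(q, [qk_nope_head_dim, qk_rope_head_dim], dim=-1)

# After RoPE is applied to q_pe, both halves are written back into a single tensor,
# mirroring the query_states assembly in PLMAttention.forward.
query_states = q_pe.new_empty(bsz, num_heads, q_len, q_head_dim)
query_states[:, :, :, :qk_nope_head_dim] = q_nope
query_states[:, :, :, qk_nope_head_dim:] = q_pe

print(query_states.shape)  # torch.Size([1, 16, 9, 256])
```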