ricdomolm commited on
Commit
171bc11
·
1 Parent(s): b735e73

Add files from e3

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/fast/rolmedo/models/skywork-13b/snapshots/model/",
3
+ "architectures": [
4
+ "SkyworkForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
8
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
9
+ },
10
+ "bos_token_id": 1,
11
+ "eos_token_id": 2,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 4608,
14
+ "initializer_range": 0.01,
15
+ "intermediate_size": 12288,
16
+ "max_position_embeddings": 4096,
17
+ "model_type": "skywork",
18
+ "num_attention_heads": 36,
19
+ "num_hidden_layers": 52,
20
+ "num_key_value_heads": 36,
21
+ "pad_token_id": 0,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.41.1",
29
+ "use_cache": true,
30
+ "vocab_size": 65519
31
+ }
configuration_skywork.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
2
+ # This code is built upon Huggingface's transformers repository.
3
+
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+
9
+ logger = logging.get_logger(__name__)
10
+
11
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
12
+
13
+
14
class SkyworkConfig(PretrainedConfig):
    """Configuration holder for Skywork causal language models.

    Every constructor argument is stored as an attribute of the same name.
    The token ids and `tie_word_embeddings` are forwarded to
    `PretrainedConfig.__init__` together with any extra keyword arguments.

    Args:
        vocab_size: Stored as `self.vocab_size`.
        hidden_size: Stored as `self.hidden_size`.
        intermediate_size: Stored as `self.intermediate_size`.
        num_hidden_layers: Stored as `self.num_hidden_layers`.
        num_attention_heads: Stored as `self.num_attention_heads`.
        num_key_value_heads: Stored as `self.num_key_value_heads`; when
            `None` it falls back to `num_attention_heads`.
        hidden_act: Stored as `self.hidden_act`.
        max_position_embeddings: Stored as `self.max_position_embeddings`.
        initializer_range: Stored as `self.initializer_range`.
        rms_norm_eps: Stored as `self.rms_norm_eps`.
        use_cache: Stored as `self.use_cache`.
        pad_token_id, bos_token_id, eos_token_id: Forwarded to the base class.
        pretraining_tp: Stored as `self.pretraining_tp`.
        tie_word_embeddings: Forwarded to the base class.
        rope_theta: Stored as `self.rope_theta`.
        rope_scaling: `None`, or a two-entry dict with keys `type` and
            `factor`; checked by `_rope_scaling_validation`.

    Raises:
        ValueError: If `rope_scaling` is not `None` and fails validation.
    """

    model_type = "skywork"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: configs without a key/value head count
        # use one key/value head per attention head
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Fail early on a malformed `rope_scaling` before the base class runs.
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: If `rope_scaling` is not a two-entry dict, if its
                `type` is not one of the supported strategies, or if its
                `factor` is not a float strictly greater than 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )

        # Kept in one place so the check and the error message cannot drift apart.
        supported_types = ["linear", "dynamic", "ntk"]
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in supported_types:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of {supported_types}, got {rope_scaling_type}"
            )
        # Integers are rejected on purpose: only `float` instances are accepted.
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "do_sample": true,
4
+ "eos_token_id": 2,
5
+ "max_length": 4096,
6
+ "pad_token_id": 0,
7
+ "temperature": 0.6,
8
+ "top_p": 0.9,
9
+ "transformers_version": "4.41.1"
10
+ }
modeling_skywork.py ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
2
+ # This code is built upon Huggingface's transformers repository.
3
+
4
+ import math
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
12
+
13
+ from transformers.activations import ACT2FN
14
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
15
+ from transformers.modeling_utils import PreTrainedModel
16
+ from transformers.utils import logging
17
+ from .configuration_skywork import SkyworkConfig
18
+
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ _CONFIG_FOR_DOC = "SkyworkConfig"
23
+
24
+
25
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
26
+ def _make_causal_mask(
27
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
28
+ ):
29
+ """
30
+ Make causal mask used for bi-directional self-attention.
31
+ """
32
+ bsz, tgt_len = input_ids_shape
33
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
34
+ mask_cond = torch.arange(mask.size(-1), device=device)
35
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
36
+ mask = mask.to(dtype)
37
+
38
+ if past_key_values_length > 0:
39
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
40
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
41
+
42
+
43
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
44
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
45
+ """
46
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
47
+ """
48
+ bsz, src_len = mask.size()
49
+ tgt_len = tgt_len if tgt_len is not None else src_len
50
+
51
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
52
+
53
+ inverted_mask = 1.0 - expanded_mask
54
+
55
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
56
+
57
+
58
class SkyworkRMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        SkyworkRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: Length of the learned scale vector.
            eps: Constant added to the mean square before the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise over the last dimension in float32, then scale in the input dtype."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(original_dtype)
73
+
74
+
75
class SkyworkRotaryEmbedding(torch.nn.Module):
    """Precomputed cosine/sine tables for rotary position embeddings.

    The tables are cached as non-persistent buffers `cos_cached` and
    `sin_cached` of shape `[1, 1, max_seq_len_cached, dim]` and are rebuilt
    when a longer sequence is requested.

    Args:
        dim: Size of the rotary dimension (the last dimension of the tables).
        max_position_embeddings: Number of positions cached at construction.
        base: Base of the geometric progression of inverse frequencies.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions `0 .. seq_len - 1`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return `(cos, sin)` tables truncated to `seq_len` positions, cast to `x.dtype`.

        Args:
            x: Tensor laid out as `[bs, num_attention_heads, seq_len, head_size]`;
                only its dtype, device and (when `seq_len` is omitted) its
                second-to-last dimension are used.
            seq_len: Number of positions needed. Defaults to `x.shape[-2]`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `None`, so derive the length from `x`
            # instead of failing on the comparison below.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
109
+
110
+
111
class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
    """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Assigned before the parent constructor runs because that constructor
        # already calls `_set_cos_sin_cache`, which reads `scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin tables with position indices divided by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        scaled_positions = positions / self.scaling_factor

        angles = torch.einsum("i,j->ij", scaled_positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        cos_table = table.cos()[None, None, :, :].to(dtype)
        sin_table = table.sin()[None, None, :, :].to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
128
+
129
+
130
class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
    """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Assigned before the parent constructor runs because that constructor
        # already calls `_set_cos_sin_cache`, which reads `scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin tables, enlarging the base once `seq_len` exceeds `max_position_embeddings`."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Recompute the inverse frequencies from a base that grows with the requested length.
            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            base = self.base * growth ** (self.dim / (self.dim - 2))
            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
            inv_freq = 1.0 / (base ** exponents)
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        cos_table = table.cos()[None, None, :, :].to(dtype)
        sin_table = table.sin()[None, None, :, :].to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
154
+
155
+
156
+
157
class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
    """Rotary embedding tables computed from a base multiplied by a fixed `scaling_factor`.

    The tables are cached as non-persistent buffers `cos_cached` and
    `sin_cached` of shape `[1, 1, max_seq_len_cached, dim]` and are rebuilt
    when a longer sequence is requested.

    Args:
        dim: Size of the rotary dimension (the last dimension of the tables).
        max_position_embeddings: Number of positions cached at construction.
        base: Unscaled base; the stored `self.base` is `base * scaling_factor`.
        scaling_factor: Multiplier applied to `base`.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base * scaling_factor
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions `0 .. seq_len - 1`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Each frequency is repeated so the table spans the full `dim`.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return `(cos, sin)` tables truncated to `seq_len` positions, cast to `x.dtype`.

        Args:
            x: Tensor whose dtype and device are used for the tables. When
                `seq_len` is omitted its second-to-last dimension is taken as
                the sequence length (assumes the attention layout
                `[bs, heads, seq_len, head_size]` — confirm against callers).
            seq_len: Number of positions needed. Defaults to `x.shape[-2]`.
        """
        if seq_len is None:
            # The signature advertises `None`, so derive the length from `x`
            # instead of failing on the comparison below.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
188
+
189
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
194
+
195
+
196
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Rotate the query and key tensors using the cos/sin rows selected by `position_ids`.

    Returns:
        The pair `(q_embed, k_embed)`.
    """
    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
    cos_table = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin_table = sin.squeeze(1).squeeze(0)  # [seq_len, dim]

    # Gather one row per position, then add a broadcast axis for the heads.
    cos_rows = cos_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    sin_rows = sin_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]

    def _rotate(states):
        return (states * cos_rows) + (rotate_half(states) * sin_rows)

    return _rotate(q), _rotate(k)
205
+
206
+
207
class SkyworkMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated MLP; when `config.pretraining_tp > 1` the matmuls are done per weight shard."""
        tp_degree = self.config.pretraining_tp
        if not tp_degree > 1:
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        # Split every projection into `tp_degree` shards along the intermediate dimension.
        shard_width = self.intermediate_size // tp_degree
        gate_shards = self.gate_proj.weight.split(shard_width, dim=0)
        up_shards = self.up_proj.weight.split(shard_width, dim=0)
        down_shards = self.down_proj.weight.split(shard_width, dim=1)

        gate_out = torch.cat([F.linear(x, gate_shards[i]) for i in range(tp_degree)], dim=-1)
        up_out = torch.cat([F.linear(x, up_shards[i]) for i in range(tp_degree)], dim=-1)

        inner_shards = (self.act_fn(gate_out) * up_out).split(shard_width, dim=2)
        partial_outputs = [F.linear(inner_shards[i], down_shards[i]) for i in range(tp_degree)]
        # The per-shard down projections are partial sums of the full product.
        return sum(partial_outputs)
239
+
240
+
241
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input goes from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
    When `n_rep` is 1 the input tensor itself is returned.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states

    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
    with_repeat_axis = hidden_states[:, :, None, :, :]
    broadcast = with_repeat_axis.expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return broadcast.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
251
+
252
+
253
class SkyworkAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Queries and keys are rotated with a rotary position embedding chosen from
    `config.rope_scaling`, and key/value heads are repeated to match the
    number of query heads when `num_key_value_heads < num_attention_heads`.
    """

    def __init__(self, config: SkyworkConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self._init_rope()

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling`.

        Raises:
            ValueError: If `rope_scaling["type"]` is not `linear`, `dynamic` or `ntk`.
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = SkyworkRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "ntk":
                self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
            # This runs once per attention layer; use the module logger rather
            # than `print` so callers control verbosity and stdout stays clean.
            logger.info(
                "Using custom Skywork modeling: scaling_type is %s, scaling_factor is %s",
                scaling_type,
                scaling_factor,
            )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `[bsz, seq_len, hidden]` into a contiguous `[bsz, num_heads, seq_len, head_dim]`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run self-attention over `hidden_states`.

        Args:
            hidden_states: Input of shape `(bsz, q_len, hidden_size)`.
            attention_mask: Optional additive mask of shape `(bsz, 1, q_len, kv_seq_len)`.
            position_ids: Indices used to select rows of the rotary cos/sin tables.
            past_key_value: Optional `(key, value)` pair from earlier steps,
                concatenated in front of the new keys/values along dim 2.
            output_attentions: Whether to return the attention probabilities.
            use_cache: Whether to return the updated `(key, value)` pair.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is
            `None` unless `output_attentions`, and `past_key_value` is `None`
            unless `use_cache`.

        Raises:
            ValueError: If the attention scores, the mask, or the attention
                output do not have the expected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            # Perform the projections shard by shard, splitting each weight along its output rows.
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # [bsz, q_len, heads * head_dim] -> [bsz, heads, q_len, head_dim]
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # The rotary tables must cover the cached positions as well as the new ones.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        # The cache stores the keys/values before the heads are repeated.
        past_key_value = (key_states, value_states) if use_cache else None

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # [bsz, heads, q_len, head_dim] -> [bsz, q_len, hidden_size]
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            # Sharded output projection: split along the input dimension and sum the partial results.
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
410
+
411
+
412
class SkyworkDecoderLayer(nn.Module):
    """One decoder block: RMS-norm + self-attention, then RMS-norm + MLP, each with a residual connection."""

    def __init__(self, config: SkyworkConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = SkyworkAttention(config=config)
        self.mlp = SkyworkMLP(config)
        self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): mask of size
                `(batch, 1, tgt_len, src_len)`; padding elements carry very large negative values.
            position_ids (`torch.LongTensor`, *optional*): passed through to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                When true, the attention weights are appended to the returned tuple.
            use_cache (`bool`, *optional*):
                When true, the present key/value states are appended to the returned tuple
                so they can be reused to speed up decoding.

        Returns:
            A tuple starting with the new hidden states, followed by the optional entries above.
        """
        # Self Attention (pre-norm), added back onto the untouched input.
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = hidden_states + attn_output

        # Fully Connected (pre-norm), again with a residual connection.
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        layer_outputs = [hidden_states]
        if output_attentions:
            layer_outputs.append(self_attn_weights)
        if use_cache:
            layer_outputs.append(present_key_value)

        return tuple(layer_outputs)
474
+
475
class SkyworkPreTrainedModel(PreTrainedModel):
    """Base class wiring the Skywork config, weight initialisation and gradient-checkpointing toggle."""

    config_class = SkyworkConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SkyworkDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialise `nn.Linear` and `nn.Embedding` weights from N(0, `initializer_range`).

        Linear biases and the embedding row at `padding_idx` are zeroed; other
        module types are left untouched.
        """
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on `SkyworkModel` instances only."""
        if not isinstance(module, SkyworkModel):
            return
        module.gradient_checkpointing = value
496
+
497
+ class SkyworkModel(SkyworkPreTrainedModel):
498
+ """
499
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
500
+
501
+ Args:
502
+ config: SkyworkConfig
503
+ """
504
+
505
+ def __init__(self, config: SkyworkConfig):
506
+ super().__init__(config)
507
+ self.padding_idx = config.pad_token_id
508
+ self.vocab_size = config.vocab_size
509
+
510
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
511
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
512
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
513
+
514
+ self.gradient_checkpointing = False
515
+ # Initialize weights and apply final processing
516
+ self.post_init()
517
+
518
+ def get_input_embeddings(self):
519
+ return self.embed_tokens
520
+
521
+ def set_input_embeddings(self, value):
522
+ self.embed_tokens = value
523
+
524
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
525
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
526
+ # create causal mask
527
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
528
+ combined_attention_mask = None
529
+ if input_shape[-1] > 1:
530
+ combined_attention_mask = _make_causal_mask(
531
+ input_shape,
532
+ inputs_embeds.dtype,
533
+ device=inputs_embeds.device,
534
+ past_key_values_length=past_key_values_length,
535
+ )
536
+
537
+ if attention_mask is not None:
538
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
539
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
540
+ inputs_embeds.device
541
+ )
542
+ combined_attention_mask = (
543
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
544
+ )
545
+
546
+ return combined_attention_mask
547
+
548
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """Run the decoder stack on token ids or on precomputed embeddings.

    Exactly one of ``input_ids`` and ``inputs_embeds`` must be given.

    Args:
        input_ids: 2-D tensor unpacked as ``(batch_size, seq_length)``.
        attention_mask: Padding mask. When omitted, an all-ones boolean mask
            of shape ``(batch_size, seq_length + past_length)`` is used. It is
            expanded by ``self._prepare_decoder_attention_mask`` before being
            handed to the layers.
        position_ids: Position indices. When omitted they are
            ``arange(past_length, past_length + seq_length)``.
        past_key_values: Per-layer cached states; the cached length is read
            from ``past_key_values[0][0].shape[2]``.
        inputs_embeds: 3-D tensor unpacked as
            ``(batch_size, seq_length, hidden)``; bypasses ``self.embed_tokens``.
        use_cache: Collect each layer's cache output. Forced to ``False``
            while gradient checkpointing is active in training mode.
        output_attentions: Collect each layer's attention output.
        output_hidden_states: Collect the input of every layer plus the
            final normalized hidden state.
        return_dict: Return a ``BaseModelOutputWithPast`` instead of a tuple.

    Returns:
        ``BaseModelOutputWithPast`` when ``return_dict`` is true; otherwise a
        tuple of ``(hidden_states, cache, all_hidden_states, all_attentions)``
        with the entries that are ``None`` left out.

    Raises:
        ValueError: If both or neither of ``input_ids`` / ``inputs_embeds``
            are supplied.
    """
    # Unset flags fall back to the model configuration.
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    seq_length_with_past = seq_length
    past_key_values_length = 0

    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length_with_past + past_key_values_length

    if position_ids is None:
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        # New tokens continue numbering where the cache left off.
        position_ids = torch.arange(
            past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
        )
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
    else:
        position_ids = position_ids.view(-1, seq_length).long()

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    # The default mask spans cached plus new positions.
    if attention_mask is None:
        attention_mask = torch.ones(
            (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
        )
    attention_mask = self._prepare_decoder_attention_mask(
        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
    )

    hidden_states = inputs_embeds

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = past_key_values[idx] if past_key_values is not None else None

        if self.gradient_checkpointing and self.training:

            def create_custom_forward(module, layer_past, with_attentions):
                # Bind the per-layer values as arguments: the checkpointed
                # function may be re-run during backward, after this loop has
                # moved on, so reading the loop variables from the enclosing
                # scope would pick up the last layer's values.
                def custom_forward(*inputs):
                    return module(*inputs, layer_past, with_attentions)

                return custom_forward

            layer_outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(decoder_layer, past_key_value, output_attentions),
                hidden_states,
                attention_mask,
                position_ids,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            # The cache entry follows the attention entry when attentions are returned.
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
673
+
674
+
675
class SkyworkForCausalLM(SkyworkPreTrainedModel):
    """Skywork decoder (``SkyworkModel``) with a bias-free linear language-modeling head."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = SkyworkModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        All arguments except ``labels`` are forwarded unchanged to the wrapped
        decoder.

        Args:
            labels: Target token ids. Logits at position ``t`` are scored
                against the label at position ``t + 1`` using
                ``CrossEntropyLoss``.

        Returns:
            ``CausalLMOutputWithPast`` when ``return_dict`` is true; otherwise
            ``(logits, *decoder_outputs[1:])`` with the loss prepended when it
            was computed. Logits are always returned as float32.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            # Iterate over every slice rather than range(pretraining_tp): when
            # vocab_size is not a multiple of pretraining_tp, split() yields
            # extra remainder chunks that must not be dropped, otherwise the
            # logits come out narrower than the vocabulary.
            logits = [F.linear(hidden_states, weight_slice) for weight_slice in lm_head_slices]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Build the keyword arguments for one generation step.

        With a non-empty cache only the last token id (and last position id)
        is kept. ``inputs_embeds`` is used only when ``past_key_values`` is
        ``None``, i.e. on the first step.

        Returns:
            Dict with ``input_ids`` or ``inputs_embeds``, plus ``position_ids``,
            ``past_key_values``, ``use_cache`` and ``attention_mask``.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            # Masked-out slots get a placeholder position of 1.
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Select rows ``beam_idx`` along dim 0 of every cached tensor.

        Returns a tuple (one entry per layer) of tuples of reordered tensors.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
810
+
811
+
812
class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
    """Skywork decoder with a bias-free linear scoring head applied to one pooled position per sequence."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = SkyworkModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module."""
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        """Score every position, then keep one position per sequence.

        The kept position is the one just before the first ``pad_token_id``
        in ``input_ids``. Index ``-1`` (the last position) is used when no pad
        token id is configured or when only ``inputs_embeds`` is supplied.

        When ``labels`` is given and ``config.problem_type`` is unset, the
        problem type is inferred from ``num_labels`` and the label dtype and
        stored back on the config.

        Raises:
            ValueError: If no pad token id is configured and the batch size
                is not 1.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.score(transformer_outputs[0])

        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Index of the first pad token minus one; a row without any pad
            # token yields argmax 0, hence -1 (the last position).
            first_pad = torch.eq(input_ids, pad_token_id).long().argmax(-1)
            sequence_lengths = (first_pad - 1).to(logits.device)
        else:
            sequence_lengths = -1

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
pytorch_model-00001-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef1cf938cb288dfa292ec7400b9bd5c88dd71a9362ed886265bcff8d5a275158
3
+ size 4963976342
pytorch_model-00002-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96dd9f9c91f6f3c514817ab3039cfcc4ce3fc765e4170aa12dcf0272aef68054
3
+ size 4983048004
pytorch_model-00003-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a59462a5db44c3b5c929fb3af236b02de1bb81d7f24949f5add57d44cf9bc6e4
3
+ size 4968891564
pytorch_model-00004-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9852ba3a22a01b0b9d19481332fc77cc108e1db5b11d340da721976de3fbf586
3
+ size 4940561242
pytorch_model-00005-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a054fbc1e634cd5b3d69581f3d83d178ecf82a004477815c4eb4b7e6ca0aff6f
3
+ size 4983048004
pytorch_model-00006-of-00006.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b972b79222cd0bfb733835f70b11a81fa47d2d16baa8268969b416534e136b9e
3
+ size 2868863134
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 27708226560
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00006-of-00006.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
16
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
17
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
18
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
19
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
20
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
21
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
22
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
23
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
24
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
25
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
26
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
27
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
28
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
29
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
30
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
31
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
32
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
33
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
34
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
35
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
36
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
37
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
38
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
39
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
40
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
41
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
42
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
43
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
44
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
45
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
46
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
47
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
48
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
49
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
50
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
51
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
52
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
53
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
54
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
55
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
56
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
57
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
58
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
59
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
60
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
61
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
62
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
63
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
64
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
65
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
66
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
67
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
68
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
69
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
70
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
71
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
72
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
73
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
74
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
75
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
76
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
77
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
78
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
79
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
80
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
81
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
82
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
83
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
84
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
85
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
86
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
87
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
88
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
89
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
90
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
91
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
92
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
93
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
94
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
95
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
96
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
97
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
98
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
99
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
100
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
101
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
102
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
103
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
104
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
105
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
106
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
107
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
108
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
109
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
110
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
111
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
112
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
113
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
114
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
115
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
116
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
117
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
118
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
119
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
120
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
121
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
122
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
123
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
124
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
125
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
126
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
127
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
128
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
129
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
130
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
131
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
132
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
133
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
134
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
135
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
136
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
137
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
138
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
139
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
140
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
141
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
142
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
143
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
144
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
145
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
146
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
147
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
148
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
149
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
150
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
151
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
152
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
153
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
154
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
155
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
156
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
157
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
158
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
159
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
160
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
161
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
162
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
163
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
164
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
165
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
166
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
167
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
168
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
169
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
170
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
171
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
172
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
173
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
174
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
175
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
176
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
177
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
178
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
179
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
180
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
181
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
182
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
183
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
184
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
185
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
186
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
187
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
188
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
189
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
190
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
191
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
192
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
193
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
194
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
195
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
196
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
197
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
198
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
199
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
200
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
201
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
202
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
203
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
204
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
205
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
206
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
207
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
208
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
209
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
210
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
211
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
212
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
213
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
214
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
215
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
216
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
217
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
218
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
219
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
220
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
221
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
222
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
223
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
224
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
225
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
226
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
227
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
228
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
229
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
230
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
231
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
232
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
233
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
234
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
235
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
236
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
237
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
238
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
239
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
240
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
241
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
242
+ "model.layers.32.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
243
+ "model.layers.32.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
244
+ "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
245
+ "model.layers.32.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
246
+ "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
247
+ "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
248
+ "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
249
+ "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
250
+ "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
251
+ "model.layers.33.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
252
+ "model.layers.33.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
253
+ "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
254
+ "model.layers.33.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
255
+ "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
256
+ "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
257
+ "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
258
+ "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
259
+ "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
260
+ "model.layers.34.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
261
+ "model.layers.34.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
262
+ "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
263
+ "model.layers.34.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
264
+ "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
265
+ "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
266
+ "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
267
+ "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
268
+ "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
269
+ "model.layers.35.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
270
+ "model.layers.35.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
271
+ "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
272
+ "model.layers.35.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
273
+ "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
274
+ "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
275
+ "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
276
+ "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
277
+ "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
278
+ "model.layers.36.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
279
+ "model.layers.36.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
280
+ "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
281
+ "model.layers.36.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
282
+ "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
283
+ "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
284
+ "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
285
+ "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
286
+ "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
287
+ "model.layers.37.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
288
+ "model.layers.37.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
289
+ "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
290
+ "model.layers.37.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
291
+ "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
292
+ "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
293
+ "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
294
+ "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
295
+ "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
296
+ "model.layers.38.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
297
+ "model.layers.38.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
298
+ "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
299
+ "model.layers.38.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
300
+ "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
301
+ "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
302
+ "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
303
+ "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
304
+ "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
305
+ "model.layers.39.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
306
+ "model.layers.39.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
307
+ "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
308
+ "model.layers.39.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
309
+ "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
310
+ "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
311
+ "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
312
+ "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
313
+ "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
314
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
315
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
316
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
317
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
318
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
319
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
320
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
321
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
322
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
323
+ "model.layers.40.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
324
+ "model.layers.40.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
325
+ "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
326
+ "model.layers.40.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
327
+ "model.layers.40.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
328
+ "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
329
+ "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
330
+ "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
331
+ "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
332
+ "model.layers.41.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
333
+ "model.layers.41.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
334
+ "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
335
+ "model.layers.41.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
336
+ "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
337
+ "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
338
+ "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
339
+ "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
340
+ "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
341
+ "model.layers.42.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
342
+ "model.layers.42.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
343
+ "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
344
+ "model.layers.42.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
345
+ "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
346
+ "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
347
+ "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
348
+ "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
349
+ "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
350
+ "model.layers.43.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
351
+ "model.layers.43.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
352
+ "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
353
+ "model.layers.43.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
354
+ "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
355
+ "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
356
+ "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
357
+ "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
358
+ "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
359
+ "model.layers.44.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
360
+ "model.layers.44.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
361
+ "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
362
+ "model.layers.44.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
363
+ "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
364
+ "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
365
+ "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
366
+ "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
367
+ "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
368
+ "model.layers.45.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
369
+ "model.layers.45.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
370
+ "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
371
+ "model.layers.45.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
372
+ "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
373
+ "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
374
+ "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
375
+ "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
376
+ "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
377
+ "model.layers.46.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
378
+ "model.layers.46.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
379
+ "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
380
+ "model.layers.46.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
381
+ "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
382
+ "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
383
+ "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
384
+ "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
385
+ "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
386
+ "model.layers.47.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
387
+ "model.layers.47.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
388
+ "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
389
+ "model.layers.47.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
390
+ "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
391
+ "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
392
+ "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
393
+ "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
394
+ "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
395
+ "model.layers.48.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
396
+ "model.layers.48.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
397
+ "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin",
398
+ "model.layers.48.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
399
+ "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
400
+ "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin",
401
+ "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin",
402
+ "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin",
403
+ "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin",
404
+ "model.layers.49.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
405
+ "model.layers.49.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
406
+ "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin",
407
+ "model.layers.49.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
408
+ "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
409
+ "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin",
410
+ "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin",
411
+ "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin",
412
+ "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin",
413
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
414
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
415
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
416
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
417
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
418
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
419
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
420
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
421
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
422
+ "model.layers.50.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
423
+ "model.layers.50.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
424
+ "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin",
425
+ "model.layers.50.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
426
+ "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
427
+ "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin",
428
+ "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin",
429
+ "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin",
430
+ "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin",
431
+ "model.layers.51.input_layernorm.weight": "pytorch_model-00006-of-00006.bin",
432
+ "model.layers.51.mlp.down_proj.weight": "pytorch_model-00006-of-00006.bin",
433
+ "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00006-of-00006.bin",
434
+ "model.layers.51.mlp.up_proj.weight": "pytorch_model-00006-of-00006.bin",
435
+ "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00006-of-00006.bin",
436
+ "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00006-of-00006.bin",
437
+ "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00006-of-00006.bin",
438
+ "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00006-of-00006.bin",
439
+ "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00006-of-00006.bin",
440
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
441
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
442
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
443
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
444
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
445
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
446
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
447
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
448
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
449
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
450
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
451
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
452
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
453
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
454
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
455
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
456
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
457
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
458
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
459
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
460
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
461
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
462
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
463
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
464
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
465
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
466
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
467
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
468
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
469
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
470
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
471
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
472
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
473
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
474
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
475
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
476
+ "model.norm.weight": "pytorch_model-00006-of-00006.bin"
477
+ }
478
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenization_skywork.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
2
+ # This code is built upon Huggingface's transformers repository.
3
+
4
+ """Tokenization classes for Skywork."""
5
+ import os
6
+ from shutil import copyfile
7
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
8
+
9
+ import sentencepiece as spm
10
+
11
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
12
+ from transformers.utils import logging
13
+
14
+ if TYPE_CHECKING:
15
+ from transformers.pipelines.conversational import Conversation
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
20
+
21
+
22
+ SPIECE_UNDERLINE = "▁"
23
+
24
+ B_INST, E_INST = "[INST]", "[/INST]"
25
+ B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
26
+
27
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
28
+ that your responses are socially unbiased and positive in nature.
29
+
30
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
31
+
32
class SkyworkTokenizer(PreTrainedTokenizer):
    """SentencePiece-backed slow tokenizer for Skywork models.

    The SentencePiece model is loaded from ``vocab_file`` and used for
    piece <-> id conversion. Depending on ``add_bos_token`` /
    ``add_eos_token`` a BOS / EOS id is wrapped around every encoded sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        legacy=True,
        **kwargs,
    ):
        """Load the SentencePiece model and register the special tokens.

        Args:
            vocab_file: Path to the SentencePiece model file.
            unk_token: Unknown token (string or ``AddedToken``).
            bos_token: Beginning-of-sequence token (string or ``AddedToken``).
            eos_token: End-of-sequence token (string or ``AddedToken``).
            pad_token: Padding token, or ``None`` for no padding token.
            sp_model_kwargs: Keyword arguments forwarded to
                ``SentencePieceProcessor``; ``None`` means no extra arguments.
            add_bos_token: Whether sequences are prefixed with the BOS id.
            add_eos_token: Whether sequences are suffixed with the EOS id.
            clean_up_tokenization_spaces: Forwarded to the base class.
            legacy: When true, ``tokenize``/``_tokenize`` skip the
                SPIECE_UNDERLINE prefix handling (a warning is logged).
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        # Plain strings are wrapped so that no whitespace is stripped around them.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.legacy = legacy
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded before calling the base constructor.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            legacy=legacy,
            **kwargs,
        )
        if legacy:
            logger.warning_once(
                f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
            )

    def __getstate__(self):
        """Return a picklable state: the processor is replaced by its serialized proto."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        """Restore the instance and rebuild the processor from the serialized proto."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return a token -> id dict covering the base vocabulary and added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
    def tokenize(self, text, **kwargs) -> List[str]:
        """Tokenize ``text`` into a list of string tokens.

        In non-legacy mode every SPIECE_UNDERLINE in the input is replaced by
        a space and a single SPIECE_UNDERLINE is prepended, so the marker only
        appears at the very beginning of the text.
        """
        if not self.legacy:
            text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
        return super().tokenize(text, **kwargs)

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
    def _tokenize(self, text):
        """Split ``text`` into SentencePiece pieces.

        In legacy mode the text is encoded as is. Otherwise a leading
        SPIECE_UNDERLINE is dropped before encoding, and for text that neither
        carried that marker nor starts with a space, the SPIECE_UNDERLINE at
        the start of the first piece is removed.
        """
        if self.legacy:
            return self.sp_model.encode(text, out_type=str)

        is_first = text.startswith(SPIECE_UNDERLINE)
        if is_first:
            text = text[1:]

        tokens = self.sp_model.encode(text, out_type=str)

        # `tokens` may be empty (e.g. empty text); guard before indexing.
        if tokens and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
            tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
        return tokens

    def _convert_token_to_id(self, token):
        """Convert a token (str) to its id using the SentencePiece vocabulary."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Convert an id (int) to its token (str) using the SentencePiece vocabulary."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of tokens into a single string.

        Special tokens are emitted verbatim instead of being passed to the
        SentencePiece decoder; the ordinary pieces collected before each
        special token are decoded as one group.
        """
        # Hoisted: `all_special_tokens` is a property, no need to rebuild it per token.
        special_tokens = set(self.all_special_tokens)
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            if token in special_tokens:
                # A space separates a special token from preceding ordinary text.
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the SentencePiece model file into ``save_directory``.

        The original file is copied when it still exists on disk; otherwise
        the in-memory model is serialized.

        Returns:
            A 1-tuple with the path of the written file. If ``save_directory``
            is not a directory, an error is logged and ``None`` is returned.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with the configured BOS / EOS ids."""
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        The mask mirrors the layout produced by
        ``build_inputs_with_special_tokens``. When
        ``already_has_special_tokens`` is true the base-class implementation
        is used instead.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # These are mask values (1 == special token), not token ids.
        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []

        mask = bos_mask + ([0] * len(token_ids_0)) + eos_mask
        if token_ids_1 is not None:
            mask = mask + bos_mask + ([0] * len(token_ids_1)) + eos_mask
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return token type ids: 0 for the first sequence, 1 for the second.

        The BOS / EOS positions added around each sequence receive the type
        id of the sequence they wrap.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        """Encode a conversation using the ``[INST] ... [/INST]`` prompt format.

        Each completed (user, assistant) exchange is encoded as
        ``BOS + "[INST] user [/INST] answer " + EOS``; the final user message
        is encoded as ``BOS + "[INST] user [/INST]"``.

        Raises:
            ValueError: If the conversation is empty, does not strictly
                alternate user/assistant starting with the user, or does not
                end with a user message.
        """
        # Each entry is an (is_user, text) tuple.
        dialogue = list(conversation.iter_texts())
        if not dialogue:
            raise ValueError("Cannot build input ids from an empty conversation")
        if not all(is_user for is_user, _ in dialogue[::2]) or not all(
            not is_user for is_user, _ in dialogue[1::2]
        ):
            raise ValueError(
                "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
            )

        dialog_tokens: List[int] = []
        if len(conversation.past_user_inputs) > 0:
            # NOTE(review): this updates the conversation object only; `dialogue`
            # was materialised above, so the default system prompt does not appear
            # to reach the ids encoded in this call -- confirm whether intended.
            if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
                conversation.past_user_inputs[0] = (
                    B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
                )
        elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
            dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])

        # Completed exchanges: pair every user turn with the following answer.
        for prompt, answer in zip(dialogue[::2], dialogue[1::2]):
            dialog_tokens.append(self.bos_token_id)
            dialog_tokens.extend(
                self.encode(
                    f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
                )
            )
            dialog_tokens.append(self.eos_token_id)

        if not dialogue[-1][0]:
            # Entries are tuples, so a non-user last entry is an assistant turn.
            raise ValueError("Last message must be from user, got assistant")
        dialog_tokens += [self.bos_token_id] + self.encode(
            f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
        )
        return dialog_tokens
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
3
+ size 994250
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenization_skywork.SkyworkTokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "legacy": true,
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "</s>",
42
+ "sp_model_kwargs": {},
43
+ "tokenizer_class": "SkyworkTokenizer",
44
+ "unk_token": "<unk>"
45
+ }