jiguanglizipao committed
Commit · df3cdcf · 1 Parent(s): e94d96f
init
Browse files
- .gitattributes +3 -0
- config.json +27 -0
- generation_config.json +6 -0
- model-00001-of-00003.safetensors +3 -0
- model-00002-of-00003.safetensors +3 -0
- model-00003-of-00003.safetensors +3 -0
- model.safetensors.index.json +374 -0
- pytorch_model.bin.index.json +290 -0
- special_tokens_map.json +24 -0
- tokenization_utils_base.py +0 -0
- tokenization_utils_fast.py +764 -0
- tokenizer.json +0 -0
- tokenizer_config.json +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
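These three added patterns route the new weight shards through Git LFS, alongside the archive patterns already tracked. As a rough illustration (not part of the commit), the Python sketch below approximates how such patterns match paths; git's real gitattributes matcher has more rules, but fnmatch is close enough for these flat patterns:

from fnmatch import fnmatch

# Patterns copied from the hunk above; the matching logic is an illustrative
# approximation of gitattributes semantics, not git's actual matcher.
LFS_PATTERNS = [
    "*.zip",
    "*.zst",
    "*tfevents*",
    "model-00001-of-00003.safetensors",
    "model-00002-of-00003.safetensors",
    "model-00003-of-00003.safetensors",
]

def is_lfs_tracked(path: str) -> bool:
    return any(fnmatch(path, pattern) for pattern in LFS_PATTERNS)

assert is_lfs_tracked("model-00002-of-00003.safetensors")
assert not is_lfs_tracked("config.json")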
config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 13696,
+  "max_position_embeddings": 32768,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_dim": 64,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.35.0.dev0",
+  "use_cache": true,
+  "vocab_size": 65024
+}
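This config describes a 28-layer Llama-architecture model in float16: hidden size 4096, 32 attention heads with only 2 key/value heads (grouped-query attention), per-projection biases enabled via "attention_bias": true, a 65024-token vocabulary, and 32k max positions. A minimal, hypothetical loading sketch with Hugging Face transformers; "path/to/this-repo" is a placeholder for a local clone of this repository, not a name from the commit:

from transformers import AutoConfig, AutoModelForCausalLM

# "path/to/this-repo" is a placeholder for a local clone of this repository.
config = AutoConfig.from_pretrained("path/to/this-repo")
assert config.model_type == "llama" and config.num_hidden_layers == 28

# device_map="auto" assumes the `accelerate` package is installed; it spreads
# the ~12.5 GB of float16 shards across the available devices.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/this-repo", torch_dtype="auto", device_map="auto"
)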
generation_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.35.0.dev0"
+}
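The generation config only pins the special-token ids; "_from_model_config": true marks it as derived from config.json, and decoding settings fall back to generate() defaults. A small sketch, again with a placeholder path:

from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("path/to/this-repo")
# Only bos/eos are pinned here; sampling parameters use generate() defaults.
assert gen_config.bos_token_id == 1 and gen_config.eos_token_id == 2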
model-00001-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:340b7a561c594c6eea307cfbc3be052a25f090d065118a13ff85604b54fbc186
+size 4907706896
model-00002-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:911e19e8ebcfb4dc2dc114567855d8f7f4dcf3c08704db04106befaec684f7f7
+size 4895175976
model-00003-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:615e66cf5157be2677a6dc5060ee009120b923ca58ac6426ab6ddb44eadbea2a
+size 2684556024
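Each of the three .safetensors entries above is a Git LFS pointer rather than the weight blob itself: oid is the SHA-256 of the real file and size its byte count (about 4.9 GB, 4.9 GB, and 2.7 GB respectively). A sketch, with hypothetical paths, of parsing a pointer and verifying a downloaded shard against it:

import hashlib
import os

def parse_lfs_pointer(path: str) -> tuple[str, int]:
    # A pointer is three "key value" lines: version, "oid sha256:<hex>", size.
    with open(path) as f:
        fields = dict(line.split(" ", 1) for line in f.read().splitlines())
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

def verify_shard(pointer_path: str, blob_path: str) -> bool:
    expected_oid, expected_size = parse_lfs_pointer(pointer_path)
    if os.path.getsize(blob_path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid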
model.safetensors.index.json
ADDED
@@ -0,0 +1,374 @@
+{
+  "metadata": {
+    "total_size": 12487397376
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}
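The weight_map assigns every tensor to one of the three shards, and total_size (12,487,397,376 bytes, roughly 12.5 GB) is the combined payload a loader should expect. A sketch of resolving a single tensor by hand, assuming the shards have been materialized locally and the safetensors and torch packages are installed:

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # -> "model-00001-of-00003.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)  # reads only this tensor, not the whole shard
print(tensor.shape, tensor.dtype)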
pytorch_model.bin.index.json
ADDED
@@ -0,0 +1,290 @@
+{
+  "metadata": {
+    "total_size": 12486913536
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
+    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
+    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
|
260 |
+
"model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
|
261 |
+
"model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
|
262 |
+
"model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
263 |
+
"model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
|
264 |
+
"model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
|
265 |
+
"model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
|
266 |
+
"model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
|
267 |
+
"model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
|
268 |
+
"model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
269 |
+
"model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
|
270 |
+
"model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
|
271 |
+
"model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
|
272 |
+
"model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
273 |
+
"model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
|
274 |
+
"model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
|
275 |
+
"model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
|
276 |
+
"model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
|
277 |
+
"model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
|
278 |
+
"model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
279 |
+
"model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
|
280 |
+
"model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
|
281 |
+
"model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
|
282 |
+
"model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
|
283 |
+
"model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
|
284 |
+
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
|
285 |
+
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
|
286 |
+
"model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
|
287 |
+
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
|
288 |
+
"model.norm.weight": "pytorch_model-00002-of-00002.bin"
|
289 |
+
}
|
290 |
+
}
|
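Note: the weight_map above is the lookup table a loader uses to resolve each parameter name to the shard file that stores it. A minimal sketch in Python of reading it, assuming a local copy of pytorch_model.bin.index.json in the working directory:

import json
from collections import defaultdict

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Group parameter names by shard, mirroring how a sharded checkpoint
# is loaded one shard file at a time.
shard_to_params = defaultdict(list)
for name, shard in index["weight_map"].items():
    shard_to_params[shard].append(name)

for shard, names in sorted(shard_to_params.items()):
    print(f"{shard}: {len(names)} tensors")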
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<unk>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
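Note: each structured entry above is deserialized into an AddedToken, whose lstrip/rstrip/normalized/single_word flags control how the literal string is matched in text, while pad_token is a bare string aliasing <unk>. A small, purely illustrative sketch of the equivalent construction:

from transformers import AddedToken

bos_token = AddedToken("<s>", lstrip=False, normalized=False, rstrip=False, single_word=False)
eos_token = AddedToken("</s>", lstrip=False, normalized=False, rstrip=False, single_word=False)
unk_token = AddedToken("<unk>", lstrip=False, normalized=False, rstrip=False, single_word=False)
pad_token = "<unk>"  # padding deliberately reuses the unknown token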
tokenization_utils_base.py
ADDED
The diff for this file is too large to render; see the raw diff.
tokenization_utils_fast.py
ADDED
@@ -0,0 +1,764 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""
import copy
import json
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers.tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from transformers.utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Slow tokenizers have an additional added tokens files
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None
    can_save_slow_tokenizer: bool = True

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from tokenizers which let us directly build the backend
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer is not None:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif self.slow_tokenizer_class is not None:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        # We call this after having initialized the backend tokenizer because we update it.
        super().__init__(**kwargs)

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> Dict[str, int]:
        return self.get_vocab()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
        full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
        added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
        return added_vocab

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding
        # Set truncation and padding on the backend tokenizer
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # _truncation might contain more keys that the target `transformers`
            # supports. Use only the target keys to trigger `enable_truncation`.
            # This should enable this code to works on various `tokenizers`
            # targets.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy and restore the initial configuration
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
        )

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert encoding to dict
        # `Tokens` has type: Tuple[
        #    List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
        #    List[EncodingFast]
        # ]
        # with nested dimensions corresponding to batch, overflows, sequence length
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
        # (we say ~ because the number of overflow varies with the example in the batch)
        #
        # To match each overflowing sample with the original sample in the batch
        # we add an overflow_to_sample_mapping array (see below)
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # If returning overflowing tokens, we need to return a mapping
        # from the batch idx to the original sample
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # Return tensor is None, then we can remove the leading batch axis
        # Overflowing tokens are returned as a batch of output so we keep them in this case
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.backend_tokenizer.decoder.decode(tokens)

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            added_vocab = self.get_added_vocab()
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (uses IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove vocab
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # Trainer needs to know the end of word / continuing subword thingies in BPE
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel":
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map pad/cls/mask token at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
            if getattr(self, f"_{token}") is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = getattr(self, f"_{token}")
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
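Note: the class above mirrors the stock transformers PreTrainedTokenizerFast, so the same interface can be exercised with the stock class; a minimal sketch, assuming the bundled tokenizer.json sits in the working directory:

from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # hypothetical local path to the file added below
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<unk>",
)

enc = tok("Hello world", return_offsets_mapping=True)
print(enc["input_ids"])       # ids produced by the Rust backend
print(enc["offset_mapping"])  # per-token character spans
print(tok.decode(enc["input_ids"]))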
tokenizer.json
ADDED
The diff for this file is too large to render; see the raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,12 @@
{
  "name_or_path": "THUDM/chatglm2-6b",
  "remove_space": false,
  "do_lower_case": false,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "auto_map": {
    "AutoTokenizer": [
      null,
      "tokenization_utils_fast.PreTrainedTokenizerFast"
    ]
  }
}
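Note: the auto_map entry is what routes AutoTokenizer to the fast tokenizer class shipped in this repo (the null slot is the absent slow-tokenizer class). A minimal loading sketch, with the repo id left as a placeholder:

from transformers import AutoTokenizer

# "<this-repo>" stands in for this repository's Hub id or a local clone;
# trust_remote_code lets transformers import the bundled
# tokenization_utils_fast.py named in auto_map.
tok = AutoTokenizer.from_pretrained("<this-repo>", trust_remote_code=True)
print(type(tok).__name__)  # the repo-local PreTrainedTokenizerFast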