jiguanglizipao committed
Commit df3cdcf · 1 Parent(s): e94d96f
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+ model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
+ model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 13696,
+   "max_position_embeddings": 32768,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_dim": 64,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.35.0.dev0",
+   "use_cache": true,
+   "vocab_size": 65024
+ }
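The config above describes a LlamaForCausalLM with grouped-query attention (32 query heads sharing 2 key/value heads) and attention biases enabled. A minimal loading sketch, assuming the checkpoint is reachable at a local path or repo id; "path/to/this-repo" is a placeholder, not a name from this commit:

# Hedged sketch: load this checkpoint with transformers >= 4.35.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/this-repo")      # parses config.json
print(config.num_hidden_layers, config.num_key_value_heads)   # 28, 2

model = AutoModelForCausalLM.from_pretrained(
    "path/to/this-repo",
    torch_dtype=torch.float16,  # matches "torch_dtype" in config.json
)

Note that "rope_dim" is not a standard LlamaConfig field; transformers keeps unknown keys on the config object, but the stock Llama modeling code does not consume them, so custom modeling code may be needed if this checkpoint expects partial rotary embeddings.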
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.35.0.dev0"
+ }
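This file carries only the defaults that generate() falls back to. A hedged sketch of how they surface at runtime (placeholder path again):

# Hedged sketch: the generation defaults resolve through GenerationConfig.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("path/to/this-repo")
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)  # 1, 2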
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:340b7a561c594c6eea307cfbc3be052a25f090d065118a13ff85604b54fbc186
+ size 4907706896
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:911e19e8ebcfb4dc2dc114567855d8f7f4dcf3c08704db04106befaec684f7f7
+ size 4895175976
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:615e66cf5157be2677a6dc5060ee009120b923ca58ac6426ab6ddb44eadbea2a
+ size 2684556024
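Each of the three files above is a Git LFS pointer: the repo stores only the object id (the SHA-256 of the full file) and its size in bytes, while the shards themselves live in LFS storage. A small verification sketch using the values from the third pointer; it is pure standard library and assumes the shard has already been downloaded into the working directory:

# Hedged sketch: check a downloaded shard against its LFS pointer.
import hashlib
import os

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True iff the file's size and SHA-256 match the pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

print(verify_lfs_object(
    "model-00003-of-00003.safetensors",
    "615e66cf5157be2677a6dc5060ee009120b923ca58ac6426ab6ddb44eadbea2a",
    2684556024,
))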
model.safetensors.index.json ADDED
@@ -0,0 +1,374 @@
+ {
+   "metadata": {
+     "total_size": 12487397376
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00003-of-00003.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.o_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.o_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.o_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.norm.weight": "model-00003-of-00003.safetensors"
+   }
+ }
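This index is what lets a loader open only the shard that holds a given tensor: "metadata.total_size" is the tensor byte count across all three shards, and "weight_map" maps every parameter name to its shard file. A lookup sketch using the safetensors library (the tensor name is taken from the map above; framework="pt" assumes PyTorch):

# Hedged sketch: resolve one tensor through the index without loading
# the other two shards.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.22.mlp.down_proj.weight"
shard = index["weight_map"][name]   # -> "model-00003-of-00003.safetensors"

with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)     # reads just this tensor from the shard
print(shard, tuple(tensor.shape))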
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 12486913536
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
19
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
22
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
23
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
25
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
28
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
29
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
30
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
31
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
32
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
33
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
37
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
38
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
39
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
40
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
41
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
42
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
43
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
47
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
48
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
49
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
51
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
55
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
56
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
57
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
58
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
59
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
60
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
64
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
65
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
67
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
218
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
219
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
220
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
222
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
223
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
224
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
225
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
227
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
228
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
229
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
230
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
231
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
232
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
233
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
234
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
235
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
236
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
237
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
238
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
239
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
240
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
241
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
243
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
244
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
246
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
247
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
248
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
249
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
251
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
252
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
253
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
254
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
255
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
256
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
257
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
259
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
260
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
262
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
263
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
264
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
265
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
266
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
267
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
268
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
269
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
270
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
271
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
272
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
273
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
274
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
275
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
276
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
277
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
279
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
280
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
281
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
282
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
283
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
284
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
285
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
286
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
287
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
288
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin"
289
+ }
290
+ }
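
Aside on the index file: the "weight_map" above is what lets `transformers` resolve a sharded checkpoint, mapping every parameter name to the shard that stores it, so each shard only needs to be opened once. A minimal sketch of that lookup, assuming the index is saved as pytorch_model.bin.index.json next to its shards (the file name and the printout are illustrative, not part of this commit):

import json
from collections import defaultdict

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Invert the mapping: shard file -> parameter names stored in that shard.
shard_to_params = defaultdict(list)
for param, shard in index["weight_map"].items():
    shard_to_params[shard].append(param)

# Each shard would then be loaded once (e.g. torch.load for .bin shards)
# and its tensors copied into the model's state dict.
for shard, params in sorted(shard_to_params.items()):
    print(f"{shard}: {len(params)} tensors")
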
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
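
Note that `pad_token` above is a bare string reusing "<unk>", while the other entries are full token specifications with the lstrip/rstrip/normalized/single_word flags all disabled. A small sketch of inspecting the file directly (the path assumes the repo has been downloaded locally):

import json

with open("special_tokens_map.json") as f:
    special = json.load(f)

print(special["pad_token"])             # "<unk>": padding reuses the unk token
print(special["bos_token"]["content"])  # "<s>"
print(special["eos_token"]["content"])  # "</s>"
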
tokenization_utils_base.py ADDED
The diff for this file is too large to render. See raw diff
 
tokenization_utils_fast.py ADDED
@@ -0,0 +1,764 @@
+ # coding=utf-8
+ # Copyright 2020 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (Python) tokenizers,
+ see tokenization_utils.py
+ """
+ import copy
+ import json
+ import os
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import tokenizers.pre_tokenizers as pre_tokenizers_fast
+ from tokenizers import Encoding as EncodingFast
+ from tokenizers import Tokenizer as TokenizerFast
+ from tokenizers.decoders import Decoder as DecoderFast
+ from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
+
+ from transformers.convert_slow_tokenizer import convert_slow_tokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from .tokenization_utils_base import (
+ INIT_TOKENIZER_DOCSTRING,
+ AddedToken,
+ BatchEncoding,
+ PreTokenizedInput,
+ PreTokenizedInputPair,
+ PreTrainedTokenizerBase,
+ SpecialTokensMixin,
+ TextInput,
+ TextInputPair,
+ TruncationStrategy,
+ )
+ from transformers.utils import PaddingStrategy, add_end_docstrings, logging
+
+
+ logger = logging.get_logger(__name__)
+
+ # Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
+ TOKENIZER_FILE = "tokenizer.json"
+ SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
+ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
+
+ # Slow tokenizers have an additional added-tokens file
+ ADDED_TOKENS_FILE = "added_tokens.json"
+
+ INIT_TOKENIZER_DOCSTRING += """
+ tokenizer_object ([`tokenizers.Tokenizer`]):
+ A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
+ tokenizers](../fast_tokenizers) for more information.
+ tokenizer_file ([`str`]):
+ A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
+ tokenizers.
+ """
+
+ MODEL_TO_TRAINER_MAPPING = {
+ "BPE": BpeTrainer,
+ "Unigram": UnigramTrainer,
+ "WordLevel": WordLevelTrainer,
+ "WordPiece": WordPieceTrainer,
+ }
+
+ VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}
+
+
+ @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
+ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
+ """
+ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
+
+ Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
+
+ Handles all the shared methods for tokenization and special tokens, as well as methods for
+ downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.
+
+ This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
+ specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ slow_tokenizer_class: PreTrainedTokenizer = None
+ can_save_slow_tokenizer: bool = True
+
+ def __init__(self, *args, **kwargs):
+ tokenizer_object = kwargs.pop("tokenizer_object", None)
+ slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
+ fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
+ from_slow = kwargs.pop("from_slow", False)
+ if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+ raise ValueError(
+ "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+ "have sentencepiece installed."
+ )
+
+ if tokenizer_object is not None:
+ fast_tokenizer = copy.deepcopy(tokenizer_object)
+ elif fast_tokenizer_file is not None and not from_slow:
+ # We have a serialization from tokenizers which lets us directly build the backend
+ fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+ elif slow_tokenizer is not None:
+ # We need to convert a slow tokenizer to build the backend
+ fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
+ elif self.slow_tokenizer_class is not None:
+ # We need to create and convert a slow tokenizer to build the backend
+ slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
+ fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
+ else:
+ raise ValueError(
+ "Couldn't instantiate the backend tokenizer from one of: \n"
+ "(1) a `tokenizers` library serialization file, \n"
+ "(2) a slow tokenizer instance to convert or \n"
+ "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
+ "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
+ )
+
+ self._tokenizer = fast_tokenizer
+
+ if slow_tokenizer is not None:
+ kwargs.update(slow_tokenizer.init_kwargs)
+
+ self._decode_use_source_tokenizer = False
+
+ # We call this after having initialized the backend tokenizer because we update it.
+ super().__init__(**kwargs)
+
+ @property
+ def is_fast(self) -> bool:
+ return True
+
+ @property
+ def vocab_size(self) -> int:
+ """
+ `int`: Size of the base vocabulary (without the added tokens).
+ """
+ return self._tokenizer.get_vocab_size(with_added_tokens=False)
+
+ def get_vocab(self) -> Dict[str, int]:
+ return self._tokenizer.get_vocab(with_added_tokens=True)
+
+ @property
+ def vocab(self) -> Dict[str, int]:
+ return self.get_vocab()
+
+ def get_added_vocab(self) -> Dict[str, int]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of token to index.
+
+ Returns:
+ `Dict[str, int]`: The added tokens.
+ """
+ base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
+ full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
+ added_vocab = {tok: index for tok, index in full_vocab.items() if tok not in base_vocab}
+ return added_vocab
+
+ def __len__(self) -> int:
+ """
+ Size of the full vocabulary with the added tokens.
+ """
+ return self._tokenizer.get_vocab_size(with_added_tokens=True)
+
+ @property
+ def backend_tokenizer(self) -> TokenizerFast:
+ """
+ `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
+ """
+ return self._tokenizer
+
+ @property
+ def decoder(self) -> DecoderFast:
+ """
+ `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
+ """
+ return self._tokenizer.decoder
+
+ def _convert_encoding(
+ self,
+ encoding: EncodingFast,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
+ """
+ Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
+ of encodings, taking care of building a batch from overflowing tokens.
+
+ Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
+ lists (overflows) of lists (tokens).
+
+ Output shape: (overflows, sequence length)
+ """
+ if return_token_type_ids is None:
+ return_token_type_ids = "token_type_ids" in self.model_input_names
+ if return_attention_mask is None:
+ return_attention_mask = "attention_mask" in self.model_input_names
+
+ if return_overflowing_tokens and encoding.overflowing is not None:
+ encodings = [encoding] + encoding.overflowing
+ else:
+ encodings = [encoding]
+
+ encoding_dict = defaultdict(list)
+ for e in encodings:
+ encoding_dict["input_ids"].append(e.ids)
+
+ if return_token_type_ids:
+ encoding_dict["token_type_ids"].append(e.type_ids)
+ if return_attention_mask:
+ encoding_dict["attention_mask"].append(e.attention_mask)
+ if return_special_tokens_mask:
+ encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
+ if return_offsets_mapping:
+ encoding_dict["offset_mapping"].append(e.offsets)
+ if return_length:
+ encoding_dict["length"].append(len(e.ids))
+
+ return encoding_dict, encodings
+
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+ """
+ Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
+ vocabulary.
+
+ Args:
+ tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
+
+ Returns:
+ `int` or `List[int]`: The token id or list of token ids.
+ """
+ if tokens is None:
+ return None
+
+ if isinstance(tokens, str):
+ return self._convert_token_to_id_with_added_voc(tokens)
+
+ return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
+
+ def _convert_token_to_id_with_added_voc(self, token: str) -> int:
+ index = self._tokenizer.token_to_id(token)
+ if index is None:
+ return self.unk_token_id
+ return index
+
+ def _convert_id_to_token(self, index: int) -> Optional[str]:
+ return self._tokenizer.id_to_token(int(index))
+
+ def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
+ if special_tokens:
+ return self._tokenizer.add_special_tokens(new_tokens)
+
+ return self._tokenizer.add_tokens(new_tokens)
+
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
+ """
+ Returns the number of added tokens when encoding a sequence with special tokens.
+
+ <Tip>
+
+ This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
+ this inside your training loop.
+
+ </Tip>
+
+ Args:
+ pair (`bool`, *optional*, defaults to `False`):
+ Whether the number of added tokens should be computed in the case of a sequence pair or a single
+ sequence.
+
+ Returns:
+ `int`: Number of special tokens added to sequences.
+ """
+ return self._tokenizer.num_special_tokens_to_add(pair)
+
+ def convert_ids_to_tokens(
+ self, ids: Union[int, List[int]], skip_special_tokens: bool = False
+ ) -> Union[str, List[str]]:
+ """
+ Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and
+ added tokens.
+
+ Args:
+ ids (`int` or `List[int]`):
+ The token id (or token ids) to convert to tokens.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+
+ Returns:
+ `str` or `List[str]`: The decoded token(s).
+ """
+ if isinstance(ids, int):
+ return self._tokenizer.id_to_token(ids)
+ tokens = []
+ for index in ids:
+ index = int(index)
+ if skip_special_tokens and index in self.all_special_ids:
+ continue
+ tokens.append(self._tokenizer.id_to_token(index))
+ return tokens
+
+ def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
+ return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
+
+ def set_truncation_and_padding(
+ self,
+ padding_strategy: PaddingStrategy,
+ truncation_strategy: TruncationStrategy,
+ max_length: int,
+ stride: int,
+ pad_to_multiple_of: Optional[int],
+ ):
+ """
+ Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
+ library) and restore the tokenizer settings afterwards.
+
+ The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
+ padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
+ section.
+
+ Args:
+ padding_strategy ([`~utils.PaddingStrategy`]):
+ The kind of padding that will be applied to the input.
+ truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
+ The kind of truncation that will be applied to the input.
+ max_length (`int`):
+ The maximum size of a sequence.
+ stride (`int`):
+ The stride to use when handling overflow.
+ pad_to_multiple_of (`int`, *optional*):
+ If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
+ the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+ """
+ _truncation = self._tokenizer.truncation
+ _padding = self._tokenizer.padding
+ # Set truncation and padding on the backend tokenizer
+ if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
+ if _truncation is not None:
+ self._tokenizer.no_truncation()
+ else:
+ target = {
+ "max_length": max_length,
+ "stride": stride,
+ "strategy": truncation_strategy.value,
+ "direction": self.truncation_side,
+ }
+
+ # _truncation might contain more keys than the target `transformers`
+ # supports. Use only the target keys to trigger `enable_truncation`.
+ # This should enable this code to work on various `tokenizers`
+ # targets.
+ if _truncation is None:
+ current = None
+ else:
+ current = {k: _truncation.get(k, None) for k in target}
+
+ if current != target:
+ self._tokenizer.enable_truncation(**target)
+
+ if padding_strategy == PaddingStrategy.DO_NOT_PAD:
+ if _padding is not None:
+ self._tokenizer.no_padding()
+ else:
+ length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
+ target = {
+ "length": length,
+ "direction": self.padding_side,
+ "pad_id": self.pad_token_id,
+ "pad_token": self.pad_token,
+ "pad_type_id": self.pad_token_type_id,
+ "pad_to_multiple_of": pad_to_multiple_of,
+ }
+ if _padding != target:
+ self._tokenizer.enable_padding(**target)
+
+ def _batch_encode_plus(
+ self,
+ batch_text_or_text_pairs: Union[
+ List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
+ ],
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ is_split_into_words: bool = False,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[str] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ ) -> BatchEncoding:
+ if not isinstance(batch_text_or_text_pairs, (tuple, list)):
+ raise TypeError(
+ f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
+ )
+
+ # Set the truncation and padding strategy and restore the initial configuration
+ self.set_truncation_and_padding(
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ )
+
+ encodings = self._tokenizer.encode_batch(
+ batch_text_or_text_pairs,
+ add_special_tokens=add_special_tokens,
+ is_pretokenized=is_split_into_words,
+ )
+
+ # Convert encoding to dict
+ # `Tokens` has type: Tuple[
+ # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
+ # List[EncodingFast]
+ # ]
+ # with nested dimensions corresponding to batch, overflows, sequence length
+ tokens_and_encodings = [
+ self._convert_encoding(
+ encoding=encoding,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ )
+ for encoding in encodings
+ ]
+
+ # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+ # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+ # (we say ~ because the number of overflows varies with the example in the batch)
+ #
+ # To match each overflowing sample with the original sample in the batch
+ # we add an overflow_to_sample_mapping array (see below)
+ sanitized_tokens = {}
+ for key in tokens_and_encodings[0][0].keys():
+ stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+ sanitized_tokens[key] = stack
+ sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+ # If returning overflowing tokens, we need to return a mapping
+ # from the batch idx to the original sample
+ if return_overflowing_tokens:
+ overflow_to_sample_mapping = []
+ for i, (toks, _) in enumerate(tokens_and_encodings):
+ overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+ sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+ for input_ids in sanitized_tokens["input_ids"]:
+ self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+ return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+ def _encode_plus(
+ self,
+ text: Union[TextInput, PreTokenizedInput],
+ text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ is_split_into_words: bool = False,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[str] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ batched_input = [(text, text_pair)] if text_pair else [text]
+ batched_output = self._batch_encode_plus(
+ batched_input,
+ is_split_into_words=is_split_into_words,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ # If return_tensors is None, we can remove the leading batch axis
+ # Overflowing tokens are returned as a batch of output so we keep them in this case
+ if return_tensors is None and not return_overflowing_tokens:
+ batched_output = BatchEncoding(
+ {
+ key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+ for key, value in batched_output.items()
+ },
+ batched_output.encodings,
+ )
+
+ self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+ return batched_output
+
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
+ return self.backend_tokenizer.decoder.decode(tokens)
+
+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: Optional[bool] = None,
+ **kwargs,
+ ) -> str:
+ self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+ text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+ clean_up_tokenization_spaces = (
+ clean_up_tokenization_spaces
+ if clean_up_tokenization_spaces is not None
+ else self.clean_up_tokenization_spaces
+ )
+ if clean_up_tokenization_spaces:
+ clean_text = self.clean_up_tokenization(text)
+ return clean_text
+ else:
+ return text
+
+ def _save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ file_names: Tuple[str],
+ legacy_format: Optional[bool] = None,
+ filename_prefix: Optional[str] = None,
+ ) -> Tuple[str]:
+ """
+ Save a tokenizer using the slow-tokenizer/legacy format (vocabulary + added tokens) as well as in a unique JSON
+ file containing {config + vocab + added-tokens}.
+ """
+ save_directory = str(save_directory)
+
+ if self.slow_tokenizer_class is None and legacy_format is True:
+ raise ValueError(
+ "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
+ " might consider leaving the legacy_format at `None` or setting it to `False`."
+ )
+
+ save_slow = (
+ (legacy_format is None or legacy_format is True)
+ and self.slow_tokenizer_class is not None
+ and self.can_save_slow_tokenizer
+ )
+ save_fast = legacy_format is None or legacy_format is False
+
+ if save_slow:
+ added_tokens_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
+ )
+ added_vocab = self.get_added_vocab()
+ if added_vocab:
+ with open(added_tokens_file, "w", encoding="utf-8") as f:
+ out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
+ f.write(out_str)
+
+ vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
+ file_names = file_names + vocab_files + (added_tokens_file,)
+
+ if save_fast:
+ tokenizer_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
+ )
+ self.backend_tokenizer.save(tokenizer_file)
+ file_names = file_names + (tokenizer_file,)
+
+ return file_names
+
+ def train_new_from_iterator(
+ self,
+ text_iterator,
+ vocab_size,
+ length=None,
+ new_special_tokens=None,
+ special_tokens_map=None,
+ **kwargs,
+ ):
+ """
+ Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
+ as the current one.
+
+ Args:
+ text_iterator (generator of `List[str]`):
+ The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
+ if you have everything in memory.
+ vocab_size (`int`):
+ The size of the vocabulary you want for your tokenizer.
+ length (`int`, *optional*):
+ The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
+ new_special_tokens (list of `str` or `AddedToken`, *optional*):
+ A list of new special tokens to add to the tokenizer you are training.
+ special_tokens_map (`Dict[str, str]`, *optional*):
+ If you want to rename some of the special tokens this tokenizer uses, pass along a mapping from old
+ special token name to new special token name in this argument.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.
+
+ Returns:
+ [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
+ `text_iterator`.
+
+ """
+ tokenizer_json = json.loads(self._tokenizer.to_str())
+ # Remove added tokens for now (uses IDs of tokens)
+ added_tokens = tokenizer_json.pop("added_tokens")
+ # Remove post processor for now (uses IDs of tokens)
+ post_processor = tokenizer_json.pop("post_processor")
+
+ unk_token = None
+ # Remove vocab
+ if tokenizer_json["model"]["type"] == "BPE":
+ tokenizer_json["model"]["vocab"] = {}
+ tokenizer_json["model"]["merges"] = []
+ elif tokenizer_json["model"]["type"] == "Unigram":
+ if tokenizer_json["model"]["unk_id"] is not None:
+ unk_id = tokenizer_json["model"]["unk_id"]
+ unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
+ if special_tokens_map is not None and unk_token in special_tokens_map:
+ unk_token = special_tokens_map[unk_token]
+ tokenizer_json["model"]["unk_id"] = 0
+ tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
+ elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
+ tokenizer_json["model"]["vocab"] = {}
+ else:
+ raise ValueError(
+ f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) - "
+ "only BPE, Unigram, WordLevel and WordPiece."
+ )
+
+ if (
+ special_tokens_map is not None
+ and "unk_token" in tokenizer_json["model"]
+ and tokenizer_json["model"]["unk_token"] in special_tokens_map
+ ):
+ tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]
+
+ tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
+
+ # Get the special tokens from the current tokenizer if none are specified.
+ special_tokens = []
+ for added_token in added_tokens:
+ special = added_token.pop("special", None)
+ _ = added_token.pop("id", None)
+ if tokenizer_json["model"]["type"] != "Unigram" and not special:
+ continue
+ if special_tokens_map is not None and added_token["content"] in special_tokens_map:
+ added_token["content"] = special_tokens_map[added_token["content"]]
+ special_tokens.append(AddedToken(**added_token))
+
+ if new_special_tokens is not None:
+ special_tokens.extend(new_special_tokens)
+
+ # Trainer needs to know the end-of-word suffix / continuing-subword prefix used in BPE
+ if (
+ tokenizer_json["model"]["type"] == "BPE"
+ and "continuing_subword_prefix" not in kwargs
+ and tokenizer_json["model"]["continuing_subword_prefix"] is not None
+ ):
+ kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
+ if (
+ tokenizer_json["model"]["type"] == "BPE"
+ and "end_of_word_suffix" not in kwargs
+ and tokenizer_json["model"]["end_of_word_suffix"] is not None
+ ):
+ kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
+ if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
+ kwargs["unk_token"] = unk_token
+ if tokenizer_json["pre_tokenizer"] is not None and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel":
+ kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
+
+ trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
+ trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
+ tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)
+
+ if post_processor is not None:
+ trained_tokenizer_json = json.loads(tokenizer.to_str())
+ # Almost done, we just have to adjust the token IDs in the post processor
+ if "special_tokens" in post_processor:
+ for key in post_processor["special_tokens"]:
+ tokens = post_processor["special_tokens"][key]["tokens"]
+ if special_tokens_map is not None:
+ tokens = [special_tokens_map.get(token, token) for token in tokens]
+ post_processor["special_tokens"][key]["tokens"] = tokens
+ post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]
+
+ for special_token in ["cls", "sep"]:
+ if special_token in post_processor:
+ token, _ = post_processor[special_token]
+ if special_tokens_map is not None and token in special_tokens_map:
+ token = special_tokens_map[token]
+ token_id = tokenizer.token_to_id(token)
+ post_processor[special_token] = [token, token_id]
+
+ trained_tokenizer_json["post_processor"] = post_processor
+ tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))
+
+ kwargs = self.init_kwargs.copy()
+ # Map pad/cls/mask token at the Transformers level
+ special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
+ special_tokens_list.remove("additional_special_tokens")
+ for token in special_tokens_list:
+ # Get the private one to avoid unnecessary warnings.
+ if getattr(self, f"_{token}") is not None:
+ special_token = getattr(self, token)
+ if special_tokens_map is not None and special_token in special_tokens_map:
+ special_token = special_tokens_map[special_token]
+
+ special_token_full = getattr(self, f"_{token}")
+ if isinstance(special_token_full, AddedToken):
+ # Create an added token with the same parameters except the content
+ kwargs[token] = AddedToken(
+ special_token,
+ single_word=special_token_full.single_word,
+ lstrip=special_token_full.lstrip,
+ rstrip=special_token_full.rstrip,
+ normalized=special_token_full.normalized,
+ )
+ else:
+ kwargs[token] = special_token
+
+ additional_special_tokens = self.additional_special_tokens
+ if new_special_tokens is not None:
+ additional_special_tokens.extend(new_special_tokens)
+ if len(additional_special_tokens) > 0:
+ kwargs["additional_special_tokens"] = additional_special_tokens
+
+ return self.__class__(tokenizer_object=tokenizer, **kwargs)
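
In short, `train_new_from_iterator` serializes the backend tokenizer to JSON, empties the vocabulary while keeping the rest of the pipeline, retrains the model component with the matching 🤗 Tokenizers trainer, then re-attaches the post-processor with token IDs remapped into the new vocabulary. A minimal usage sketch (the repo path and corpus are placeholders, not part of this commit):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)

# Any generator of batches of texts (List[str]) works as the training corpus.
corpus = iter([["some domain-specific text"], ["more domain-specific text"]])

new_tok = tok.train_new_from_iterator(corpus, vocab_size=32000)
new_tok.save_pretrained("retrained-tokenizer")
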
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "name_or_path": "THUDM/chatglm2-6b",
+ "remove_space": false,
+ "do_lower_case": false,
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "auto_map": {
+ "AutoTokenizer": [
+ null,
+ "tokenization_utils_fast.PreTrainedTokenizerFast"
+ ]
+ }
+ }
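
The `auto_map` entry is what makes this repo self-contained: `AutoTokenizer` is routed to the `tokenization_utils_fast.PreTrainedTokenizerFast` class shipped in this commit (the `null` slot means no slow tokenizer class is provided), which then loads the serialized `tokenizer.json`. Because the class lives in the repository rather than in the installed `transformers` package, loading needs `trust_remote_code=True` (the repo id below is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)
print(tokenizer("Hello world")["input_ids"])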