huseinzol05 commited on
Commit
7345d48
1 Parent(s): f479d47
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.ipynb_checkpoints
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./pytorch_model.bin",
3
+ "architectures": [
4
+ "T5Model"
5
+ ],
6
+ "d_ff": 3072,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "relu",
13
+ "gradient_checkpointing": false,
14
+ "initializer_factor": 1.0,
15
+ "inputs_length": 1024,
16
+ "is_encoder_decoder": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "t5",
19
+ "n_positions": 1024,
20
+ "num_decoder_layers": 12,
21
+ "num_heads": 12,
22
+ "num_layers": 12,
23
+ "pad_token_id": 0,
24
+ "relative_attention_num_buckets": 32,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.10.0",
27
+ "use_cache": true,
28
+ "vocab_size": 32128
29
+ }
convert-from-malaya.ipynb ADDED
@@ -0,0 +1,1077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "scrolled": true
8
+ },
9
+ "outputs": [
10
+ {
11
+ "data": {
12
+ "text/plain": [
13
+ "'4.10.0'"
14
+ ]
15
+ },
16
+ "execution_count": 1,
17
+ "metadata": {},
18
+ "output_type": "execute_result"
19
+ }
20
+ ],
21
+ "source": [
22
+ "import transformers\n",
23
+ "transformers.__version__"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 2,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "from transformers import T5Config, T5Model, load_tf_weights_in_t5"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 4,
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "checkpoint model.ckpt-759900.index\r\n",
45
+ "model.ckpt-759900.data-00000-of-00002 model.ckpt-759900.meta\r\n",
46
+ "model.ckpt-759900.data-00001-of-00002 operative_config.gin\r\n"
47
+ ]
48
+ }
49
+ ],
50
+ "source": [
51
+ "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/t5-base-2021-07-28.tar.gz\n",
52
+ "# !tar -zxf t5-base-2021-07-28.tar.gz\n",
53
+ "# !rm t5-base-2021-07-28.tar.gz\n",
54
+ "!ls t5-base-v2"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 5,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "T5Config {\n",
67
+ " \"d_ff\": 3072,\n",
68
+ " \"d_kv\": 64,\n",
69
+ " \"d_model\": 768,\n",
70
+ " \"decoder_start_token_id\": 0,\n",
71
+ " \"dropout_rate\": 0.1,\n",
72
+ " \"eos_token_id\": 1,\n",
73
+ " \"feed_forward_proj\": \"relu\",\n",
74
+ " \"gradient_checkpointing\": false,\n",
75
+ " \"initializer_factor\": 1.0,\n",
76
+ " \"inputs_length\": 1024,\n",
77
+ " \"is_encoder_decoder\": true,\n",
78
+ " \"layer_norm_epsilon\": 1e-06,\n",
79
+ " \"model_type\": \"t5\",\n",
80
+ " \"n_positions\": 1024,\n",
81
+ " \"num_decoder_layers\": 12,\n",
82
+ " \"num_heads\": 12,\n",
83
+ " \"num_layers\": 12,\n",
84
+ " \"pad_token_id\": 0,\n",
85
+ " \"relative_attention_num_buckets\": 32,\n",
86
+ " \"transformers_version\": \"4.10.0\",\n",
87
+ " \"use_cache\": true,\n",
88
+ " \"vocab_size\": 32128\n",
89
+ "}\n",
90
+ "\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "config = T5Config(\n",
96
+ " vocab_size = 32128,\n",
97
+ " n_positions=1024,\n",
98
+ " d_ff = 3072,\n",
99
+ " d_kv = 64,\n",
100
+ " d_model = 768,\n",
101
+ " dropout_rate = 0.1,\n",
102
+ " inputs_length = 1024,\n",
103
+ " num_heads = 12,\n",
104
+ " num_layers = 12,\n",
105
+ " decoder_start_token_id = 0,\n",
106
+ " eos_token_id = 1,\n",
107
+ " pad_token_id = 0)\n",
108
+ "print(config)\n",
109
+ "config.save_pretrained('./')"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 6,
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "data": {
119
+ "text/plain": [
120
+ "T5Model(\n",
121
+ " (shared): Embedding(32128, 768)\n",
122
+ " (encoder): T5Stack(\n",
123
+ " (embed_tokens): Embedding(32128, 768)\n",
124
+ " (block): ModuleList(\n",
125
+ " (0): T5Block(\n",
126
+ " (layer): ModuleList(\n",
127
+ " (0): T5LayerSelfAttention(\n",
128
+ " (SelfAttention): T5Attention(\n",
129
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
130
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
131
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
132
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
133
+ " (relative_attention_bias): Embedding(32, 12)\n",
134
+ " )\n",
135
+ " (layer_norm): T5LayerNorm()\n",
136
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
137
+ " )\n",
138
+ " (1): T5LayerFF(\n",
139
+ " (DenseReluDense): T5DenseReluDense(\n",
140
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
141
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
142
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
143
+ " )\n",
144
+ " (layer_norm): T5LayerNorm()\n",
145
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
146
+ " )\n",
147
+ " )\n",
148
+ " )\n",
149
+ " (1): T5Block(\n",
150
+ " (layer): ModuleList(\n",
151
+ " (0): T5LayerSelfAttention(\n",
152
+ " (SelfAttention): T5Attention(\n",
153
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
154
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
155
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
156
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
157
+ " )\n",
158
+ " (layer_norm): T5LayerNorm()\n",
159
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
160
+ " )\n",
161
+ " (1): T5LayerFF(\n",
162
+ " (DenseReluDense): T5DenseReluDense(\n",
163
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
164
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
165
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
166
+ " )\n",
167
+ " (layer_norm): T5LayerNorm()\n",
168
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
169
+ " )\n",
170
+ " )\n",
171
+ " )\n",
172
+ " (2): T5Block(\n",
173
+ " (layer): ModuleList(\n",
174
+ " (0): T5LayerSelfAttention(\n",
175
+ " (SelfAttention): T5Attention(\n",
176
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
177
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
178
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
179
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
180
+ " )\n",
181
+ " (layer_norm): T5LayerNorm()\n",
182
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
183
+ " )\n",
184
+ " (1): T5LayerFF(\n",
185
+ " (DenseReluDense): T5DenseReluDense(\n",
186
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
187
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
188
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
189
+ " )\n",
190
+ " (layer_norm): T5LayerNorm()\n",
191
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
192
+ " )\n",
193
+ " )\n",
194
+ " )\n",
195
+ " (3): T5Block(\n",
196
+ " (layer): ModuleList(\n",
197
+ " (0): T5LayerSelfAttention(\n",
198
+ " (SelfAttention): T5Attention(\n",
199
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
200
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
201
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
202
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
203
+ " )\n",
204
+ " (layer_norm): T5LayerNorm()\n",
205
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
206
+ " )\n",
207
+ " (1): T5LayerFF(\n",
208
+ " (DenseReluDense): T5DenseReluDense(\n",
209
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
210
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
211
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
212
+ " )\n",
213
+ " (layer_norm): T5LayerNorm()\n",
214
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
215
+ " )\n",
216
+ " )\n",
217
+ " )\n",
218
+ " (4): T5Block(\n",
219
+ " (layer): ModuleList(\n",
220
+ " (0): T5LayerSelfAttention(\n",
221
+ " (SelfAttention): T5Attention(\n",
222
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
223
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
224
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
225
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
226
+ " )\n",
227
+ " (layer_norm): T5LayerNorm()\n",
228
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
229
+ " )\n",
230
+ " (1): T5LayerFF(\n",
231
+ " (DenseReluDense): T5DenseReluDense(\n",
232
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
233
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
234
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
235
+ " )\n",
236
+ " (layer_norm): T5LayerNorm()\n",
237
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
238
+ " )\n",
239
+ " )\n",
240
+ " )\n",
241
+ " (5): T5Block(\n",
242
+ " (layer): ModuleList(\n",
243
+ " (0): T5LayerSelfAttention(\n",
244
+ " (SelfAttention): T5Attention(\n",
245
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
246
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
247
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
248
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
249
+ " )\n",
250
+ " (layer_norm): T5LayerNorm()\n",
251
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
252
+ " )\n",
253
+ " (1): T5LayerFF(\n",
254
+ " (DenseReluDense): T5DenseReluDense(\n",
255
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
256
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
257
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
258
+ " )\n",
259
+ " (layer_norm): T5LayerNorm()\n",
260
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
261
+ " )\n",
262
+ " )\n",
263
+ " )\n",
264
+ " (6): T5Block(\n",
265
+ " (layer): ModuleList(\n",
266
+ " (0): T5LayerSelfAttention(\n",
267
+ " (SelfAttention): T5Attention(\n",
268
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
269
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
270
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
271
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
272
+ " )\n",
273
+ " (layer_norm): T5LayerNorm()\n",
274
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
275
+ " )\n",
276
+ " (1): T5LayerFF(\n",
277
+ " (DenseReluDense): T5DenseReluDense(\n",
278
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
279
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
280
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
281
+ " )\n",
282
+ " (layer_norm): T5LayerNorm()\n",
283
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
284
+ " )\n",
285
+ " )\n",
286
+ " )\n",
287
+ " (7): T5Block(\n",
288
+ " (layer): ModuleList(\n",
289
+ " (0): T5LayerSelfAttention(\n",
290
+ " (SelfAttention): T5Attention(\n",
291
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
292
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
293
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
294
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
295
+ " )\n",
296
+ " (layer_norm): T5LayerNorm()\n",
297
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
298
+ " )\n",
299
+ " (1): T5LayerFF(\n",
300
+ " (DenseReluDense): T5DenseReluDense(\n",
301
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
302
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
303
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
304
+ " )\n",
305
+ " (layer_norm): T5LayerNorm()\n",
306
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
307
+ " )\n",
308
+ " )\n",
309
+ " )\n",
310
+ " (8): T5Block(\n",
311
+ " (layer): ModuleList(\n",
312
+ " (0): T5LayerSelfAttention(\n",
313
+ " (SelfAttention): T5Attention(\n",
314
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
315
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
316
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
317
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
318
+ " )\n",
319
+ " (layer_norm): T5LayerNorm()\n",
320
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
321
+ " )\n",
322
+ " (1): T5LayerFF(\n",
323
+ " (DenseReluDense): T5DenseReluDense(\n",
324
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
325
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
326
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
327
+ " )\n",
328
+ " (layer_norm): T5LayerNorm()\n",
329
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
330
+ " )\n",
331
+ " )\n",
332
+ " )\n",
333
+ " (9): T5Block(\n",
334
+ " (layer): ModuleList(\n",
335
+ " (0): T5LayerSelfAttention(\n",
336
+ " (SelfAttention): T5Attention(\n",
337
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
338
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
339
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
340
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
341
+ " )\n",
342
+ " (layer_norm): T5LayerNorm()\n",
343
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
344
+ " )\n",
345
+ " (1): T5LayerFF(\n",
346
+ " (DenseReluDense): T5DenseReluDense(\n",
347
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
348
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
349
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
350
+ " )\n",
351
+ " (layer_norm): T5LayerNorm()\n",
352
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
353
+ " )\n",
354
+ " )\n",
355
+ " )\n",
356
+ " (10): T5Block(\n",
357
+ " (layer): ModuleList(\n",
358
+ " (0): T5LayerSelfAttention(\n",
359
+ " (SelfAttention): T5Attention(\n",
360
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
361
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
362
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
363
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
364
+ " )\n",
365
+ " (layer_norm): T5LayerNorm()\n",
366
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
367
+ " )\n",
368
+ " (1): T5LayerFF(\n",
369
+ " (DenseReluDense): T5DenseReluDense(\n",
370
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
371
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
372
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
373
+ " )\n",
374
+ " (layer_norm): T5LayerNorm()\n",
375
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
376
+ " )\n",
377
+ " )\n",
378
+ " )\n",
379
+ " (11): T5Block(\n",
380
+ " (layer): ModuleList(\n",
381
+ " (0): T5LayerSelfAttention(\n",
382
+ " (SelfAttention): T5Attention(\n",
383
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
384
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
385
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
386
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
387
+ " )\n",
388
+ " (layer_norm): T5LayerNorm()\n",
389
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
390
+ " )\n",
391
+ " (1): T5LayerFF(\n",
392
+ " (DenseReluDense): T5DenseReluDense(\n",
393
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
394
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
395
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
396
+ " )\n",
397
+ " (layer_norm): T5LayerNorm()\n",
398
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
399
+ " )\n",
400
+ " )\n",
401
+ " )\n",
402
+ " )\n",
403
+ " (final_layer_norm): T5LayerNorm()\n",
404
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
405
+ " )\n",
406
+ " (decoder): T5Stack(\n",
407
+ " (embed_tokens): Embedding(32128, 768)\n",
408
+ " (block): ModuleList(\n",
409
+ " (0): T5Block(\n",
410
+ " (layer): ModuleList(\n",
411
+ " (0): T5LayerSelfAttention(\n",
412
+ " (SelfAttention): T5Attention(\n",
413
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
414
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
415
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
416
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
417
+ " (relative_attention_bias): Embedding(32, 12)\n",
418
+ " )\n",
419
+ " (layer_norm): T5LayerNorm()\n",
420
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
421
+ " )\n",
422
+ " (1): T5LayerCrossAttention(\n",
423
+ " (EncDecAttention): T5Attention(\n",
424
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
425
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
426
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
427
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
428
+ " )\n",
429
+ " (layer_norm): T5LayerNorm()\n",
430
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
431
+ " )\n",
432
+ " (2): T5LayerFF(\n",
433
+ " (DenseReluDense): T5DenseReluDense(\n",
434
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
435
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
436
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
437
+ " )\n",
438
+ " (layer_norm): T5LayerNorm()\n",
439
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
440
+ " )\n",
441
+ " )\n",
442
+ " )\n",
443
+ " (1): T5Block(\n",
444
+ " (layer): ModuleList(\n",
445
+ " (0): T5LayerSelfAttention(\n",
446
+ " (SelfAttention): T5Attention(\n",
447
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
448
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
449
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
450
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
451
+ " )\n",
452
+ " (layer_norm): T5LayerNorm()\n",
453
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
454
+ " )\n",
455
+ " (1): T5LayerCrossAttention(\n",
456
+ " (EncDecAttention): T5Attention(\n",
457
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
458
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
459
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
460
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
461
+ " )\n",
462
+ " (layer_norm): T5LayerNorm()\n",
463
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
464
+ " )\n",
465
+ " (2): T5LayerFF(\n",
466
+ " (DenseReluDense): T5DenseReluDense(\n",
467
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
468
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
469
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
470
+ " )\n",
471
+ " (layer_norm): T5LayerNorm()\n",
472
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
473
+ " )\n",
474
+ " )\n",
475
+ " )\n",
476
+ " (2): T5Block(\n",
477
+ " (layer): ModuleList(\n",
478
+ " (0): T5LayerSelfAttention(\n",
479
+ " (SelfAttention): T5Attention(\n",
480
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
481
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
482
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
483
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
484
+ " )\n",
485
+ " (layer_norm): T5LayerNorm()\n",
486
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
487
+ " )\n",
488
+ " (1): T5LayerCrossAttention(\n",
489
+ " (EncDecAttention): T5Attention(\n",
490
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
491
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
492
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
493
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
494
+ " )\n",
495
+ " (layer_norm): T5LayerNorm()\n",
496
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
497
+ " )\n",
498
+ " (2): T5LayerFF(\n",
499
+ " (DenseReluDense): T5DenseReluDense(\n",
500
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
501
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
502
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
503
+ " )\n",
504
+ " (layer_norm): T5LayerNorm()\n",
505
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
506
+ " )\n",
507
+ " )\n",
508
+ " )\n",
509
+ " (3): T5Block(\n",
510
+ " (layer): ModuleList(\n",
511
+ " (0): T5LayerSelfAttention(\n",
512
+ " (SelfAttention): T5Attention(\n",
513
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
514
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
515
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
516
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
517
+ " )\n",
518
+ " (layer_norm): T5LayerNorm()\n",
519
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
520
+ " )\n",
521
+ " (1): T5LayerCrossAttention(\n",
522
+ " (EncDecAttention): T5Attention(\n",
523
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
524
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
525
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
526
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
527
+ " )\n",
528
+ " (layer_norm): T5LayerNorm()\n",
529
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
530
+ " )\n",
531
+ " (2): T5LayerFF(\n",
532
+ " (DenseReluDense): T5DenseReluDense(\n",
533
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
534
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
535
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
536
+ " )\n",
537
+ " (layer_norm): T5LayerNorm()\n",
538
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
539
+ " )\n",
540
+ " )\n",
541
+ " )\n",
542
+ " (4): T5Block(\n",
543
+ " (layer): ModuleList(\n",
544
+ " (0): T5LayerSelfAttention(\n",
545
+ " (SelfAttention): T5Attention(\n",
546
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
547
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
548
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
549
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
550
+ " )\n",
551
+ " (layer_norm): T5LayerNorm()\n",
552
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
553
+ " )\n",
554
+ " (1): T5LayerCrossAttention(\n",
555
+ " (EncDecAttention): T5Attention(\n",
556
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
557
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
558
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
559
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
560
+ " )\n",
561
+ " (layer_norm): T5LayerNorm()\n",
562
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
563
+ " )\n",
564
+ " (2): T5LayerFF(\n",
565
+ " (DenseReluDense): T5DenseReluDense(\n",
566
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
567
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
568
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
569
+ " )\n",
570
+ " (layer_norm): T5LayerNorm()\n",
571
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
572
+ " )\n",
573
+ " )\n",
574
+ " )\n",
575
+ " (5): T5Block(\n",
576
+ " (layer): ModuleList(\n",
577
+ " (0): T5LayerSelfAttention(\n",
578
+ " (SelfAttention): T5Attention(\n",
579
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
580
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
581
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
582
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
583
+ " )\n",
584
+ " (layer_norm): T5LayerNorm()\n",
585
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
586
+ " )\n",
587
+ " (1): T5LayerCrossAttention(\n",
588
+ " (EncDecAttention): T5Attention(\n",
589
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
590
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
591
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
592
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
593
+ " )\n",
594
+ " (layer_norm): T5LayerNorm()\n",
595
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
596
+ " )\n",
597
+ " (2): T5LayerFF(\n",
598
+ " (DenseReluDense): T5DenseReluDense(\n",
599
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
600
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
601
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
602
+ " )\n",
603
+ " (layer_norm): T5LayerNorm()\n",
604
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
605
+ " )\n",
606
+ " )\n",
607
+ " )\n",
608
+ " (6): T5Block(\n",
609
+ " (layer): ModuleList(\n",
610
+ " (0): T5LayerSelfAttention(\n",
611
+ " (SelfAttention): T5Attention(\n",
612
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
613
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
614
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
615
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
616
+ " )\n",
617
+ " (layer_norm): T5LayerNorm()\n",
618
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
619
+ " )\n",
620
+ " (1): T5LayerCrossAttention(\n",
621
+ " (EncDecAttention): T5Attention(\n",
622
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
623
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
624
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
625
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
626
+ " )\n",
627
+ " (layer_norm): T5LayerNorm()\n",
628
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
629
+ " )\n",
630
+ " (2): T5LayerFF(\n",
631
+ " (DenseReluDense): T5DenseReluDense(\n",
632
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
633
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
634
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
635
+ " )\n",
636
+ " (layer_norm): T5LayerNorm()\n",
637
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
638
+ " )\n",
639
+ " )\n",
640
+ " )\n",
641
+ " (7): T5Block(\n",
642
+ " (layer): ModuleList(\n",
643
+ " (0): T5LayerSelfAttention(\n",
644
+ " (SelfAttention): T5Attention(\n",
645
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
646
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
647
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
648
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
649
+ " )\n",
650
+ " (layer_norm): T5LayerNorm()\n",
651
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
652
+ " )\n",
653
+ " (1): T5LayerCrossAttention(\n",
654
+ " (EncDecAttention): T5Attention(\n",
655
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
656
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
657
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
658
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
659
+ " )\n",
660
+ " (layer_norm): T5LayerNorm()\n",
661
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
662
+ " )\n",
663
+ " (2): T5LayerFF(\n",
664
+ " (DenseReluDense): T5DenseReluDense(\n",
665
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
666
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
667
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
668
+ " )\n",
669
+ " (layer_norm): T5LayerNorm()\n",
670
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
671
+ " )\n",
672
+ " )\n",
673
+ " )\n",
674
+ " (8): T5Block(\n",
675
+ " (layer): ModuleList(\n",
676
+ " (0): T5LayerSelfAttention(\n",
677
+ " (SelfAttention): T5Attention(\n",
678
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
679
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
680
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
681
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
682
+ " )\n",
683
+ " (layer_norm): T5LayerNorm()\n",
684
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
685
+ " )\n",
686
+ " (1): T5LayerCrossAttention(\n",
687
+ " (EncDecAttention): T5Attention(\n",
688
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
689
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
690
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
691
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
692
+ " )\n",
693
+ " (layer_norm): T5LayerNorm()\n",
694
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
695
+ " )\n",
696
+ " (2): T5LayerFF(\n",
697
+ " (DenseReluDense): T5DenseReluDense(\n",
698
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
699
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
700
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
701
+ " )\n",
702
+ " (layer_norm): T5LayerNorm()\n",
703
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
704
+ " )\n",
705
+ " )\n",
706
+ " )\n",
707
+ " (9): T5Block(\n",
708
+ " (layer): ModuleList(\n",
709
+ " (0): T5LayerSelfAttention(\n",
710
+ " (SelfAttention): T5Attention(\n",
711
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
712
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
713
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
714
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
715
+ " )\n",
716
+ " (layer_norm): T5LayerNorm()\n",
717
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
718
+ " )\n",
719
+ " (1): T5LayerCrossAttention(\n",
720
+ " (EncDecAttention): T5Attention(\n",
721
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
722
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
723
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
724
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
725
+ " )\n",
726
+ " (layer_norm): T5LayerNorm()\n",
727
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
728
+ " )\n",
729
+ " (2): T5LayerFF(\n",
730
+ " (DenseReluDense): T5DenseReluDense(\n",
731
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
732
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
733
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
734
+ " )\n",
735
+ " (layer_norm): T5LayerNorm()\n",
736
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
737
+ " )\n",
738
+ " )\n",
739
+ " )\n",
740
+ " (10): T5Block(\n",
741
+ " (layer): ModuleList(\n",
742
+ " (0): T5LayerSelfAttention(\n",
743
+ " (SelfAttention): T5Attention(\n",
744
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
745
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
746
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
747
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
748
+ " )\n",
749
+ " (layer_norm): T5LayerNorm()\n",
750
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
751
+ " )\n",
752
+ " (1): T5LayerCrossAttention(\n",
753
+ " (EncDecAttention): T5Attention(\n",
754
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
755
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
756
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
757
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
758
+ " )\n",
759
+ " (layer_norm): T5LayerNorm()\n",
760
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
761
+ " )\n",
762
+ " (2): T5LayerFF(\n",
763
+ " (DenseReluDense): T5DenseReluDense(\n",
764
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
765
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
766
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
767
+ " )\n",
768
+ " (layer_norm): T5LayerNorm()\n",
769
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
770
+ " )\n",
771
+ " )\n",
772
+ " )\n",
773
+ " (11): T5Block(\n",
774
+ " (layer): ModuleList(\n",
775
+ " (0): T5LayerSelfAttention(\n",
776
+ " (SelfAttention): T5Attention(\n",
777
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
778
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
779
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
780
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
781
+ " )\n",
782
+ " (layer_norm): T5LayerNorm()\n",
783
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
784
+ " )\n",
785
+ " (1): T5LayerCrossAttention(\n",
786
+ " (EncDecAttention): T5Attention(\n",
787
+ " (q): Linear(in_features=768, out_features=768, bias=False)\n",
788
+ " (k): Linear(in_features=768, out_features=768, bias=False)\n",
789
+ " (v): Linear(in_features=768, out_features=768, bias=False)\n",
790
+ " (o): Linear(in_features=768, out_features=768, bias=False)\n",
791
+ " )\n",
792
+ " (layer_norm): T5LayerNorm()\n",
793
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
794
+ " )\n",
795
+ " (2): T5LayerFF(\n",
796
+ " (DenseReluDense): T5DenseReluDense(\n",
797
+ " (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
798
+ " (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
799
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
800
+ " )\n",
801
+ " (layer_norm): T5LayerNorm()\n",
802
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
803
+ " )\n",
804
+ " )\n",
805
+ " )\n",
806
+ " )\n",
807
+ " (final_layer_norm): T5LayerNorm()\n",
808
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
809
+ " )\n",
810
+ ")"
811
+ ]
812
+ },
813
+ "execution_count": 6,
814
+ "metadata": {},
815
+ "output_type": "execute_result"
816
+ }
817
+ ],
818
+ "source": [
819
+ "model = T5Model(config)\n",
820
+ "load_tf_weights_in_t5(model, config, 't5-base-v2/model.ckpt-759900')"
821
+ ]
822
+ },
823
+ {
824
+ "cell_type": "code",
825
+ "execution_count": 7,
826
+ "metadata": {},
827
+ "outputs": [
828
+ {
829
+ "data": {
830
+ "text/plain": [
831
+ "('config.json', 'pytorch_model.bin')"
832
+ ]
833
+ },
834
+ "execution_count": 7,
835
+ "metadata": {},
836
+ "output_type": "execute_result"
837
+ }
838
+ ],
839
+ "source": [
840
+ "from transformers import CONFIG_NAME, WEIGHTS_NAME\n",
841
+ "CONFIG_NAME, WEIGHTS_NAME"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": 8,
847
+ "metadata": {},
848
+ "outputs": [],
849
+ "source": [
850
+ "import torch\n",
851
+ "\n",
852
+ "torch.save(model.state_dict(), './' + WEIGHTS_NAME)"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": 9,
858
+ "metadata": {},
859
+ "outputs": [],
860
+ "source": [
861
+ "from transformers import T5Config, T5Model, T5Tokenizer"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "execution_count": 10,
867
+ "metadata": {},
868
+ "outputs": [],
869
+ "source": [
870
+ "# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en.model"
871
+ ]
872
+ },
873
+ {
874
+ "cell_type": "code",
875
+ "execution_count": 11,
876
+ "metadata": {},
877
+ "outputs": [
878
+ {
879
+ "data": {
880
+ "text/plain": [
881
+ "('./tokenizer_config.json',\n",
882
+ " './special_tokens_map.json',\n",
883
+ " './spiece.model',\n",
884
+ " './added_tokens.json')"
885
+ ]
886
+ },
887
+ "execution_count": 11,
888
+ "metadata": {},
889
+ "output_type": "execute_result"
890
+ }
891
+ ],
892
+ "source": [
893
+ "tokenizer = T5Tokenizer('sp10m.cased.ms-en.model')\n",
894
+ "tokenizer.save_pretrained('./')"
895
+ ]
896
+ },
897
+ {
898
+ "cell_type": "code",
899
+ "execution_count": 12,
900
+ "metadata": {},
901
+ "outputs": [],
902
+ "source": [
903
+ "tokenizer = T5Tokenizer.from_pretrained('./', lower = False)"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": 13,
909
+ "metadata": {},
910
+ "outputs": [],
911
+ "source": [
912
+ "config = T5Config.from_pretrained('./')"
913
+ ]
914
+ },
915
+ {
916
+ "cell_type": "code",
917
+ "execution_count": 14,
918
+ "metadata": {},
919
+ "outputs": [],
920
+ "source": [
921
+ "model = T5Model.from_pretrained('./pytorch_model.bin', config = config)"
922
+ ]
923
+ },
924
+ {
925
+ "cell_type": "code",
926
+ "execution_count": 15,
927
+ "metadata": {},
928
+ "outputs": [],
929
+ "source": [
930
+ "model.save_pretrained('./')"
931
+ ]
932
+ },
933
+ {
934
+ "cell_type": "code",
935
+ "execution_count": 16,
936
+ "metadata": {},
937
+ "outputs": [],
938
+ "source": [
939
+ "from transformers import T5Tokenizer, T5ForConditionalGeneration"
940
+ ]
941
+ },
942
+ {
943
+ "cell_type": "code",
944
+ "execution_count": 17,
945
+ "metadata": {},
946
+ "outputs": [],
947
+ "source": [
948
+ "model = T5ForConditionalGeneration.from_pretrained('./')"
949
+ ]
950
+ },
951
+ {
952
+ "cell_type": "code",
953
+ "execution_count": 18,
954
+ "metadata": {},
955
+ "outputs": [
956
+ {
957
+ "data": {
958
+ "text/plain": [
959
+ "'<pad> Mahathir Mohamad</s>'"
960
+ ]
961
+ },
962
+ "execution_count": 18,
963
+ "metadata": {},
964
+ "output_type": "execute_result"
965
+ }
966
+ ],
967
+ "source": [
968
+ "input_ids = tokenizer.encode('soalan: siapakah perdana menteri malaysia?', return_tensors = 'pt')\n",
969
+ "outputs = model.generate(input_ids)\n",
970
+ "tokenizer.decode(outputs[0])"
971
+ ]
972
+ },
973
+ {
974
+ "cell_type": "code",
975
+ "execution_count": 19,
976
+ "metadata": {},
977
+ "outputs": [
978
+ {
979
+ "data": {
980
+ "text/plain": [
981
+ "'<pad> PETALING JAYA: Bekas perdana menteri, Najib Razak, mempersoalkan sama ada kerajaan tahu bagaimana menguruskan wabak'"
982
+ ]
983
+ },
984
+ "execution_count": 19,
985
+ "metadata": {},
986
+ "output_type": "execute_result"
987
+ }
988
+ ],
989
+ "source": [
990
+ "input_ids = tokenizer.encode('terjemah Inggeris ke Melayu: PETALING JAYA: Former prime minister Najib Razak has questioned whether the government knows how to manage the Covid-19 pandemic, outlining several seemingly contradictory announcements it has made.', return_tensors = 'pt')\n",
991
+ "outputs = model.generate(input_ids)\n",
992
+ "tokenizer.decode(outputs[0])"
993
+ ]
994
+ },
995
+ {
996
+ "cell_type": "code",
997
+ "execution_count": 20,
998
+ "metadata": {},
999
+ "outputs": [
1000
+ {
1001
+ "data": {
1002
+ "text/plain": [
1003
+ "'<pad> PETALING JAYA: Former Prime Minister Najib Tun Razak and Deputy Prime Minister Ismail Sabri Yaakob today discussed'"
1004
+ ]
1005
+ },
1006
+ "execution_count": 20,
1007
+ "metadata": {},
1008
+ "output_type": "execute_result"
1009
+ }
1010
+ ],
1011
+ "source": [
1012
+ "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: PETALING JAYA: Pertemuan bekas Perdana Menteri, Datuk Seri Najib Tun Razak dan Timbalan Perdana Menteri, Datuk Seri Ismail Sabri Yaakob hari ini adalah bagi membincangkan isu berkaitan hala tuju dan dasar negara.', return_tensors = 'pt')\n",
1013
+ "outputs = model.generate(input_ids)\n",
1014
+ "tokenizer.decode(outputs[0])"
1015
+ ]
1016
+ },
1017
+ {
1018
+ "cell_type": "code",
1019
+ "execution_count": 21,
1020
+ "metadata": {},
1021
+ "outputs": [
1022
+ {
1023
+ "data": {
1024
+ "text/plain": [
1025
+ "'<pad> Roman Catholic Archdiocese of Maracaibo shares border with Roman Catholic Diocese'"
1026
+ ]
1027
+ },
1028
+ "execution_count": 21,
1029
+ "metadata": {},
1030
+ "output_type": "execute_result"
1031
+ }
1032
+ ],
1033
+ "source": [
1034
+ "input_ids = tokenizer.encode('grafik pengetahuan: Keuskupan Agung Katolik Rom Maracaibo terletak di barat daya Keuskupan Katolik Rom Machiques.', return_tensors = 'pt')\n",
1035
+ "outputs = model.generate(input_ids)\n",
1036
+ "tokenizer.decode(outputs[0])"
1037
+ ]
1038
+ },
1039
+ {
1040
+ "cell_type": "code",
1041
+ "execution_count": 22,
1042
+ "metadata": {},
1043
+ "outputs": [],
1044
+ "source": [
1045
+ "!rm -rf t5-base-v2"
1046
+ ]
1047
+ },
1048
+ {
1049
+ "cell_type": "code",
1050
+ "execution_count": null,
1051
+ "metadata": {},
1052
+ "outputs": [],
1053
+ "source": []
1054
+ }
1055
+ ],
1056
+ "metadata": {
1057
+ "kernelspec": {
1058
+ "display_name": "Python 3",
1059
+ "language": "python",
1060
+ "name": "python3"
1061
+ },
1062
+ "language_info": {
1063
+ "codemirror_mode": {
1064
+ "name": "ipython",
1065
+ "version": 3
1066
+ },
1067
+ "file_extension": ".py",
1068
+ "mimetype": "text/x-python",
1069
+ "name": "python",
1070
+ "nbconvert_exporter": "python",
1071
+ "pygments_lexer": "ipython3",
1072
+ "version": "3.7.7"
1073
+ }
1074
+ },
1075
+ "nbformat": 4,
1076
+ "nbformat_minor": 4
1077
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b38ca8853a3cc69ec8295b0bbdaf5187944a83c269d6c44279496a3b714c743
3
+ size 891734137
sp10m.cased.ms-en.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26de51154cccc9db6e65e5d466bdb0b1fff9fab1d80f4689711de943448addd6
3
+ size 803030
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26de51154cccc9db6e65e5d466bdb0b1fff9fab1d80f4689711de943448addd6
3
+ size 803030
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}