d-matrix committed
Update configs/BASIC.yaml

configs/BASIC.yaml (+73 -73)
@@ -8,7 +8,7 @@ model:
     weight_format: SAME
     weight_sparseness: DENSE
   model.decoder.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -43,7 +43,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.0.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -82,7 +82,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.0.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -96,7 +96,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.0.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -131,7 +131,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.1.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -170,7 +170,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.1.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -184,7 +184,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.1.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -219,7 +219,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.10.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -258,7 +258,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.10.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -272,7 +272,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.10.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -307,7 +307,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.11.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -346,7 +346,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.11.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -360,7 +360,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.11.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -395,7 +395,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.12.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -434,7 +434,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.12.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -448,7 +448,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.12.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -483,7 +483,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.13.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -522,7 +522,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.13.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -536,7 +536,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.13.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -571,7 +571,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.14.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -610,7 +610,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.14.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -624,7 +624,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.14.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -659,7 +659,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.15.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -698,7 +698,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.15.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -712,7 +712,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.15.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -747,7 +747,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.16.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -786,7 +786,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.16.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -800,7 +800,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.16.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -835,7 +835,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.17.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -874,7 +874,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.17.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -888,7 +888,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.17.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -923,7 +923,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.18.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -962,7 +962,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.18.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -976,7 +976,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.18.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1011,7 +1011,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.19.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1050,7 +1050,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.19.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1064,7 +1064,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.19.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1099,7 +1099,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.2.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1138,7 +1138,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.2.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1152,7 +1152,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.2.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1187,7 +1187,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.20.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1226,7 +1226,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.20.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1240,7 +1240,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.20.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1275,7 +1275,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.21.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1314,7 +1314,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.21.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1328,7 +1328,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.21.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1363,7 +1363,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.22.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1402,7 +1402,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.22.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1416,7 +1416,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.22.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1451,7 +1451,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.23.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1490,7 +1490,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.23.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1504,7 +1504,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.23.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1539,7 +1539,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.3.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1578,7 +1578,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.3.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1592,7 +1592,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.3.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1627,7 +1627,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.4.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1666,7 +1666,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.4.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1680,7 +1680,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.4.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1715,7 +1715,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.5.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1754,7 +1754,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.5.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1768,7 +1768,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.5.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1803,7 +1803,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.6.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1842,7 +1842,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.6.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1856,7 +1856,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.6.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1891,7 +1891,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.7.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1930,7 +1930,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.7.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -1944,7 +1944,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.7.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -1979,7 +1979,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.8.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -2018,7 +2018,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.8.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -2032,7 +2032,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.8.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -2067,7 +2067,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.9.final_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
@@ -2106,7 +2106,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.9.self_attn.softmax:
-    approximation_function:
+    approximation_function: NONE
     input_format: SAME
     instance: Softmax
     output_format: SAME
@@ -2120,7 +2120,7 @@ model:
     weight_format: BFP[8|8]{64,-1}(SN)
     weight_sparseness: DENSE
   model.decoder.layers.9.self_attn_layer_norm:
-    approximation_function:
+    approximation_function: NONE
     bias_format: SAME
     input_format: SAME
     instance: LayerNorm
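For reference, the two affected stanza shapes in configs/BASIC.yaml after this change look roughly as follows, reconstructed from the diff context above. This is a sketch: indentation and any fields outside the visible context lines are assumptions, since each hunk shows only three lines around the change.

  # LayerNorm modules (final_layer_norm, self_attn_layer_norm); shown for the
  # top-level decoder norm. Nesting depth is assumed, not visible in the diff.
  model.decoder.final_layer_norm:
    approximation_function: NONE
    bias_format: SAME
    input_format: SAME
    instance: LayerNorm

  # Softmax modules; shown for layer 0. These stanzas have no bias_format.
  model.decoder.layers.0.self_attn.softmax:
    approximation_function: NONE
    input_format: SAME
    instance: Softmax
    output_format: SAME

The same edit, an empty approximation_function set explicitly to NONE, applies to the final_layer_norm, self_attn.softmax, and self_attn_layer_norm stanzas of decoder layers 0 through 23.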