Upload processor
#8
by
ylacombe
- opened
- added_tokens.json +0 -3
- preprocessor_config.json +98 -98
- special_tokens_map.json +5 -4
- tokenizer_config.json +5 -128
added_tokens.json
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
{
|
2 |
"</s>": 3,
|
3 |
-
"<MINED_DATA>": 256099,
|
4 |
-
"<MMT_BT_DATA>": 256100,
|
5 |
-
"<SMT_BT_DATA>": 256101,
|
6 |
"<pad>": 0,
|
7 |
"<s>": 2,
|
8 |
"<unk>": 1,
|
|
|
1 |
{
|
2 |
"</s>": 3,
|
|
|
|
|
|
|
3 |
"<pad>": 0,
|
4 |
"<s>": 2,
|
5 |
"<unk>": 1,
|
preprocessor_config.json
CHANGED
@@ -2,104 +2,104 @@
|
|
2 |
"feature_extractor_type": "SeamlessM4TFeatureExtractor",
|
3 |
"feature_size": 80,
|
4 |
"language_code": [
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"
|
15 |
-
"
|
16 |
-
"
|
17 |
-
"
|
18 |
-
"
|
19 |
-
"
|
20 |
-
"
|
21 |
-
"
|
22 |
-
"
|
23 |
-
"
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"
|
37 |
-
"
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
"
|
45 |
-
"
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
-
"
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"
|
62 |
-
"
|
63 |
-
"
|
64 |
-
"
|
65 |
-
"
|
66 |
-
"
|
67 |
-
"
|
68 |
-
"
|
69 |
-
"
|
70 |
-
"
|
71 |
-
"
|
72 |
-
"
|
73 |
-
"
|
74 |
-
"
|
75 |
-
"
|
76 |
-
"
|
77 |
-
"
|
78 |
-
"
|
79 |
-
"
|
80 |
-
"
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"
|
88 |
-
"
|
89 |
-
"
|
90 |
-
"
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
94 |
-
"
|
95 |
-
"
|
96 |
-
"
|
97 |
-
"
|
98 |
-
"
|
99 |
-
"
|
100 |
-
"
|
101 |
-
"
|
102 |
-
"
|
103 |
],
|
104 |
"num_mel_bins": 80,
|
105 |
"padding_side": "right",
|
|
|
2 |
"feature_extractor_type": "SeamlessM4TFeatureExtractor",
|
3 |
"feature_size": 80,
|
4 |
"language_code": [
|
5 |
+
"__afr__",
|
6 |
+
"__amh__",
|
7 |
+
"__arb__",
|
8 |
+
"__ary__",
|
9 |
+
"__arz__",
|
10 |
+
"__asm__",
|
11 |
+
"__azj__",
|
12 |
+
"__bel__",
|
13 |
+
"__ben__",
|
14 |
+
"__bos__",
|
15 |
+
"__bul__",
|
16 |
+
"__cat__",
|
17 |
+
"__ceb__",
|
18 |
+
"__ces__",
|
19 |
+
"__ckb__",
|
20 |
+
"__cmn__",
|
21 |
+
"__cmn_Hant__",
|
22 |
+
"__cym__",
|
23 |
+
"__dan__",
|
24 |
+
"__deu__",
|
25 |
+
"__ell__",
|
26 |
+
"__eng__",
|
27 |
+
"__est__",
|
28 |
+
"__eus__",
|
29 |
+
"__fin__",
|
30 |
+
"__fra__",
|
31 |
+
"__fuv__",
|
32 |
+
"__gaz__",
|
33 |
+
"__gle__",
|
34 |
+
"__glg__",
|
35 |
+
"__guj__",
|
36 |
+
"__heb__",
|
37 |
+
"__hin__",
|
38 |
+
"__hrv__",
|
39 |
+
"__hun__",
|
40 |
+
"__hye__",
|
41 |
+
"__ibo__",
|
42 |
+
"__ind__",
|
43 |
+
"__isl__",
|
44 |
+
"__ita__",
|
45 |
+
"__jav__",
|
46 |
+
"__jpn__",
|
47 |
+
"__kan__",
|
48 |
+
"__kat__",
|
49 |
+
"__kaz__",
|
50 |
+
"__khk__",
|
51 |
+
"__khm__",
|
52 |
+
"__kir__",
|
53 |
+
"__kor__",
|
54 |
+
"__lao__",
|
55 |
+
"__lit__",
|
56 |
+
"__lug__",
|
57 |
+
"__luo__",
|
58 |
+
"__lvs__",
|
59 |
+
"__mai__",
|
60 |
+
"__mal__",
|
61 |
+
"__mar__",
|
62 |
+
"__mkd__",
|
63 |
+
"__mlt__",
|
64 |
+
"__mni__",
|
65 |
+
"__mya__",
|
66 |
+
"__nld__",
|
67 |
+
"__nno__",
|
68 |
+
"__nob__",
|
69 |
+
"__npi__",
|
70 |
+
"__nya__",
|
71 |
+
"__ory__",
|
72 |
+
"__pan__",
|
73 |
+
"__pbt__",
|
74 |
+
"__pes__",
|
75 |
+
"__pol__",
|
76 |
+
"__por__",
|
77 |
+
"__ron__",
|
78 |
+
"__rus__",
|
79 |
+
"__sat__",
|
80 |
+
"__slk__",
|
81 |
+
"__slv__",
|
82 |
+
"__sna__",
|
83 |
+
"__snd__",
|
84 |
+
"__som__",
|
85 |
+
"__spa__",
|
86 |
+
"__srp__",
|
87 |
+
"__swe__",
|
88 |
+
"__swh__",
|
89 |
+
"__tam__",
|
90 |
+
"__tel__",
|
91 |
+
"__tgk__",
|
92 |
+
"__tgl__",
|
93 |
+
"__tha__",
|
94 |
+
"__tur__",
|
95 |
+
"__ukr__",
|
96 |
+
"__urd__",
|
97 |
+
"__uzn__",
|
98 |
+
"__vie__",
|
99 |
+
"__yor__",
|
100 |
+
"__yue__",
|
101 |
+
"__zlm__",
|
102 |
+
"__zul__"
|
103 |
],
|
104 |
"num_mel_bins": 80,
|
105 |
"padding_side": "right",
|
special_tokens_map.json
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
{
|
2 |
"additional_special_tokens": [
|
|
|
|
|
|
|
|
|
3 |
"__afr__",
|
4 |
"__amh__",
|
5 |
"__arb__",
|
@@ -97,10 +101,7 @@
|
|
97 |
"__yor__",
|
98 |
"__yue__",
|
99 |
"__zlm__",
|
100 |
-
"__zul__",
|
101 |
-
"<MINED_DATA>",
|
102 |
-
"<MMT_BT_DATA>",
|
103 |
-
"<SMT_BT_DATA>"
|
104 |
],
|
105 |
"bos_token": "<s>",
|
106 |
"cls_token": "<s>",
|
|
|
1 |
{
|
2 |
"additional_special_tokens": [
|
3 |
+
"<pad>",
|
4 |
+
"<unk>",
|
5 |
+
"<s>",
|
6 |
+
"</s>",
|
7 |
"__afr__",
|
8 |
"__amh__",
|
9 |
"__arb__",
|
|
|
101 |
"__yor__",
|
102 |
"__yue__",
|
103 |
"__zlm__",
|
104 |
+
"__zul__"
|
|
|
|
|
|
|
105 |
],
|
106 |
"bos_token": "<s>",
|
107 |
"cls_token": "<s>",
|
tokenizer_config.json
CHANGED
@@ -815,33 +815,13 @@
|
|
815 |
"rstrip": true,
|
816 |
"single_word": false,
|
817 |
"special": true
|
818 |
-
},
|
819 |
-
"256099": {
|
820 |
-
"content": "<MINED_DATA>",
|
821 |
-
"lstrip": true,
|
822 |
-
"normalized": false,
|
823 |
-
"rstrip": true,
|
824 |
-
"single_word": false,
|
825 |
-
"special": true
|
826 |
-
},
|
827 |
-
"256100": {
|
828 |
-
"content": "<MMT_BT_DATA>",
|
829 |
-
"lstrip": true,
|
830 |
-
"normalized": false,
|
831 |
-
"rstrip": true,
|
832 |
-
"single_word": false,
|
833 |
-
"special": true
|
834 |
-
},
|
835 |
-
"256101": {
|
836 |
-
"content": "<SMT_BT_DATA>",
|
837 |
-
"lstrip": true,
|
838 |
-
"normalized": false,
|
839 |
-
"rstrip": true,
|
840 |
-
"single_word": false,
|
841 |
-
"special": true
|
842 |
}
|
843 |
},
|
844 |
"additional_special_tokens": [
|
|
|
|
|
|
|
|
|
845 |
"__afr__",
|
846 |
"__amh__",
|
847 |
"__arb__",
|
@@ -939,115 +919,12 @@
|
|
939 |
"__yor__",
|
940 |
"__yue__",
|
941 |
"__zlm__",
|
942 |
-
"__zul__",
|
943 |
-
"<MINED_DATA>",
|
944 |
-
"<MMT_BT_DATA>",
|
945 |
-
"<SMT_BT_DATA>"
|
946 |
],
|
947 |
"bos_token": "<s>",
|
948 |
"clean_up_tokenization_spaces": true,
|
949 |
"cls_token": "<s>",
|
950 |
"eos_token": "</s>",
|
951 |
-
"language_code": [
|
952 |
-
"afr",
|
953 |
-
"amh",
|
954 |
-
"arb",
|
955 |
-
"ary",
|
956 |
-
"arz",
|
957 |
-
"asm",
|
958 |
-
"azj",
|
959 |
-
"bel",
|
960 |
-
"ben",
|
961 |
-
"bos",
|
962 |
-
"bul",
|
963 |
-
"cat",
|
964 |
-
"ceb",
|
965 |
-
"ces",
|
966 |
-
"ckb",
|
967 |
-
"cmn",
|
968 |
-
"cmn_Hant",
|
969 |
-
"cym",
|
970 |
-
"dan",
|
971 |
-
"deu",
|
972 |
-
"ell",
|
973 |
-
"eng",
|
974 |
-
"est",
|
975 |
-
"eus",
|
976 |
-
"fin",
|
977 |
-
"fra",
|
978 |
-
"fuv",
|
979 |
-
"gaz",
|
980 |
-
"gle",
|
981 |
-
"glg",
|
982 |
-
"guj",
|
983 |
-
"heb",
|
984 |
-
"hin",
|
985 |
-
"hrv",
|
986 |
-
"hun",
|
987 |
-
"hye",
|
988 |
-
"ibo",
|
989 |
-
"ind",
|
990 |
-
"isl",
|
991 |
-
"ita",
|
992 |
-
"jav",
|
993 |
-
"jpn",
|
994 |
-
"kan",
|
995 |
-
"kat",
|
996 |
-
"kaz",
|
997 |
-
"khk",
|
998 |
-
"khm",
|
999 |
-
"kir",
|
1000 |
-
"kor",
|
1001 |
-
"lao",
|
1002 |
-
"lit",
|
1003 |
-
"lug",
|
1004 |
-
"luo",
|
1005 |
-
"lvs",
|
1006 |
-
"mai",
|
1007 |
-
"mal",
|
1008 |
-
"mar",
|
1009 |
-
"mkd",
|
1010 |
-
"mlt",
|
1011 |
-
"mni",
|
1012 |
-
"mya",
|
1013 |
-
"nld",
|
1014 |
-
"nno",
|
1015 |
-
"nob",
|
1016 |
-
"npi",
|
1017 |
-
"nya",
|
1018 |
-
"ory",
|
1019 |
-
"pan",
|
1020 |
-
"pbt",
|
1021 |
-
"pes",
|
1022 |
-
"pol",
|
1023 |
-
"por",
|
1024 |
-
"ron",
|
1025 |
-
"rus",
|
1026 |
-
"sat",
|
1027 |
-
"slk",
|
1028 |
-
"slv",
|
1029 |
-
"sna",
|
1030 |
-
"snd",
|
1031 |
-
"som",
|
1032 |
-
"spa",
|
1033 |
-
"srp",
|
1034 |
-
"swe",
|
1035 |
-
"swh",
|
1036 |
-
"tam",
|
1037 |
-
"tel",
|
1038 |
-
"tgk",
|
1039 |
-
"tgl",
|
1040 |
-
"tha",
|
1041 |
-
"tur",
|
1042 |
-
"ukr",
|
1043 |
-
"urd",
|
1044 |
-
"uzn",
|
1045 |
-
"vie",
|
1046 |
-
"yor",
|
1047 |
-
"yue",
|
1048 |
-
"zlm",
|
1049 |
-
"zul"
|
1050 |
-
],
|
1051 |
"model_max_length": 1000000000000000019884624838656,
|
1052 |
"pad_token": "<pad>",
|
1053 |
"processor_class": "SeamlessM4TProcessor",
|
|
|
815 |
"rstrip": true,
|
816 |
"single_word": false,
|
817 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
818 |
}
|
819 |
},
|
820 |
"additional_special_tokens": [
|
821 |
+
"<pad>",
|
822 |
+
"<unk>",
|
823 |
+
"<s>",
|
824 |
+
"</s>",
|
825 |
"__afr__",
|
826 |
"__amh__",
|
827 |
"__arb__",
|
|
|
919 |
"__yor__",
|
920 |
"__yue__",
|
921 |
"__zlm__",
|
922 |
+
"__zul__"
|
|
|
|
|
|
|
923 |
],
|
924 |
"bos_token": "<s>",
|
925 |
"clean_up_tokenization_spaces": true,
|
926 |
"cls_token": "<s>",
|
927 |
"eos_token": "</s>",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
928 |
"model_max_length": 1000000000000000019884624838656,
|
929 |
"pad_token": "<pad>",
|
930 |
"processor_class": "SeamlessM4TProcessor",
|