added_tokens.json CHANGED
@@ -1,8 +1,5 @@
1
  {
2
  "</s>": 3,
3
- "<MINED_DATA>": 256099,
4
- "<MMT_BT_DATA>": 256100,
5
- "<SMT_BT_DATA>": 256101,
6
  "<pad>": 0,
7
  "<s>": 2,
8
  "<unk>": 1,
 
1
  {
2
  "</s>": 3,
 
 
 
3
  "<pad>": 0,
4
  "<s>": 2,
5
  "<unk>": 1,
preprocessor_config.json CHANGED
@@ -2,104 +2,104 @@
2
  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
  "feature_size": 80,
4
  "language_code": [
5
- "afr",
6
- "amh",
7
- "arb",
8
- "ary",
9
- "arz",
10
- "asm",
11
- "azj",
12
- "bel",
13
- "ben",
14
- "bos",
15
- "bul",
16
- "cat",
17
- "ceb",
18
- "ces",
19
- "ckb",
20
- "cmn",
21
- "cmn_Hant",
22
- "cym",
23
- "dan",
24
- "deu",
25
- "ell",
26
- "eng",
27
- "est",
28
- "eus",
29
- "fin",
30
- "fra",
31
- "fuv",
32
- "gaz",
33
- "gle",
34
- "glg",
35
- "guj",
36
- "heb",
37
- "hin",
38
- "hrv",
39
- "hun",
40
- "hye",
41
- "ibo",
42
- "ind",
43
- "isl",
44
- "ita",
45
- "jav",
46
- "jpn",
47
- "kan",
48
- "kat",
49
- "kaz",
50
- "khk",
51
- "khm",
52
- "kir",
53
- "kor",
54
- "lao",
55
- "lit",
56
- "lug",
57
- "luo",
58
- "lvs",
59
- "mai",
60
- "mal",
61
- "mar",
62
- "mkd",
63
- "mlt",
64
- "mni",
65
- "mya",
66
- "nld",
67
- "nno",
68
- "nob",
69
- "npi",
70
- "nya",
71
- "ory",
72
- "pan",
73
- "pbt",
74
- "pes",
75
- "pol",
76
- "por",
77
- "ron",
78
- "rus",
79
- "sat",
80
- "slk",
81
- "slv",
82
- "sna",
83
- "snd",
84
- "som",
85
- "spa",
86
- "srp",
87
- "swe",
88
- "swh",
89
- "tam",
90
- "tel",
91
- "tgk",
92
- "tgl",
93
- "tha",
94
- "tur",
95
- "ukr",
96
- "urd",
97
- "uzn",
98
- "vie",
99
- "yor",
100
- "yue",
101
- "zlm",
102
- "zul"
103
  ],
104
  "num_mel_bins": 80,
105
  "padding_side": "right",
 
2
  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
  "feature_size": 80,
4
  "language_code": [
5
+ "__afr__",
6
+ "__amh__",
7
+ "__arb__",
8
+ "__ary__",
9
+ "__arz__",
10
+ "__asm__",
11
+ "__azj__",
12
+ "__bel__",
13
+ "__ben__",
14
+ "__bos__",
15
+ "__bul__",
16
+ "__cat__",
17
+ "__ceb__",
18
+ "__ces__",
19
+ "__ckb__",
20
+ "__cmn__",
21
+ "__cmn_Hant__",
22
+ "__cym__",
23
+ "__dan__",
24
+ "__deu__",
25
+ "__ell__",
26
+ "__eng__",
27
+ "__est__",
28
+ "__eus__",
29
+ "__fin__",
30
+ "__fra__",
31
+ "__fuv__",
32
+ "__gaz__",
33
+ "__gle__",
34
+ "__glg__",
35
+ "__guj__",
36
+ "__heb__",
37
+ "__hin__",
38
+ "__hrv__",
39
+ "__hun__",
40
+ "__hye__",
41
+ "__ibo__",
42
+ "__ind__",
43
+ "__isl__",
44
+ "__ita__",
45
+ "__jav__",
46
+ "__jpn__",
47
+ "__kan__",
48
+ "__kat__",
49
+ "__kaz__",
50
+ "__khk__",
51
+ "__khm__",
52
+ "__kir__",
53
+ "__kor__",
54
+ "__lao__",
55
+ "__lit__",
56
+ "__lug__",
57
+ "__luo__",
58
+ "__lvs__",
59
+ "__mai__",
60
+ "__mal__",
61
+ "__mar__",
62
+ "__mkd__",
63
+ "__mlt__",
64
+ "__mni__",
65
+ "__mya__",
66
+ "__nld__",
67
+ "__nno__",
68
+ "__nob__",
69
+ "__npi__",
70
+ "__nya__",
71
+ "__ory__",
72
+ "__pan__",
73
+ "__pbt__",
74
+ "__pes__",
75
+ "__pol__",
76
+ "__por__",
77
+ "__ron__",
78
+ "__rus__",
79
+ "__sat__",
80
+ "__slk__",
81
+ "__slv__",
82
+ "__sna__",
83
+ "__snd__",
84
+ "__som__",
85
+ "__spa__",
86
+ "__srp__",
87
+ "__swe__",
88
+ "__swh__",
89
+ "__tam__",
90
+ "__tel__",
91
+ "__tgk__",
92
+ "__tgl__",
93
+ "__tha__",
94
+ "__tur__",
95
+ "__ukr__",
96
+ "__urd__",
97
+ "__uzn__",
98
+ "__vie__",
99
+ "__yor__",
100
+ "__yue__",
101
+ "__zlm__",
102
+ "__zul__"
103
  ],
104
  "num_mel_bins": 80,
105
  "padding_side": "right",
special_tokens_map.json CHANGED
@@ -1,5 +1,9 @@
1
  {
2
  "additional_special_tokens": [
 
 
 
 
3
  "__afr__",
4
  "__amh__",
5
  "__arb__",
@@ -97,10 +101,7 @@
97
  "__yor__",
98
  "__yue__",
99
  "__zlm__",
100
- "__zul__",
101
- "<MINED_DATA>",
102
- "<MMT_BT_DATA>",
103
- "<SMT_BT_DATA>"
104
  ],
105
  "bos_token": "<s>",
106
  "cls_token": "<s>",
 
1
  {
2
  "additional_special_tokens": [
3
+ "<pad>",
4
+ "<unk>",
5
+ "<s>",
6
+ "</s>",
7
  "__afr__",
8
  "__amh__",
9
  "__arb__",
 
101
  "__yor__",
102
  "__yue__",
103
  "__zlm__",
104
+ "__zul__"
 
 
 
105
  ],
106
  "bos_token": "<s>",
107
  "cls_token": "<s>",
tokenizer_config.json CHANGED
@@ -815,33 +815,13 @@
815
  "rstrip": true,
816
  "single_word": false,
817
  "special": true
818
- },
819
- "256099": {
820
- "content": "<MINED_DATA>",
821
- "lstrip": true,
822
- "normalized": false,
823
- "rstrip": true,
824
- "single_word": false,
825
- "special": true
826
- },
827
- "256100": {
828
- "content": "<MMT_BT_DATA>",
829
- "lstrip": true,
830
- "normalized": false,
831
- "rstrip": true,
832
- "single_word": false,
833
- "special": true
834
- },
835
- "256101": {
836
- "content": "<SMT_BT_DATA>",
837
- "lstrip": true,
838
- "normalized": false,
839
- "rstrip": true,
840
- "single_word": false,
841
- "special": true
842
  }
843
  },
844
  "additional_special_tokens": [
 
 
 
 
845
  "__afr__",
846
  "__amh__",
847
  "__arb__",
@@ -939,115 +919,12 @@
939
  "__yor__",
940
  "__yue__",
941
  "__zlm__",
942
- "__zul__",
943
- "<MINED_DATA>",
944
- "<MMT_BT_DATA>",
945
- "<SMT_BT_DATA>"
946
  ],
947
  "bos_token": "<s>",
948
  "clean_up_tokenization_spaces": true,
949
  "cls_token": "<s>",
950
  "eos_token": "</s>",
951
- "language_code": [
952
- "afr",
953
- "amh",
954
- "arb",
955
- "ary",
956
- "arz",
957
- "asm",
958
- "azj",
959
- "bel",
960
- "ben",
961
- "bos",
962
- "bul",
963
- "cat",
964
- "ceb",
965
- "ces",
966
- "ckb",
967
- "cmn",
968
- "cmn_Hant",
969
- "cym",
970
- "dan",
971
- "deu",
972
- "ell",
973
- "eng",
974
- "est",
975
- "eus",
976
- "fin",
977
- "fra",
978
- "fuv",
979
- "gaz",
980
- "gle",
981
- "glg",
982
- "guj",
983
- "heb",
984
- "hin",
985
- "hrv",
986
- "hun",
987
- "hye",
988
- "ibo",
989
- "ind",
990
- "isl",
991
- "ita",
992
- "jav",
993
- "jpn",
994
- "kan",
995
- "kat",
996
- "kaz",
997
- "khk",
998
- "khm",
999
- "kir",
1000
- "kor",
1001
- "lao",
1002
- "lit",
1003
- "lug",
1004
- "luo",
1005
- "lvs",
1006
- "mai",
1007
- "mal",
1008
- "mar",
1009
- "mkd",
1010
- "mlt",
1011
- "mni",
1012
- "mya",
1013
- "nld",
1014
- "nno",
1015
- "nob",
1016
- "npi",
1017
- "nya",
1018
- "ory",
1019
- "pan",
1020
- "pbt",
1021
- "pes",
1022
- "pol",
1023
- "por",
1024
- "ron",
1025
- "rus",
1026
- "sat",
1027
- "slk",
1028
- "slv",
1029
- "sna",
1030
- "snd",
1031
- "som",
1032
- "spa",
1033
- "srp",
1034
- "swe",
1035
- "swh",
1036
- "tam",
1037
- "tel",
1038
- "tgk",
1039
- "tgl",
1040
- "tha",
1041
- "tur",
1042
- "ukr",
1043
- "urd",
1044
- "uzn",
1045
- "vie",
1046
- "yor",
1047
- "yue",
1048
- "zlm",
1049
- "zul"
1050
- ],
1051
  "model_max_length": 1000000000000000019884624838656,
1052
  "pad_token": "<pad>",
1053
  "processor_class": "SeamlessM4TProcessor",
 
815
  "rstrip": true,
816
  "single_word": false,
817
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  }
819
  },
820
  "additional_special_tokens": [
821
+ "<pad>",
822
+ "<unk>",
823
+ "<s>",
824
+ "</s>",
825
  "__afr__",
826
  "__amh__",
827
  "__arb__",
 
919
  "__yor__",
920
  "__yue__",
921
  "__zlm__",
922
+ "__zul__"
 
 
 
923
  ],
924
  "bos_token": "<s>",
925
  "clean_up_tokenization_spaces": true,
926
  "cls_token": "<s>",
927
  "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928
  "model_max_length": 1000000000000000019884624838656,
929
  "pad_token": "<pad>",
930
  "processor_class": "SeamlessM4TProcessor",