xlm-mlm-100-1280 / tokenizer_config.json
Xenova's picture
Xenova HF staff
Upload folder using huggingface_hub
505c875
{
"additional_special_tokens": [
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>"
],
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "</s>",
"do_lowercase_and_remove_accent": false,
"id2lang": {
"0": "af",
"1": "als",
"10": "be",
"11": "bg",
"12": "bn",
"13": "br",
"14": "bs",
"15": "ca",
"16": "ceb",
"17": "ckb",
"18": "cs",
"19": "cy",
"2": "am",
"20": "da",
"21": "de",
"22": "el",
"23": "en",
"24": "eo",
"25": "es",
"26": "et",
"27": "eu",
"28": "fa",
"29": "fi",
"3": "an",
"30": "fr",
"31": "fy",
"32": "ga",
"33": "gan",
"34": "gl",
"35": "gu",
"36": "he",
"37": "hi",
"38": "hr",
"39": "hu",
"4": "ang",
"40": "hy",
"41": "ia",
"42": "id",
"43": "is",
"44": "it",
"45": "ja",
"46": "jv",
"47": "ka",
"48": "kk",
"49": "kn",
"5": "ar",
"50": "ko",
"51": "ku",
"52": "la",
"53": "lb",
"54": "lt",
"55": "lv",
"56": "mk",
"57": "ml",
"58": "mn",
"59": "mr",
"6": "arz",
"60": "ms",
"61": "my",
"62": "nds",
"63": "ne",
"64": "nl",
"65": "nn",
"66": "no",
"67": "oc",
"68": "pl",
"69": "pt",
"7": "ast",
"70": "ro",
"71": "ru",
"72": "scn",
"73": "sco",
"74": "sh",
"75": "si",
"76": "simple",
"77": "sk",
"78": "sl",
"79": "sq",
"8": "az",
"80": "sr",
"81": "sv",
"82": "sw",
"83": "ta",
"84": "te",
"85": "th",
"86": "tl",
"87": "tr",
"88": "tt",
"89": "uk",
"9": "bar",
"90": "ur",
"91": "uz",
"92": "vi",
"93": "war",
"94": "wuu",
"95": "yi",
"96": "zh",
"97": "zh_classical",
"98": "zh_min_nan",
"99": "zh_yue"
},
"lang2id": {
"af": 0,
"als": 1,
"am": 2,
"an": 3,
"ang": 4,
"ar": 5,
"arz": 6,
"ast": 7,
"az": 8,
"bar": 9,
"be": 10,
"bg": 11,
"bn": 12,
"br": 13,
"bs": 14,
"ca": 15,
"ceb": 16,
"ckb": 17,
"cs": 18,
"cy": 19,
"da": 20,
"de": 21,
"el": 22,
"en": 23,
"eo": 24,
"es": 25,
"et": 26,
"eu": 27,
"fa": 28,
"fi": 29,
"fr": 30,
"fy": 31,
"ga": 32,
"gan": 33,
"gl": 34,
"gu": 35,
"he": 36,
"hi": 37,
"hr": 38,
"hu": 39,
"hy": 40,
"ia": 41,
"id": 42,
"is": 43,
"it": 44,
"ja": 45,
"jv": 46,
"ka": 47,
"kk": 48,
"kn": 49,
"ko": 50,
"ku": 51,
"la": 52,
"lb": 53,
"lt": 54,
"lv": 55,
"mk": 56,
"ml": 57,
"mn": 58,
"mr": 59,
"ms": 60,
"my": 61,
"nds": 62,
"ne": 63,
"nl": 64,
"nn": 65,
"no": 66,
"oc": 67,
"pl": 68,
"pt": 69,
"ro": 70,
"ru": 71,
"scn": 72,
"sco": 73,
"sh": 74,
"si": 75,
"simple": 76,
"sk": 77,
"sl": 78,
"sq": 79,
"sr": 80,
"sv": 81,
"sw": 82,
"ta": 83,
"te": 84,
"th": 85,
"tl": 86,
"tr": 87,
"tt": 88,
"uk": 89,
"ur": 90,
"uz": 91,
"vi": 92,
"war": 93,
"wuu": 94,
"yi": 95,
"zh": 96,
"zh_classical": 97,
"zh_min_nan": 98,
"zh_yue": 99
},
"mask_token": "<special1>",
"model_max_length": 512,
"pad_token": "<pad>",
"sep_token": "</s>",
"tokenizer_class": "XLMTokenizer",
"unk_token": "<unk>"
}