frpron / tokenizer.json
Marxav's picture
add tokenizer
4d82991
raw
history blame
5.48 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 253,
"content": ">",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 254,
"content": "<",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 255,
"content": "#",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"vocab": {
"<|endoftext|>": 0,
"\n": 1,
" ": 2,
"!": 3,
"&": 4,
"'": 5,
"(": 6,
")": 7,
"*": 8,
"+": 9,
"-": 10,
".": 11,
"/": 12,
"0": 13,
"1": 14,
"2": 15,
"3": 16,
"4": 17,
"5": 18,
"6": 19,
"7": 20,
"8": 21,
"9": 22,
":": 23,
";": 24,
"@": 25,
"A": 26,
"B": 27,
"C": 28,
"D": 29,
"E": 30,
"F": 31,
"G": 32,
"H": 33,
"I": 34,
"J": 35,
"K": 36,
"L": 37,
"M": 38,
"N": 39,
"O": 40,
"P": 41,
"Q": 42,
"R": 43,
"S": 44,
"T": 45,
"U": 46,
"V": 47,
"W": 48,
"X": 49,
"Y": 50,
"Z": 51,
"a": 52,
"b": 53,
"c": 54,
"d": 55,
"e": 56,
"f": 57,
"g": 58,
"h": 59,
"i": 60,
"j": 61,
"k": 62,
"l": 63,
"m": 64,
"n": 65,
"o": 66,
"p": 67,
"q": 68,
"r": 69,
"s": 70,
"t": 71,
"u": 72,
"v": 73,
"w": 74,
"x": 75,
"y": 76,
"z": 77,
"¡": 78,
"¢": 79,
"£": 80,
"¤": 81,
"¥": 82,
"¦": 83,
"§": 84,
"¨": 85,
"©": 86,
"ª": 87,
"«": 88,
"¬": 89,
"®": 90,
"¯": 91,
"°": 92,
"±": 93,
"²": 94,
"³": 95,
"´": 96,
"µ": 97,
"¶": 98,
"·": 99,
"¸": 100,
"¹": 101,
"º": 102,
"»": 103,
"¼": 104,
"½": 105,
"¾": 106,
"¿": 107,
"Á": 108,
"Â": 109,
"Ã": 110,
"Ä": 111,
"Å": 112,
"Æ": 113,
"Ç": 114,
"È": 115,
"É": 116,
"Ê": 117,
"Ì": 118,
"Î": 119,
"Ï": 120,
"Ð": 121,
"Ô": 122,
"Ö": 123,
"×": 124,
"Ü": 125,
"ß": 126,
"à": 127,
"á": 128,
"â": 129,
"ã": 130,
"ä": 131,
"å": 132,
"æ": 133,
"ç": 134,
"è": 135,
"é": 136,
"ê": 137,
"ë": 138,
"ì": 139,
"í": 140,
"î": 141,
"ï": 142,
"ñ": 143,
"ò": 144,
"ó": 145,
"ô": 146,
"õ": 147,
"ö": 148,
"ø": 149,
"ù": 150,
"ú": 151,
"û": 152,
"ü": 153,
"ý": 154,
"ÿ": 155,
"ā": 156,
"ć": 157,
"č": 158,
"Ğ": 159,
"ğ": 160,
"Ġ": 161,
"Ģ": 162,
"ģ": 163,
"Ĥ": 164,
"ĥ": 165,
"Ĩ": 166,
"ĩ": 167,
"Ī": 168,
"ī": 169,
"Ĭ": 170,
"ĭ": 171,
"į": 172,
"İ": 173,
"ı": 174,
"Ĳ": 175,
"ĳ": 176,
"Ĵ": 177,
"ĵ": 178,
"Ķ": 179,
"ĸ": 180,
"Ĺ": 181,
"ĺ": 182,
"Ļ": 183,
"ļ": 184,
"Ľ": 185,
"ľ": 186,
"Ŀ": 187,
"ŀ": 188,
"Ł": 189,
"ł": 190,
"Ń": 191,
"ŋ": 192,
"ō": 193,
"Œ": 194,
"œ": 195,
"Ś": 196,
"ŝ": 197,
"Ş": 198,
"ş": 199,
"š": 200,
"ū": 201,
"ŵ": 202,
"ź": 203,
"Ž": 204,
"ž": 205,
"ǘ": 206,
"ȝ": 207,
"ɑ": 208,
"ɔ": 209,
"ə": 210,
"ɛ": 211,
"ɡ": 212,
"ɣ": 213,
"ɥ": 214,
"ɨ": 215,
"ɩ": 216,
"ɲ": 217,
"ʁ": 218,
"ʃ": 219,
"ʋ": 220,
"ʒ": 221,
"ʔ": 222,
"ʻ": 223,
"ʼ": 224,
"ʾ": 225,
"ʿ": 226,
"́": 227,
"̃": 228,
"̐": 229,
"̓": 230,
"͠": 231,
"Δ": 232,
"α": 233,
"β": 234,
"μ": 235,
"σ": 236,
"а": 237,
"е": 238,
"ḷ": 239,
"ṃ": 240,
"ṇ": 241,
"ṣ": 242,
"ṭ": 243,
"–": 244,
"‘": 245,
"’": 246,
"′": 247,
"‿": 248,
"₂": 249,
"€": 250,
"−": 251,
"∴": 252
},
"merges": []
}
}