{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<|endoftext|>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|endoftext|>": 0,
"\u0000": 1,
"\u0001": 2,
"\u0002": 3,
"\u0003": 4,
"\u0004": 5,
"\u0005": 6,
"\u0006": 7,
"\u0007": 8,
"\b": 9,
"\t": 10,
"\n": 11,
"\u000b": 12,
"\f": 13,
"\r": 14,
"\u000e": 15,
"\u000f": 16,
"\u0010": 17,
"\u0011": 18,
"\u0012": 19,
"\u0013": 20,
"\u0014": 21,
"\u0015": 22,
"\u0016": 23,
"\u0017": 24,
"\u0018": 25,
"\u0019": 26,
"\u001a": 27,
"\u001b": 28,
"\u001c": 29,
"\u001d": 30,
"\u001e": 31,
"\u001f": 32,
" ": 33,
"!": 34,
"\"": 35,
"#": 36,
"$": 37,
"%": 38,
"&": 39,
"'": 40,
"(": 41,
")": 42,
"*": 43,
"+": 44,
",": 45,
"-": 46,
".": 47,
"/": 48,
"0": 49,
"1": 50,
"2": 51,
"3": 52,
"4": 53,
"5": 54,
"6": 55,
"7": 56,
"8": 57,
"9": 58,
":": 59,
";": 60,
"<": 61,
"=": 62,
">": 63,
"?": 64,
"@": 65,
"A": 66,
"B": 67,
"C": 68,
"D": 69,
"E": 70,
"F": 71,
"G": 72,
"H": 73,
"I": 74,
"J": 75,
"K": 76,
"L": 77,
"M": 78,
"N": 79,
"O": 80,
"P": 81,
"Q": 82,
"R": 83,
"S": 84,
"T": 85,
"U": 86,
"V": 87,
"W": 88,
"X": 89,
"Y": 90,
"Z": 91,
"[": 92,
"\\": 93,
"]": 94,
"^": 95,
"_": 96,
"`": 97,
"a": 98,
"b": 99,
"c": 100,
"d": 101,
"e": 102,
"f": 103,
"g": 104,
"h": 105,
"i": 106,
"j": 107,
"k": 108,
"l": 109,
"m": 110,
"n": 111,
"o": 112,
"p": 113,
"q": 114,
"r": 115,
"s": 116,
"t": 117,
"u": 118,
"v": 119,
"w": 120,
"x": 121,
"y": 122,
"z": 123,
"{": 124,
"|": 125,
"}": 126,
"~": 127,
"": 128,
"€": 129,
"": 130,
"‚": 131,
"ƒ": 132,
"„": 133,
"…": 134,
"†": 135,
"‡": 136,
"ˆ": 137,
"‰": 138,
"Š": 139,
"‹": 140,
"Œ": 141,
"": 142,
"Ž": 143,
"": 144,
"": 145,
"‘": 146,
"’": 147,
"“": 148,
"”": 149,
"•": 150,
"–": 151,
"—": 152,
"˜": 153,
"™": 154,
"š": 155,
"›": 156,
"œ": 157,
"": 158,
"ž": 159,
"Ÿ": 160,
" ": 161,
"¡": 162,
"¢": 163,
"£": 164,
"¤": 165,
"¥": 166,
"¦": 167,
"§": 168,
"¨": 169,
"©": 170,
"ª": 171,
"«": 172,
"¬": 173,
"­": 174,
"®": 175,
"¯": 176,
"°": 177,
"±": 178,
"²": 179,
"³": 180,
"´": 181,
"µ": 182,
"¶": 183,
"·": 184,
"¸": 185,
"¹": 186,
"º": 187,
"»": 188,
"¼": 189,
"½": 190,
"¾": 191,
"¿": 192,
"À": 193,
"Á": 194,
"Â": 195,
"Ã": 196,
"Ä": 197,
"Å": 198,
"Æ": 199,
"Ç": 200,
"È": 201,
"É": 202,
"Ê": 203,
"Ë": 204,
"Ì": 205,
"Í": 206,
"Î": 207,
"Ï": 208,
"Ð": 209,
"Ñ": 210,
"Ò": 211,
"Ó": 212,
"Ô": 213,
"Õ": 214,
"Ö": 215,
"×": 216,
"Ø": 217,
"Ù": 218,
"Ú": 219,
"Û": 220,
"Ü": 221,
"Ý": 222,
"Þ": 223,
"ß": 224,
"à": 225,
"á": 226,
"â": 227,
"ã": 228,
"ä": 229,
"å": 230,
"æ": 231,
"ç": 232,
"è": 233,
"é": 234,
"ê": 235,
"ë": 236,
"ì": 237,
"í": 238,
"î": 239,
"ï": 240,
"ð": 241,
"ñ": 242,
"ò": 243,
"ó": 244,
"ô": 245,
"õ": 246,
"ö": 247,
"÷": 248,
"ø": 249,
"ù": 250,
"ú": 251,
"û": 252,
"ü": 253,
"ý": 254,
"þ": 255,
"ÿ": 256,
"Ċ": 257,
"Ġ": 258,
"Ġt": 259,
"he": 260,
"Ġa": 261,
"Ġs": 262,
"nd": 263,
"Ġw": 264,
"Ġthe": 265,
"ed": 266,
"Ġb": 267,
"Ġto": 268,
"Ġand": 269,
"Ġh": 270,
"Ġf": 271,
"ĠT": 272,
"in": 273,
"Ġwa": 274,
"re": 275,
"it": 276,
"ou": 277,
"Ġl": 278,
"Ġd": 279,
"Ġc": 280,
"Ġp": 281,
"ay": 282,
"Ġm": 283,
"er": 284,
"Ġwas": 285,
"ĠThe": 286,
"om": 287,
"Ġhe": 288,
"is": 289,
"Ġn": 290,
"ar": 291,
"im": 292,
"on": 293,
"Ġsa": 294,
"id": 295,
"ll": 296,
"Ġha": 297,
"Ġg": 298,
"at": 299,
"ĠS": 300,
"ing": 301,
"ot": 302,
"en": 303,
"an": 304,
"le": 305,
"or": 306,
"end": 307,
"ir": 308,
"of": 309,
"am": 310,
"et": 311,
"ĠH": 312,
"Ġit": 313,
"Ġth": 314,
"ig": 315,
"ĠThey": 316,
"Ġin": 317,
"il": 318,
"Ġpl": 319,
"Ġ\"": 320,
"ĠHe": 321,
"ow": 322,
"ri": 323,
"ver": 324,
"ut": 325,
"Ġu": 326,
"Ġbe": 327,
"Ġplay": 328,
"Ġsaid": 329,
"ith": 330,
"Ġday": 331,
"Ġwith": 332,
"pp": 333,
"On": 334,
"Ġy": 335,
"oo": 336,
"ked": 337,
"Ġr": 338,
"ex": 339,
"Ġher": 340,
"ce": 341,
"ĠI": 342,
"ĠTim": 343,
"ĠShe": 344,
"ld": 345,
"Ġhis": 346,
"Ġst": 347,
"ke": 348,
"Ġbig": 349,
"nt": 350,
"ck": 351,
"very": 352,
"Ġyou": 353,
"st": 354,
"ve": 355,
"Ġhapp": 356,
"un": 357,
"Ġon": 358,
"riend": 359,
"Ġfriend": 360,
"all": 361,
"ily": 362,
"ext": 363,
"ĠL": 364,
"Ġthey": 365,
"oft": 366,
"Ġwe": 367,
"Ġhad": 368,
"Ġnot": 369,
"Ġli": 370,
"Ġup": 371,
"her": 372,
"Ġwant": 373,
"Ġof": 374,
"itt": 375,
"<|": 376,
"|>": 377,
"endoft": 378,
"endoftext": 379,
"ad": 380,
"se": 381,
"ĠB": 382,
"Ġdo": 383
},
"merges": [
"Ġ t",
"h e",
"Ġ a",
"Ġ s",
"n d",
"Ġ w",
"Ġt he",
"e d",
"Ġ b",
"Ġt o",
"Ġa nd",
"Ġ h",
"Ġ f",
"Ġ T",
"i n",
"Ġw a",
"r e",
"i t",
"o u",
"Ġ l",
"Ġ d",
"Ġ c",
"Ġ p",
"a y",
"Ġ m",
"e r",
"Ġwa s",
"ĠT he",
"o m",
"Ġ he",
"i s",
"Ġ n",
"a r",
"i m",
"o n",
"Ġs a",
"i d",
"l l",
"Ġh a",
"Ġ g",
"a t",
"Ġ S",
"in g",
"o t",
"e n",
"a n",
"l e",
"o r",
"e nd",
"i r",
"o f",
"a m",
"e t",
"Ġ H",
"Ġ it",
"Ġt h",
"i g",
"ĠThe y",
"Ġ in",
"i l",
"Ġp l",
"Ġ \"",
"ĠH e",
"o w",
"r i",
"v er",
"u t",
"Ġ u",
"Ġb e",
"Ġpl ay",
"Ġsa id",
"it h",
"Ġd ay",
"Ġw ith",
"p p",
"O n",
"Ġ y",
"o o",
"k ed",
"Ġ r",
"e x",
"Ġhe r",
"c e",
"Ġ I",
"ĠT im",
"ĠS he",
"l d",
"Ġh is",
"Ġs t",
"k e",
"Ġb ig",
"n t",
"c k",
"ver y",
"Ġy ou",
"s t",
"v e",
"Ġha pp",
"u n",
"Ġ on",
"ri end",
"Ġf riend",
"a ll",
"il y",
"ex t",
"Ġ L",
"Ġthe y",
"of t",
"Ġw e",
"Ġha d",
"Ġn ot",
"Ġl i",
"Ġu p",
"he r",
"Ġwa nt",
"Ġ of",
"it t",
"< |",
"| >",
"end oft",
"endoft ext",
"a d",
"s e",
"Ġ B",
"Ġd o"
]
}
}