gpt2_50t_1M_256d_8l / tokenizer.json
jumelet's picture
Training in progress, step 1757
e499b31
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "WhitespaceSplit"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
3
],
"tokens": [
"<bos>"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"<unk>": 0,
"<pad>": 1,
"<mask>": 2,
"<bos>": 3,
".": 4,
",": 5,
"the": 6,
"was": 7,
"and": 8,
"to": 9,
"a": 10,
"Noah": 11,
"he": 12,
"his": 13,
"of": 14,
"n<apostrophe>t": 15,
"back": 16,
"<apostrophe><apostrophe>": 17,
"The": 18,
"in": 19,
"it": 20,
"boy": 21,
"had": 22,
"``": 23,
"<apostrophe>s": 24,
"up": 25,
"you": 26,
"Bible": 27,
"one": 28,
"that": 29,
"She": 30,
"she": 31,
"I": 32,
"there": 33,
"on": 34,
"be": 35,
"from": 36,
"friend": 37,
"get": 38,
"would": 39,
"over": 40,
"really": 41,
"about": 42,
"And": 43,
"He": 44,
"off": 45,
"this": 46,
"just": 47,
"never": 48,
"here": 49,
"good": 50,
"with": 51,
"by": 52,
"ditch": 53,
":": 54,
"happened": 55,
"came": 56,
"But": 57,
"left": 58,
"but": 59,
"him": 60,
"It": 61,
"they": 62,
"did": 63,
"<apostrophe>ll": 64,
"were": 65,
"her": 66,
"Quechua": 67,
"now": 68,
"then": 69,
"dead": 70,
"like": 71,
"guy": 72,
"higher": 73,
"for": 74,
"chin": 75,
"best": 76,
"into": 77,
"so": 78,
"them": 79,
"leave": 80,
"out": 81,
"looking": 82,
"Cochabamba": 83,
"eyes": 84,
"grass": 85,
"?": 86,
"Oh": 87,
"slid": 88,
"asked": 89,
"knew": 90,
"began": 91,
"saw": 92,
"quite": 93,
"is": 94,
"looked": 95,
"heard": 96,
"pushed": 97,
"forced": 98,
"wore": 99,
"used": 100,
"intended": 101,
"warmed": 102,
"fell": 103,
"Plus": 104,
"meant": 105,
"cared": 106,
"not": 107,
"make": 108,
"stop": 109,
"himself": 110,
"badly": 111,
"They": 112,
"eye": 113,
"stand": 114,
"give": 115,
"downright": 116,
"try": 117,
"gold": 118,
"teach": 119,
"speak": 120,
"breathe": 121,
"unmistakably": 122,
"no": 123,
"hardly": 124,
"surfer": 125,
"we": 126,
"care": 127,
"town": 128,
"march": 129,
"help": 130,
"garden": 131,
"school": 132,
"sleeve": 133,
"blackened": 134,
"find": 135,
"soon": 136,
"Wiping": 137,
"justice": 138,
"gone": 139,
"Another": 140,
"dude": 141,
"told": 142,
"aside": 143,
"class": 144,
"mattered": 145,
"crumpled": 146,
"anymore": 147,
"face-up": 148,
"together": 149,
"can": 150,
"freezing": 151,
"none": 152,
"finally": 153,
"waking": 154,
"teaching": 155,
"moved": 156,
"might": 157,
"sprawled": 158,
"Today": 159,
"everything": 160,
"dripping": 161,
"should": 162,
"been": 163,
"stepping": 164,
"muscle": 165,
"going": 166,
"Did": 167,
"climbing": 168,
"flung": 169,
"Ohio": 170,
"could": 171,
"have": 172,
"mutter": 173,
"position": 174,
"shaking": 175,
"know": 176,
"seat": 177,
"Tuesday": 178,
"are": 179,
"Hearst": 180,
"glinting": 181,
"Word": 182,
"finger": 183,
"world": 184,
"right": 185,
"cap": 186,
"Bolivian": 187,
"turquoise": 188,
"native": 189,
"women": 190,
"hat": 191,
"lot": 192,
"street": 193,
"fingers": 194,
"road": 195,
"moonlight": 196,
"four": 197,
"country": 198,
"calculations": 199,
"first": 200,
"clouds": 201,
"hypocrite": 202,
"tears": 203,
"eight": 204,
"bloodied": 205,
"things": 206,
"hours": 207,
"luggage": 208,
"fat": 209,
"cows": 210,
"aura": 211,
"stones": 212,
"knees": 213,
"lucky": 214,
"classes": 215,
"months": 216,
"By": 217,
"His": 218,
"onto": 219,
"sure": 220,
"potatoes": 221,
"freaky": 222,
"Her": 223,
"full": 224,
"rough": 225,
"surfers": 226,
"white": 227,
"under": 228,
"green": 229,
"sharp": 230,
"emerald": 231,
"swollen": 232,
"nice": 233,
"big": 234,
"content": 235,
"At": 236,
"long": 237,
"embroidered": 238,
"Because": 239,
"able": 240,
"stuff": 241,
"low": 242,
"cross": 243,
"city": 244,
"vibrant": 245,
"breath": 246,
"moan": 247,
"face": 248,
"nap": 249,
"glance": 250,
"perch": 251,
"muddy": 252,
"uniform": 253,
"ride": 254,
"hike": 255,
"virtue": 256,
"knit": 257,
"literacy": 258,
"since": 259,
"towards": 260,
"as": 261,
"at": 262,
"Despite": 263,
"because": 264,
"what": 265,
"where": 266,
"how": 267,
"when": 268,
"narrowed": 269,
"grimaced": 270,
"rose": 271,
"gave": 272,
"crisped": 273,
"tried": 274,
"tossed": 275,
"thought": 276,
"hazel": 277,
"anybody": 278,
"house": 279,
"path": 280,
"skin": 281,
"dirt": 282,
"addition": 283,
"sun": 284,
"seriously": 285,
"Not": 286,
"Seriously": 287,
"again": 288,
"place": 289,
"tree": 290,
"baby": 291,
"even": 292,
"child": 293,
"This": 294,
"too": 295,
"forward": 296,
"Jeep": 297,
"looks": 298,
"conference": 299,
"light": 300,
"stretch": 301,
"all": 302,
"lunch": 303,
"lip": 304,
"take": 305,
"sky": 306,
"carrying": 307,
"beckoning": 308,
"red": 309,
"rising": 310,
"Sounding": 311,
"quivered": 312,
"worried": 313,
"Zone": 314,
"studied": 315,
"being": 316,
"guess": 317,
"skeletal": 318,
"single": 319,
"gravel": 320,
"Spanish": 321,
"fine": 322,
"strong": 323,
"Spain": 324,
"murky": 325,
"In": 326,
"than": 327,
"Twilight": 328,
"staying": 329,
"silky": 330,
"own": 331,
"tall": 332,
"thick": 333,
"our": 334,
"blond": 335,
"parents": 336,
"shadows": 337,
"remains": 338,
"If": 339,
"outside": 340,
"darker": 341,
"lower": 342,
"<apostrophe>": 343,
"who": 344
},
"unk_token": "<unk>"
}
}