gpt2_100t_1M_256d_8l / tokenizer.json
jumelet's picture
Training in progress, step 1757
530650f
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "WhitespaceSplit"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
3
],
"tokens": [
"<bos>"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"<unk>": 0,
"<pad>": 1,
"<mask>": 2,
"<bos>": 3,
".": 4,
"the": 5,
",": 6,
"to": 7,
"was": 8,
"a": 9,
"and": 10,
"of": 11,
"Noah": 12,
"She": 13,
"in": 14,
"her": 15,
"she": 16,
"with": 17,
"up": 18,
"``": 19,
"<apostrophe><apostrophe>": 20,
"his": 21,
"had": 22,
"it": 23,
"that": 24,
"on": 25,
"he": 26,
"n<apostrophe>t": 27,
"be": 28,
"The": 29,
"were": 30,
"<apostrophe>s": 31,
"really": 32,
"I": 33,
"one": 34,
"Bible": 35,
"friend": 36,
"you": 37,
"back": 38,
"Quechua": 39,
"boy": 40,
"out": 41,
"over": 42,
"would": 43,
"from": 44,
"by": 45,
"for": 46,
"anything": 47,
"kids": 48,
"He": 49,
"never": 50,
"there": 51,
"off": 52,
"did": 53,
"as": 54,
"They": 55,
"now": 56,
"they": 57,
"this": 58,
"get": 59,
"just": 60,
"than": 61,
"have": 62,
"And": 63,
"could": 64,
"him": 65,
"group": 66,
"then": 67,
"so": 68,
"wore": 69,
"came": 70,
"about": 71,
"It": 72,
"them": 73,
"stand": 74,
"but": 75,
"pink": 76,
"best": 77,
"white": 78,
"like": 79,
"is": 80,
"guy": 81,
"into": 82,
"down": 83,
"here": 84,
"Spanish": 85,
"But": 86,
"eyes": 87,
"herself": 88,
"good": 89,
"asked": 90,
"happened": 91,
"heard": 92,
"ditch": 93,
":": 94,
"let": 95,
"<apostrophe>ll": 96,
"should": 97,
"told": 98,
"all": 99,
"used": 100,
"who": 101,
"moved": 102,
"going": 103,
"know": 104,
"Despite": 105,
"under": 106,
"left": 107,
"Bolivian": 108,
"too": 109,
"higher": 110,
"not": 111,
"think": 112,
"classes": 113,
"right": 114,
"way": 115,
"an": 116,
"help": 117,
"morning": 118,
"warm": 119,
"dead": 120,
"brown": 121,
"afraid": 122,
"around": 123,
"car": 124,
"at": 125,
"knit": 126,
"chin": 127,
"hat": 128,
"valley": 129,
"because": 130,
"what": 131,
"remains": 132,
"how": 133,
"When": 134,
"when": 135,
"below": 136,
"schoolhouse": 137,
"gravel": 138,
"no": 139,
"always": 140,
"In": 141,
"<apostrophe>m": 142,
"brushed": 143,
"tried": 144,
"looking": 145,
"even": 146,
"Cochabamba": 147,
"tell": 148,
"Her": 149,
"A": 150,
"parents": 151,
"something": 152,
"fingers": 153,
"clouds": 154,
"keep": 155,
"Plus": 156,
"leave": 157,
"boulders": 158,
"Jeep": 159,
"purple": 160,
"school": 161,
"world": 162,
"story": 163,
"grass": 164,
"outside": 165,
"hike": 166,
"if": 167,
"stifled": 168,
"discovered": 169,
"?": 170,
"You": 171,
"where": 172,
"slid": 173,
"loved": 174,
"spoke": 175,
"knew": 176,
"warmed": 177,
"himself": 178,
"lives": 179,
"storms": 180,
"aside": 181,
"attended": 182,
"looked": 183,
"can": 184,
"sounded": 185,
"began": 186,
"warily": 187,
"Another": 188,
"we": 189,
"might": 190,
"onto": 191,
"That": 192,
"much": 193,
"finally": 194,
"meant": 195,
"fixed": 196,
"unmistakably": 197,
"By": 198,
"hardly": 199,
"queasy": 200,
"through": 201,
"Did": 202,
"maybe": 203,
"anymore": 204,
"heart": 205,
"smiled": 206,
"muscle": 207,
"puffy": 208,
"Within": 209,
"diamond": 210,
"before": 211,
"lacy": 212,
"saw": 213,
"alpaca": 214,
"At": 215,
"together": 216,
"face-up": 217,
"someone": 218,
"Behind": 219,
"none": 220,
"Still": 221,
"turned": 222,
"Maybe": 223,
"charge": 224,
"attention": 225,
"turquoise": 226,
"inside": 227,
"streaming": 228,
"navy": 229,
"dude": 230,
"spit": 231,
"privacy": 232,
"Hearst": 233,
"town": 234,
"forced": 235,
"downright": 236,
"soon": 237,
"finger": 238,
"piled": 239,
"position": 240,
"chided": 241,
"felt": 242,
"translating": 243,
"English": 244,
"sleeping": 245,
"do": 246,
"sporting": 247,
"Today": 248,
"doing": 249,
"pushed": 250,
"snow": 251,
"mumbled": 252,
"woke": 253,
"class": 254,
"cared": 255,
"captivated": 256,
"deserved": 257,
"water": 258,
"Vida": 259,
"honey": 260,
"dropped": 261,
"intended": 262,
"climbing": 263,
"died": 264,
"church": 265,
"freezing": 266,
"shaking": 267,
"grew": 268,
"dried": 269,
"mutter": 270,
"dripping": 271,
"assigned": 272,
"stepping": 273,
"sitting": 274,
"darn": 275,
"overlooking": 276,
"flung": 277,
"Then": 278,
"cap": 279,
"quite": 280,
"blackened": 281,
"fell": 282,
"are": 283,
"blouse": 284,
"badly": 285,
"glinting": 286,
"waking": 287,
"teaching": 288,
"scattered": 289,
"teeth": 290,
"later": 291,
"calculations": 292,
"refer": 293,
"Pura": 294,
"braked": 295,
"why": 296,
"crumpled": 297,
"across": 298,
"these": 299,
"more": 300,
"been": 301,
"Word": 302,
"jeans": 303,
"else": 304,
"knees": 305,
"jerked": 306,
"mattered": 307,
"gone": 308,
"shifted": 309,
"Wiping": 310,
"sheathed": 311,
"sprawled": 312,
"Ohio": 313,
"Tuesday": 314,
"powdery": 315,
"listened": 316,
"Of": 317,
"women": 318,
"gaze": 319,
"halt": 320,
"phrase": 321,
"colder": 322,
"stones": 323,
"Bolivians": 324,
"daylight": 325,
"fleas": 326,
"kind": 327,
"flats": 328,
"months": 329,
"Thanks": 330,
"air": 331,
"embroidered": 332,
"burgundy": 333,
"tears": 334,
"bloodied": 335,
"sleeve": 336,
"swollen": 337,
"luggage": 338,
"four": 339,
"march": 340,
"everything": 341,
"minutes": 342,
"hill": 343,
"holes": 344,
"scarves": 345,
"sparkles": 346,
"missing": 347,
"mounds": 348,
"things": 349,
"His": 350,
"surfer": 351,
"eye": 352,
"legs": 353,
"giggles": 354,
"moonlight": 355,
"hypocrite": 356,
"maroon": 357,
"friends": 358,
"thunder": 359,
"eight": 360,
"moan": 361,
"actions": 362,
"hours": 363,
"native": 364,
"lot": 365,
"feathers": 366,
"glasses": 367,
"your": 368,
"their": 369,
"beak": 370,
"stars": 371,
"regional": 372,
"ask": 373,
"lisp": 374,
"glance": 375,
"black": 376,
"trips": 377,
"garden": 378,
"uneven": 379,
"fit": 380,
"terrible": 381,
"gold": 382,
"goofy": 383,
"rattle": 384,
"able": 385,
"rough": 386,
"country": 387,
"surfers": 388,
"lucky": 389,
"ladies": 390,
"road": 391,
"breath": 392,
"find": 393,
"highlights": 394,
"literacy": 395,
"first": 396,
"fake": 397,
"Several": 398,
"waist-high": 399,
"muffled": 400,
"arm": 401,
"lesson": 402,
"give": 403,
"teach": 404,
"figure": 405,
"songs": 406,
"cows": 407,
"come": 408,
"breathe": 409,
"move": 410,
"stop": 411,
"vibrant": 412,
"speak": 413,
"green": 414,
"sudden": 415,
"aura": 416,
"big": 417,
"hippie": 418,
"freaky": 419,
"sharp": 420,
"try": 421,
"potatoes": 422,
"full": 423,
"dented": 424,
"grassy": 425,
"ballet": 426,
"frayed": 427,
"appropriate": 428,
"perch": 429,
"heavy": 430,
"towards": 431,
"sure": 432,
"content": 433,
"Because": 434,
"fat": 435,
"plastic": 436,
"trendy": 437,
"pale": 438,
"face": 439,
"emerald": 440,
"silly": 441,
"scruffy": 442,
"own": 443,
"bare": 444,
"muddy": 445,
"stuff": 446,
"egg": 447,
"or": 448,
"side": 449,
"deserted": 450,
"make": 451,
"hard": 452,
"street": 453,
"care": 454,
"coat": 455,
"low": 456,
"path": 457,
"nice": 458,
"seat": 459,
"goat": 460,
"scuffed": 461,
"spray": 462,
"anyone": 463,
"long": 464,
"wall": 465,
"crazy": 466,
"justice": 467,
"bag": 468,
"idiom": 469,
"mahogany": 470,
"tunic": 471,
"other": 472,
"nap": 473,
"virtue": 474,
"ride": 475,
"knoll": 476,
"cut": 477,
"chill": 478,
"hair": 479,
"cluster": 480,
"sweater": 481,
"option": 482,
"yawn": 483,
"api": 484,
"bottle": 485,
"uniform": 486,
"cardigan": 487,
"city": 488,
"fleece": 489,
"cross": 490,
"Oh": 491,
"since": 492,
"while": 493,
"Whenever": 494,
"howled": 495,
"gave": 496,
"thought": 497,
"crisped": 498,
"anybody": 499,
"grimaced": 500,
"baby": 501,
"skin": 502,
"dirt": 503,
"addition": 504,
"child": 505,
"narrowed": 506,
"All": 507,
"slithered": 508,
"watched": 509,
"tossed": 510,
"Off": 511,
"still": 512,
"Seriously": 513,
"distance": 514,
"rose": 515,
"forward": 516,
"looks": 517,
"seriously": 518,
"place": 519,
"corn": 520,
"again": 521,
"This": 522,
"Not": 523,
"breakfast": 524,
"house": 525,
"lazily": 526,
"adobe": 527,
"tree": 528,
"moment": 529,
"word": 530,
"Jesus": 531,
"lunch": 532,
"hazel": 533,
"conference": 534,
"hauntingly": 535,
"lip": 536,
"Twilight": 537,
"Zone": 538,
"slept": 539,
"sky": 540,
"hulking": 541,
"wisps": 542,
"Spain": 543,
"nerd": 544,
"smile": 545,
"carrying": 546,
"staying": 547,
"Sounding": 548,
"structures": 549,
"light": 550,
"ahead": 551,
"lady": 552,
"cartoons": 553,
"waiting": 554,
"granite": 555,
"cheeks": 556,
"drink": 557,
"sun": 558,
"mountains": 559,
"every": 560,
"Frozen": 561,
"eggs": 562,
"darker": 563,
"only": 564,
"tall": 565,
"visible": 566,
"beckoning": 567,
"shadows": 568,
"failings": 569,
"glued": 570,
"rising": 571,
"being": 572,
"scared": 573,
"single": 574,
"sturdy": 575,
"houses": 576,
"two": 577,
"worried": 578,
"silky": 579,
"rounded": 580,
"boiled": 581,
"lower": 582,
"forested": 583,
"quivered": 584,
"strong": 585,
"thick": 586,
"seven-hour": 587,
"red": 588,
"Trailing": 589,
"many": 590,
"beautiful": 591,
"murky": 592,
"huge": 593,
"little": 594,
"stronger": 595,
"sweet": 596,
"fine": 597,
"ruin": 598,
"blond": 599,
"our": 600,
"studied": 601,
"skeletal": 602,
"guess": 603,
"take": 604,
"want": 605,
"happy-go-lucky": 606,
"such": 607,
"If": 608,
"stretch": 609,
"<apostrophe>": 610
},
"unk_token": "<unk>"
}
}