{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "_",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 111,
      "content": "UNK",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": {
    "type": "Sequence",
    "normalizers": [
      {
        "type": "Replace",
        "pattern": {
          "Regex": "(?:\\[SEP\\]|^|$)"
        },
        "content": "_"
      },
      {
        "type": "Replace",
        "pattern": {
          "Regex": "(?:_)+"
        },
        "content": "_"
      }
    ]
  },
  "pre_tokenizer": {
    "type": "Split",
    "pattern": {
      "Regex": "_"
    },
    "behavior": "Isolated",
    "invert": false
  },
  "post_processor": null,
  "decoder": null,
  "model": {
    "vocab": {
      "!": 103,
      "'": 108,
      ",": 106,
      "-": 109,
      ".": 107,
      "?": 104,
      "AA": 1,
      "E": 2,
      "EE": 3,
      "En": 4,
      "N": 5,
      "OO": 6,
      "SP": 110,
      "UNK": 111,
      "V": 7,
      "_": 0,
      "a": 8,
      "a:": 9,
      "aa": 10,
      "ae": 11,
      "ah": 12,
      "ai": 13,
      "an": 14,
      "ang": 15,
      "ao": 16,
      "aw": 17,
      "ay": 18,
      "b": 19,
      "by": 20,
      "c": 21,
      "ch": 22,
      "d": 23,
      "dh": 24,
      "dy": 25,
      "e": 26,
      "e:": 27,
      "eh": 28,
      "ei": 29,
      "en": 30,
      "eng": 31,
      "er": 32,
      "ey": 33,
      "f": 34,
      "g": 35,
      "gy": 36,
      "h": 37,
      "hh": 38,
      "hy": 39,
      "i": 40,
      "i0": 41,
      "i:": 42,
      "ia": 43,
      "ian": 44,
      "iang": 45,
      "iao": 46,
      "ie": 47,
      "ih": 48,
      "in": 49,
      "ing": 50,
      "iong": 51,
      "ir": 52,
      "iu": 53,
      "iy": 54,
      "j": 55,
      "jh": 56,
      "k": 57,
      "ky": 58,
      "l": 59,
      "m": 60,
      "my": 61,
      "n": 62,
      "ng": 63,
      "ny": 64,
      "o": 65,
      "o:": 66,
      "ong": 67,
      "ou": 68,
      "ow": 69,
      "oy": 70,
      "p": 71,
      "py": 72,
      "q": 73,
      "r": 74,
      "ry": 75,
      "s": 76,
      "sh": 77,
      "t": 78,
      "th": 79,
      "ts": 80,
      "ty": 81,
      "u": 82,
      "u:": 83,
      "ua": 84,
      "uai": 85,
      "uan": 86,
      "uang": 87,
      "uh": 88,
      "ui": 89,
      "un": 90,
      "uo": 91,
      "uw": 92,
      "v": 93,
      "van": 94,
      "ve": 95,
      "vn": 96,
      "w": 97,
      "x": 98,
      "y": 99,
      "z": 100,
      "zh": 101,
      "zy": 102,
      "…": 105
    }
  }
}