codebyzeb committed on
Commit
87fc94c
·
verified ·
1 Parent(s): 2b4611b

Upload tokenizer

Browse files
Files changed (3) hide show
  1. tokenizer.json +103 -59
  2. tokenizer_config.json +1 -1
  3. vocab.json +1 -1
tokenizer.json CHANGED
@@ -22,7 +22,7 @@
22
  "special": true
23
  },
24
  {
25
- "id": 5,
26
  "content": "UTT_BOUNDARY",
27
  "single_word": false,
28
  "lstrip": false,
@@ -34,13 +34,6 @@
34
  "normalizer": {
35
  "type": "Sequence",
36
  "normalizers": [
37
- {
38
- "type": "Replace",
39
- "pattern": {
40
- "String": "\n"
41
- },
42
- "content": " UTT_BOUNDARY"
43
- },
44
  {
45
  "type": "Strip",
46
  "strip_left": true,
@@ -51,63 +44,114 @@
51
  "pre_tokenizer": {
52
  "type": "Whitespace"
53
  },
54
- "post_processor": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  "decoder": null,
56
  "model": {
57
  "type": "WordLevel",
58
  "vocab": {
59
  "UNK": 0,
60
  "PAD": 1,
61
- "BOS": 2,
62
- "EOS": 3,
63
- "WORD_BOUNDARY": 4,
64
- "UTT_BOUNDARY": 5,
65
- "k": 6,
66
- "y": 7,
67
- "m": 8,
68
- "": 9,
69
- "s": 10,
70
- "t": 11,
71
- "": 12,
72
- "ŋ": 13,
73
- "a": 14,
74
- "i": 15,
75
- "n": 16,
76
- "ɛ": 17,
77
- "æ": 18,
78
- "z": 19,
79
- "ɡ": 20,
80
- "r": 21,
81
- "v": 22,
82
- "u": 23,
83
- "ɾ": 24,
84
- "d": 25,
85
- "ʊ": 26,
86
- "": 27,
87
- "ɲ": 28,
88
- "e": 29,
89
- "f": 30,
90
- "o": 31,
91
- "p": 32,
92
- "ʒ": 33,
93
- "t̠ʃ": 34,
94
- "d̠ʒ": 35,
95
- "eʊ": 36,
96
- "w": 37,
97
- "aʊ": 38,
98
- "j": 39,
99
- "ə": 40,
100
- "ũ": 41,
101
- "ɐ̃": 42,
102
- "l": 43,
103
- "b": 44,
104
- "x": 45,
105
- "ɔ": 46,
106
- "ʃ": 47,
107
- "iʊ": 48,
108
- "ɛʊ": 49,
109
- "ɔɪ": 50,
110
- "uɪ": 51
111
  },
112
  "unk_token": "UNK"
113
  }
 
22
  "special": true
23
  },
24
  {
25
+ "id": 3,
26
  "content": "UTT_BOUNDARY",
27
  "single_word": false,
28
  "lstrip": false,
 
34
  "normalizer": {
35
  "type": "Sequence",
36
  "normalizers": [
 
 
 
 
 
 
 
37
  {
38
  "type": "Strip",
39
  "strip_left": true,
 
44
  "pre_tokenizer": {
45
  "type": "Whitespace"
46
  },
47
+ "post_processor": {
48
+ "type": "TemplateProcessing",
49
+ "single": [
50
+ {
51
+ "SpecialToken": {
52
+ "id": "UTT_BOUNDARY",
53
+ "type_id": 0
54
+ }
55
+ },
56
+ {
57
+ "Sequence": {
58
+ "id": "A",
59
+ "type_id": 0
60
+ }
61
+ }
62
+ ],
63
+ "pair": [
64
+ {
65
+ "SpecialToken": {
66
+ "id": "UTT_BOUNDARY",
67
+ "type_id": 0
68
+ }
69
+ },
70
+ {
71
+ "Sequence": {
72
+ "id": "A",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "SpecialToken": {
78
+ "id": "UTT_BOUNDARY",
79
+ "type_id": 0
80
+ }
81
+ },
82
+ {
83
+ "Sequence": {
84
+ "id": "B",
85
+ "type_id": 1
86
+ }
87
+ }
88
+ ],
89
+ "special_tokens": {
90
+ "UTT_BOUNDARY": {
91
+ "id": "UTT_BOUNDARY",
92
+ "ids": [
93
+ 3
94
+ ],
95
+ "tokens": [
96
+ "UTT_BOUNDARY"
97
+ ]
98
+ }
99
+ }
100
+ },
101
  "decoder": null,
102
  "model": {
103
  "type": "WordLevel",
104
  "vocab": {
105
  "UNK": 0,
106
  "PAD": 1,
107
+ "WORD_BOUNDARY": 2,
108
+ "UTT_BOUNDARY": 3,
109
+ "k": 4,
110
+ "y": 5,
111
+ "m": 6,
112
+ "": 7,
113
+ "s": 8,
114
+ "t": 9,
115
+ "": 10,
116
+ "ŋ": 11,
117
+ "a": 12,
118
+ "i": 13,
119
+ "n": 14,
120
+ "ɛ": 15,
121
+ "æ": 16,
122
+ "z": 17,
123
+ "ɡ": 18,
124
+ "r": 19,
125
+ "v": 20,
126
+ "u": 21,
127
+ "ɾ": 22,
128
+ "d": 23,
129
+ "ʊ": 24,
130
+ "": 25,
131
+ "ɲ": 26,
132
+ "e": 27,
133
+ "f": 28,
134
+ "o": 29,
135
+ "p": 30,
136
+ "ʒ": 31,
137
+ "t̠ʃ": 32,
138
+ "d̠ʒ": 33,
139
+ "eʊ": 34,
140
+ "w": 35,
141
+ "aʊ": 36,
142
+ "j": 37,
143
+ "ə": 38,
144
+ "ũ": 39,
145
+ "ɐ̃": 40,
146
+ "l": 41,
147
+ "b": 42,
148
+ "x": 43,
149
+ "ɔ": 44,
150
+ "ʃ": 45,
151
+ "iʊ": 46,
152
+ "ɛʊ": 47,
153
+ "ɔɪ": 48,
154
+ "uɪ": 49
 
 
155
  },
156
  "unk_token": "UNK"
157
  }
tokenizer_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "single_word": false,
18
  "special": true
19
  },
20
- "5": {
21
  "content": "UTT_BOUNDARY",
22
  "lstrip": false,
23
  "normalized": false,
 
17
  "single_word": false,
18
  "special": true
19
  },
20
+ "3": {
21
  "content": "UTT_BOUNDARY",
22
  "lstrip": false,
23
  "normalized": false,
vocab.json CHANGED
@@ -1 +1 @@
1
- {"UNK":0,"PAD":1,"BOS":2,"EOS":3,"WORD_BOUNDARY":4,"UTT_BOUNDARY":5,"k":6,"y":7,"m":8,"":9,"s":10,"t":11,"":12,"ŋ":13,"a":14,"i":15,"n":16,"ɛ":17,"æ":18,"z":19,"ɡ":20,"r":21,"v":22,"u":23,"ɾ":24,"d":25,"ʊ":26,"":27,"ɲ":28,"e":29,"f":30,"o":31,"p":32,"ʒ":33,"t̠ʃ":34,"d̠ʒ":35,"eʊ":36,"w":37,"aʊ":38,"j":39,"ə":40,"ũ":41,"ɐ̃":42,"l":43,"b":44,"x":45,"ɔ":46,"ʃ":47,"iʊ":48,"ɛʊ":49,"ɔɪ":50,"uɪ":51}
 
1
+ {"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"k":4,"y":5,"m":6,"":7,"s":8,"t":9,"":10,"ŋ":11,"a":12,"i":13,"n":14,"ɛ":15,"æ":16,"z":17,"ɡ":18,"r":19,"v":20,"u":21,"ɾ":22,"d":23,"ʊ":24,"":25,"ɲ":26,"e":27,"f":28,"o":29,"p":30,"ʒ":31,"t̠ʃ":32,"d̠ʒ":33,"eʊ":34,"w":35,"aʊ":36,"j":37,"ə":38,"ũ":39,"ɐ̃":40,"l":41,"b":42,"x":43,"ɔ":44,"ʃ":45,"iʊ":46,"ɛʊ":47,"ɔɪ":48,"uɪ":49}