jlonsako commited on
Commit
8158e87
1 Parent(s): b0e0085

Upload lm-boosted decoder

Browse files
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": ["", "<s>", "</s>", "\u2047", " ", "\u1295", "\u1260", "\u1275", "\u12e8", "\u12cd", "\u1235", "\u122d", "\u1218", "\u120d", "\u121d", "\u127d", "\u12ed", "\u12a0", "\u1208", "\u1270", "\u12a5", "\u1290", "\u12eb", "\u1293", "\u120b", "\u12a8", "\u12f0", "\u12f5", "\u1265", "\u121b", "\u1228", "\u130d", "\u1308", "\u12c8", "\u1273", "\u122b", "\u1230", "\u1325", "\u121a", "\u12ad", "\u1263", "\u12ce", "\u1240", "\u1209", "\u122a", "\u1320", "\u12ab", "\u1233", "\u1206", "\u134d", "\u1245", "\u1205", "\u12f3", "\u12cb", "\u130b", "\u1278", "\u1276", "\u129b", "\u122e", "\u1323", "\u1348", "\u120a", "\u120e", "\u1201", "0", "\u1229", "\u1271", "\u1243", "\u12dd", "\u1296", "\u1232", "\u12d8", "\u1262", "\u1363", "\u12ca", "\u12f2", "\u130a", "1", "\u1309", "\u121e", "\u1266", "\u12dc", "\u120c", "\u1291", "\u12db", "\u1300", "\u123d", "\u12d9", "\u12ae", "\u121c", "\u12d3", "\u12da", "\u1261", "\u134a", "\u12f6", "\u1203", "\u130e", "\u1264", "\u127b", "\u1292", "\u12e9", "\u12d5", "\u1328", "\u1279", "2", "\u1241", "\u134b", "\u1246", "\u12f1", "\u1305", "\u12aa", "\u122c", "\u1231", "a", "\u1272", "\u12a4", "\u124b", "\u1219", "\u1294", "\u1236", "\u1353", "\u123b", "\u1355", "\u1200", "\u129d", "\u1356", "\u12a2", "\u132b", "\u12ee", "\u1274", "\u1215", "\u132d", "\u1225", "\u1303", "\u1234", "o", "\u12a9", "\u1204", ".", "9", "\u1313", "n", "-", "\u12a6", "5", "\u1238", "e", "\u1242", "\u134e", "4", "\u128b", "\u1321", "i", "3", "\u1298", "\u1352", "\u133d", "6", "r", "8", "t", "\u1349", "\u1326", "\u1299", "s", "l", "\u1302", "\u12de", "\u126a", "\u1277", "\u1345", "\u1306", "\u1283", "c", "\u1220", "u", "\u12ac", "\u12f4", "\u1343", "\u129e", "7", "\u12b3", "m", "p", "\u1202", "\u120f", "h", "\u1340", "\u1362", "\u1210", "\u126b", "\u1338", "\u123a", "\u1324", ",", "\u134c", "\u122f", "g", "\u1361", "\u133b", "\u12cc", "\u12e5", "\u1301", "d", "\u1350", "b", "\u1268", "\u1239", "/", "\u123e", "\u1267", "\u126c", "\u121f", "\u126d", "\u201d", "\u1354", "\u132a", "k", "f", "\u1244", "\u127a", "y", "\u134f", "\u1237", "\u1304", "\u12ec", "w", "v", "\u1223", "\u1226", "\u130c", "\u12e3", "\u127e", "\u1214", "\u1329", "\u1211", "\u1327", "\u1297", "j", "\u132e", "\u12d0", "\u127c", "\"", ":", "\u129a", "\u12e6", "\u12f7", "\u1364", "\u12e2", "\u12d1", "\u12c9", "\u1285", "\u12a3", "\u1346", "\u133e", "x", "$", "\u12a1", "\u1322", "\u126e", "\u00ab", "\u00bb", "\u1351", "\u1339", "%", "\u12df", "z", "\u12e1", "\u1335", "\u2014", "\u1341", "\u133f", "\u12ea", "?", "\u2019", "\u123c", "+", "\u1365", "\u12d2", "\u00b0", "\u123f", "\u12d6", "\u1310", "\u00f5", "\u2013", "\u12b8", ";", "\u00e3", "\u00a5", "\u12ba", "\u1334", "\u1333", "\u129c", "'", "\u1307", "\u133a", "\u2018", "!", "\u00b2", "\u127f", "\u12b0", "\u1357", "q", "&", "\u00a3", "\u129f", "[", "]", "\u1366", "\u1280", "\u1248", "\u1312", "\u12b5", "\u1213", "\u124d", "\u124a", "\u1221", "\u1332", "\u1336", "\u12b9", "\u12be", "\u1224", "\u1315", "\u12bd", "\u12bb", "\u12d4", "\u132c", "\u133c", "\u1284", "\u12e4", "\u1286", "\u1216", "\u12e0", "\u132f", "\u1331", "\u1330", "\u1222", "\u1344", "\u1212"], "is_bpe": false}
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/output.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4e901a272a67d25a1c1b21dde060b997b5230eb5ce8b3b2aaf85be6fccff131
3
+ size 182248560
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "<pad>",
8
+ "processor_class": "Wav2Vec2ProcessorWithLM",
9
+ "replace_word_delimiter_char": " ",
10
+ "target_lang": null,
11
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
12
+ "unk_token": "<unk>",
13
+ "word_delimiter_token": "|"
14
+ }
vocab.json ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "!": 297,
3
+ "\"": 242,
4
+ "$": 256,
5
+ "%": 264,
6
+ "&": 303,
7
+ "'": 293,
8
+ "+": 276,
9
+ ",": 194,
10
+ "-": 142,
11
+ ".": 138,
12
+ "/": 208,
13
+ "0": 64,
14
+ "1": 77,
15
+ "2": 104,
16
+ "3": 153,
17
+ "4": 149,
18
+ "5": 144,
19
+ "6": 157,
20
+ "7": 180,
21
+ "8": 159,
22
+ "9": 139,
23
+ ":": 243,
24
+ ";": 286,
25
+ "</s>": 2,
26
+ "<pad>": 0,
27
+ "<s>": 1,
28
+ "<unk>": 3,
29
+ "?": 273,
30
+ "[": 306,
31
+ "]": 307,
32
+ "a": 113,
33
+ "b": 205,
34
+ "c": 173,
35
+ "d": 203,
36
+ "e": 146,
37
+ "f": 218,
38
+ "g": 197,
39
+ "h": 186,
40
+ "i": 152,
41
+ "j": 238,
42
+ "k": 217,
43
+ "l": 165,
44
+ "m": 182,
45
+ "n": 141,
46
+ "o": 135,
47
+ "p": 183,
48
+ "q": 302,
49
+ "r": 158,
50
+ "s": 164,
51
+ "t": 160,
52
+ "u": 175,
53
+ "v": 227,
54
+ "w": 226,
55
+ "x": 255,
56
+ "y": 221,
57
+ "z": 266,
58
+ "|": 4,
59
+ "£": 304,
60
+ "¥": 288,
61
+ "«": 260,
62
+ "°": 279,
63
+ "²": 298,
64
+ "»": 261,
65
+ "ã": 287,
66
+ "õ": 283,
67
+ "ሀ": 123,
68
+ "ሁ": 63,
69
+ "ሂ": 184,
70
+ "ሃ": 95,
71
+ "ሄ": 137,
72
+ "ህ": 51,
73
+ "ሆ": 48,
74
+ "ለ": 18,
75
+ "ሉ": 43,
76
+ "ሊ": 61,
77
+ "ላ": 24,
78
+ "ሌ": 82,
79
+ "ል": 13,
80
+ "ሎ": 62,
81
+ "ሏ": 185,
82
+ "ሐ": 189,
83
+ "ሑ": 235,
84
+ "ሒ": 338,
85
+ "ሓ": 313,
86
+ "ሔ": 233,
87
+ "ሕ": 130,
88
+ "ሖ": 331,
89
+ "መ": 12,
90
+ "ሙ": 117,
91
+ "ሚ": 38,
92
+ "ማ": 29,
93
+ "ሜ": 89,
94
+ "ም": 14,
95
+ "ሞ": 79,
96
+ "ሟ": 212,
97
+ "ሠ": 174,
98
+ "ሡ": 316,
99
+ "ሢ": 336,
100
+ "ሣ": 228,
101
+ "ሤ": 321,
102
+ "ሥ": 132,
103
+ "ሦ": 229,
104
+ "ረ": 30,
105
+ "ሩ": 65,
106
+ "ሪ": 44,
107
+ "ራ": 35,
108
+ "ሬ": 111,
109
+ "ር": 11,
110
+ "ሮ": 58,
111
+ "ሯ": 196,
112
+ "ሰ": 36,
113
+ "ሱ": 112,
114
+ "ሲ": 70,
115
+ "ሳ": 47,
116
+ "ሴ": 134,
117
+ "ስ": 10,
118
+ "ሶ": 119,
119
+ "ሷ": 223,
120
+ "ሸ": 145,
121
+ "ሹ": 207,
122
+ "ሺ": 192,
123
+ "ሻ": 121,
124
+ "ሼ": 275,
125
+ "ሽ": 86,
126
+ "ሾ": 209,
127
+ "ሿ": 280,
128
+ "ቀ": 42,
129
+ "ቁ": 105,
130
+ "ቂ": 147,
131
+ "ቃ": 67,
132
+ "ቄ": 219,
133
+ "ቅ": 50,
134
+ "ቆ": 107,
135
+ "ቈ": 310,
136
+ "ቊ": 315,
137
+ "ቋ": 116,
138
+ "ቍ": 314,
139
+ "በ": 6,
140
+ "ቡ": 92,
141
+ "ቢ": 72,
142
+ "ባ": 40,
143
+ "ቤ": 97,
144
+ "ብ": 28,
145
+ "ቦ": 80,
146
+ "ቧ": 210,
147
+ "ቨ": 206,
148
+ "ቪ": 168,
149
+ "ቫ": 190,
150
+ "ቬ": 211,
151
+ "ቭ": 213,
152
+ "ቮ": 259,
153
+ "ተ": 19,
154
+ "ቱ": 66,
155
+ "ቲ": 114,
156
+ "ታ": 34,
157
+ "ቴ": 129,
158
+ "ት": 7,
159
+ "ቶ": 56,
160
+ "ቷ": 169,
161
+ "ቸ": 55,
162
+ "ቹ": 103,
163
+ "ቺ": 220,
164
+ "ቻ": 98,
165
+ "ቼ": 241,
166
+ "ች": 15,
167
+ "ቾ": 232,
168
+ "ቿ": 299,
169
+ "ኀ": 309,
170
+ "ኃ": 172,
171
+ "ኄ": 328,
172
+ "ኅ": 251,
173
+ "ኆ": 330,
174
+ "ኋ": 150,
175
+ "ነ": 21,
176
+ "ኑ": 83,
177
+ "ኒ": 99,
178
+ "ና": 23,
179
+ "ኔ": 118,
180
+ "ን": 5,
181
+ "ኖ": 69,
182
+ "ኗ": 237,
183
+ "ኘ": 154,
184
+ "ኙ": 163,
185
+ "ኚ": 244,
186
+ "ኛ": 57,
187
+ "ኜ": 292,
188
+ "ኝ": 124,
189
+ "ኞ": 179,
190
+ "ኟ": 305,
191
+ "አ": 17,
192
+ "ኡ": 257,
193
+ "ኢ": 126,
194
+ "ኣ": 252,
195
+ "ኤ": 115,
196
+ "እ": 20,
197
+ "ኦ": 143,
198
+ "ከ": 25,
199
+ "ኩ": 136,
200
+ "ኪ": 110,
201
+ "ካ": 46,
202
+ "ኬ": 176,
203
+ "ክ": 39,
204
+ "ኮ": 88,
205
+ "ኰ": 300,
206
+ "ኳ": 181,
207
+ "ኵ": 312,
208
+ "ኸ": 285,
209
+ "ኹ": 319,
210
+ "ኺ": 289,
211
+ "ኻ": 324,
212
+ "ኽ": 323,
213
+ "ኾ": 320,
214
+ "ወ": 33,
215
+ "ዉ": 250,
216
+ "ዊ": 74,
217
+ "ዋ": 53,
218
+ "ዌ": 200,
219
+ "ው": 9,
220
+ "ዎ": 41,
221
+ "ዐ": 240,
222
+ "ዑ": 249,
223
+ "ዒ": 278,
224
+ "ዓ": 90,
225
+ "ዔ": 325,
226
+ "ዕ": 101,
227
+ "ዖ": 281,
228
+ "ዘ": 71,
229
+ "ዙ": 87,
230
+ "ዚ": 91,
231
+ "ዛ": 84,
232
+ "ዜ": 81,
233
+ "ዝ": 68,
234
+ "ዞ": 167,
235
+ "ዟ": 265,
236
+ "ዠ": 332,
237
+ "ዡ": 267,
238
+ "ዢ": 248,
239
+ "ዣ": 231,
240
+ "ዤ": 329,
241
+ "ዥ": 201,
242
+ "ዦ": 245,
243
+ "የ": 8,
244
+ "ዩ": 100,
245
+ "ዪ": 272,
246
+ "ያ": 22,
247
+ "ዬ": 225,
248
+ "ይ": 16,
249
+ "ዮ": 128,
250
+ "ደ": 26,
251
+ "ዱ": 108,
252
+ "ዲ": 75,
253
+ "ዳ": 52,
254
+ "ዴ": 177,
255
+ "ድ": 27,
256
+ "ዶ": 94,
257
+ "ዷ": 246,
258
+ "ጀ": 85,
259
+ "ጁ": 202,
260
+ "ጂ": 166,
261
+ "ጃ": 133,
262
+ "ጄ": 224,
263
+ "ጅ": 109,
264
+ "ጆ": 171,
265
+ "ጇ": 294,
266
+ "ገ": 32,
267
+ "ጉ": 78,
268
+ "ጊ": 76,
269
+ "ጋ": 54,
270
+ "ጌ": 230,
271
+ "ግ": 31,
272
+ "ጎ": 96,
273
+ "ጐ": 282,
274
+ "ጒ": 311,
275
+ "ጓ": 140,
276
+ "ጕ": 322,
277
+ "ጠ": 45,
278
+ "ጡ": 151,
279
+ "ጢ": 258,
280
+ "ጣ": 59,
281
+ "ጤ": 193,
282
+ "ጥ": 37,
283
+ "ጦ": 162,
284
+ "ጧ": 236,
285
+ "ጨ": 102,
286
+ "ጩ": 234,
287
+ "ጪ": 216,
288
+ "ጫ": 127,
289
+ "ጬ": 326,
290
+ "ጭ": 131,
291
+ "ጮ": 239,
292
+ "ጯ": 333,
293
+ "ጰ": 335,
294
+ "ጱ": 334,
295
+ "ጲ": 317,
296
+ "ጳ": 291,
297
+ "ጴ": 290,
298
+ "ጵ": 268,
299
+ "ጶ": 318,
300
+ "ጸ": 191,
301
+ "ጹ": 263,
302
+ "ጺ": 295,
303
+ "ጻ": 199,
304
+ "ጼ": 327,
305
+ "ጽ": 156,
306
+ "ጾ": 254,
307
+ "ጿ": 271,
308
+ "ፀ": 187,
309
+ "ፁ": 270,
310
+ "ፃ": 178,
311
+ "ፄ": 337,
312
+ "ፅ": 170,
313
+ "ፆ": 253,
314
+ "ፈ": 60,
315
+ "ፉ": 161,
316
+ "ፊ": 93,
317
+ "ፋ": 106,
318
+ "ፌ": 195,
319
+ "ፍ": 49,
320
+ "ፎ": 148,
321
+ "ፏ": 222,
322
+ "ፐ": 204,
323
+ "ፑ": 262,
324
+ "ፒ": 155,
325
+ "ፓ": 120,
326
+ "ፔ": 215,
327
+ "ፕ": 122,
328
+ "ፖ": 125,
329
+ "ፗ": 301,
330
+ "፡": 198,
331
+ "።": 188,
332
+ "፣": 73,
333
+ "፤": 247,
334
+ "፥": 277,
335
+ "፦": 308,
336
+ "–": 284,
337
+ "—": 269,
338
+ "‘": 296,
339
+ "’": 274,
340
+ "”": 214
341
+ }