polinaeterna HF staff committed on
Commit
dace825
·
1 Parent(s): a733f91

add script for downloading delta release (currently for cv12)

Browse files
Files changed (2) hide show
  1. download_delta.py +101 -0
  2. langs.json +702 -0
download_delta.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib
2
+ import sys
3
+ import requests
4
+ import os
5
+ import logging
6
+ import shutil
7
+ import json
8
+ from tqdm import tqdm
9
+ import time
10
+ from pathlib import Path
11
+ from datasets.download import DownloadConfig, DownloadManager
12
+
13
+
14
+ logging.basicConfig(
15
+ format='%(asctime)s %(levelname)s: %(message)s',
16
+ level=logging.INFO,
17
+ handlers=[
18
+ logging.FileHandler("cv12_download.log"),
19
+ logging.StreamHandler(sys.stdout)
20
+ ]
21
+ )
22
+
23
+ _BUNDLE_URL_TEMPLATE_DELTA = 'cv-corpus-12.0-delta-2022-12-07/cv-corpus-12.0-delta-2022-12-07-{locale}.tar.gz'
24
+ _BUNDLE_VERSION = _BUNDLE_URL_TEMPLATE_DELTA.split("/")[0]
25
+ _API_URL = "https://commonvoice.mozilla.org/api/v1"
26
+
27
+
28
def _get_bundle_url(locale, url_template):
    """Ask the Common Voice API for the download URL of a locale's bundle.

    Args:
        locale: Language code substituted for ``{locale}`` in the template.
        url_template: Bucket-relative bundle path containing a ``{locale}``
            placeholder.

    Returns:
        The value of the ``"url"`` field in the API's JSON response.
    """
    # A bare `import urllib` does not guarantee that the `parse` submodule is
    # loaded, so import the function explicitly where it is used.
    from urllib.parse import quote

    path = url_template.replace("{locale}", locale)
    # "/" is deliberately absent from `safe`, so the whole bundle path is
    # percent-encoded into a single URL path segment.
    path = quote(path.encode("utf-8"), safe="~()*!.'")
    response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0).json()
    return response["url"]
33
+
34
+
35
def _log_download(locale, bundle_version):
    """Register a download of ``bundle_version`` for ``locale`` with the API.

    Args:
        locale: Language code; used in both the endpoint path and the payload.
        bundle_version: Dataset release identifier sent as ``"dataset"``.
    """
    # NOTE(review): this address looks obfuscated in the copy under review;
    # confirm the real contact address before running.
    email = "[email protected]"
    payload = {"email": email, "locale": locale, "dataset": bundle_version}
    # Same timeout as the GET in _get_bundle_url: without one, requests can
    # block indefinitely on an unresponsive server.
    requests.post(f"{_API_URL}/{locale}/downloaders", json=payload, timeout=10.0).json()
39
+
40
+
41
def download_language(dl_manager, lang, root_dir):
    """Download and extract the delta bundle for one language.

    The bundle URL is requested from the API; while the API hands back the
    bare S3 endpoint instead of a usable URL, the request is retried (six
    fetches in total, 15 seconds apart). An extracted directory is moved to
    ``root_dir / "data" / lang``; anything else is treated as "no update for
    this language in the release".

    Args:
        dl_manager: Object providing ``download_and_extract(url)``.
        lang: Language code of the bundle to fetch.
        root_dir: ``pathlib.Path`` under which ``data/<lang>`` is created.

    Raises:
        ConnectionError: If no usable URL was obtained after all attempts.
    """
    # URL the code treats as an unsuccessful fetch.
    unusable_url = "https://s3.dualstack.us-west-2.amazonaws.com/"
    max_attempts = 6

    _log_download(lang, _BUNDLE_VERSION)
    url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA)
    attempt = 1
    while url == unusable_url:
        if attempt == max_attempts:
            raise ConnectionError(f"Cannot download '{lang.upper()}' data, fetched url: {url}. ")
        attempt += 1
        logging.warning(f"Unsuccessful attempt to fetch data url. Trying {attempt} time. ")
        time.sleep(15)
        _log_download(lang, _BUNDLE_VERSION)
        url = _get_bundle_url(lang, _BUNDLE_URL_TEMPLATE_DELTA)

    logging.info(f"Trying to download data for '{lang.upper()}'... ")
    path = dl_manager.download_and_extract(url)
    if os.path.isdir(path):
        logging.info(f"'{lang.upper()}' data downloaded to {path}. ")
        destination = root_dir / f"data/{lang}"
        # Make sure "data/" exists so the move does not depend on shutil's
        # fallback behaviour for a missing destination parent.
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(path, destination)
    else:  # if it's not a dir, there was no data update in the release
        logging.info(f"No data for '{lang.upper()}' found. ")
61
+
62
+
63
def main():
    """Download the delta bundle of every language listed in ``langs.json``.

    Languages already recorded in ``langs_ok.txt`` are skipped, so an
    interrupted run can be resumed. Each processed language is appended as
    ``<index>_<lang>`` to ``langs_ok.txt`` on success, or to
    ``langs_failed.txt`` when ``download_language`` raises ``ConnectionError``.
    """
    root_dir = Path("")
    with open("langs.json", "r") as f:
        languages = list(json.load(f))

    ok_path = root_dir / "langs_ok.txt"
    if ok_path.exists():
        with open(ok_path) as f:
            # Records look like "<index>_<lang>". Strip before testing for
            # emptiness so a whitespace-only line cannot raise IndexError.
            langs_to_skip = {
                record.split("_", 1)[1]
                for record in (line.strip() for line in f.read().split("\n"))
                if record
            }
        logging.info(f"Already downloaded languages: {langs_to_skip}")
    else:
        langs_to_skip = set()

    dl_config = DownloadConfig(
        cache_dir=root_dir / "cache",
        resume_download=True,
        max_retries=5,
    )
    dl_manager = DownloadManager(
        download_config=dl_config,
        record_checksums=False,
    )

    for lang_id, lang in enumerate(tqdm(languages, desc="Processing languages...")):
        if lang in langs_to_skip:
            logging.info(f"Data for '{lang.upper()}' language already downloaded, skipping it. ")
            continue
        try:
            download_language(dl_manager, lang, root_dir=root_dir)
            with open(ok_path, "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        except ConnectionError as e:
            # The error is raised with a single message argument, so
            # `e.strerror` is None; log the exception itself instead.
            logging.error(e)
            with open(root_dir / "langs_failed.txt", "a") as f:
                f.write(f"{lang_id}_{lang}\n")
        # NOTE(review): indentation is lost in the copy under review; this
        # pause is assumed to run after every processed language - confirm.
        time.sleep(10)
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()
langs.json ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ab": {
3
+ "train": 1,
4
+ "dev": 1,
5
+ "test": 1,
6
+ "other": 1,
7
+ "invalidated": 1
8
+ },
9
+ "ar": {
10
+ "train": 1,
11
+ "dev": 1,
12
+ "test": 1,
13
+ "other": 1,
14
+ "invalidated": 1
15
+ },
16
+ "as": {
17
+ "train": 1,
18
+ "dev": 1,
19
+ "test": 1,
20
+ "other": 1,
21
+ "invalidated": 1
22
+ },
23
+ "ast": {
24
+ "train": 1,
25
+ "dev": 0,
26
+ "test": 1,
27
+ "other": 1,
28
+ "invalidated": 0
29
+ },
30
+ "az": {
31
+ "train": 1,
32
+ "dev": 1,
33
+ "test": 1,
34
+ "other": 1,
35
+ "invalidated": 1
36
+ },
37
+ "ba": {
38
+ "train": 3,
39
+ "dev": 1,
40
+ "test": 1,
41
+ "other": 1,
42
+ "invalidated": 1
43
+ },
44
+ "bas": {
45
+ "train": 1,
46
+ "dev": 1,
47
+ "test": 1,
48
+ "other": 1,
49
+ "invalidated": 1
50
+ },
51
+ "be": {
52
+ "train": 9,
53
+ "dev": 1,
54
+ "test": 1,
55
+ "other": 1,
56
+ "invalidated": 1
57
+ },
58
+ "bg": {
59
+ "train": 1,
60
+ "dev": 1,
61
+ "test": 1,
62
+ "other": 1,
63
+ "invalidated": 1
64
+ },
65
+ "bn": {
66
+ "train": 1,
67
+ "dev": 1,
68
+ "test": 1,
69
+ "other": 6,
70
+ "invalidated": 1
71
+ },
72
+ "br": {
73
+ "train": 1,
74
+ "dev": 1,
75
+ "test": 1,
76
+ "other": 1,
77
+ "invalidated": 1
78
+ },
79
+ "ca": {
80
+ "train": 23,
81
+ "dev": 1,
82
+ "test": 1,
83
+ "other": 13,
84
+ "invalidated": 2
85
+ },
86
+ "ckb": {
87
+ "train": 1,
88
+ "dev": 1,
89
+ "test": 1,
90
+ "other": 1,
91
+ "invalidated": 1
92
+ },
93
+ "cnh": {
94
+ "train": 1,
95
+ "dev": 1,
96
+ "test": 1,
97
+ "other": 1,
98
+ "invalidated": 1
99
+ },
100
+ "cs": {
101
+ "train": 1,
102
+ "dev": 1,
103
+ "test": 1,
104
+ "other": 1,
105
+ "invalidated": 1
106
+ },
107
+ "cv": {
108
+ "train": 1,
109
+ "dev": 1,
110
+ "test": 1,
111
+ "other": 1,
112
+ "invalidated": 1
113
+ },
114
+ "cy": {
115
+ "train": 1,
116
+ "dev": 1,
117
+ "test": 1,
118
+ "other": 1,
119
+ "invalidated": 1
120
+ },
121
+ "da": {
122
+ "train": 1,
123
+ "dev": 1,
124
+ "test": 1,
125
+ "other": 1,
126
+ "invalidated": 1
127
+ },
128
+ "de": {
129
+ "train": 12,
130
+ "dev": 1,
131
+ "test": 1,
132
+ "other": 1,
133
+ "invalidated": 2
134
+ },
135
+ "dv": {
136
+ "train": 1,
137
+ "dev": 1,
138
+ "test": 1,
139
+ "other": 1,
140
+ "invalidated": 1
141
+ },
142
+ "el": {
143
+ "train": 1,
144
+ "dev": 1,
145
+ "test": 1,
146
+ "other": 1,
147
+ "invalidated": 1
148
+ },
149
+ "en": {
150
+ "train": 24,
151
+ "dev": 1,
152
+ "test": 1,
153
+ "other": 8,
154
+ "invalidated": 7
155
+ },
156
+ "eo": {
157
+ "train": 4,
158
+ "dev": 1,
159
+ "test": 1,
160
+ "other": 4,
161
+ "invalidated": 4
162
+ },
163
+ "es": {
164
+ "train": 6,
165
+ "dev": 1,
166
+ "test": 1,
167
+ "other": 30,
168
+ "invalidated": 2
169
+ },
170
+ "et": {
171
+ "train": 1,
172
+ "dev": 1,
173
+ "test": 1,
174
+ "other": 1,
175
+ "invalidated": 1
176
+ },
177
+ "eu": {
178
+ "train": 1,
179
+ "dev": 1,
180
+ "test": 1,
181
+ "other": 1,
182
+ "invalidated": 1
183
+ },
184
+ "fa": {
185
+ "train": 1,
186
+ "dev": 1,
187
+ "test": 1,
188
+ "other": 1,
189
+ "invalidated": 1
190
+ },
191
+ "fi": {
192
+ "train": 1,
193
+ "dev": 1,
194
+ "test": 1,
195
+ "other": 1,
196
+ "invalidated": 1
197
+ },
198
+ "fr": {
199
+ "train": 13,
200
+ "dev": 1,
201
+ "test": 1,
202
+ "other": 1,
203
+ "invalidated": 2
204
+ },
205
+ "fy-NL": {
206
+ "train": 1,
207
+ "dev": 1,
208
+ "test": 1,
209
+ "other": 2,
210
+ "invalidated": 1
211
+ },
212
+ "ga-IE": {
213
+ "train": 1,
214
+ "dev": 1,
215
+ "test": 1,
216
+ "other": 1,
217
+ "invalidated": 1
218
+ },
219
+ "gl": {
220
+ "train": 1,
221
+ "dev": 1,
222
+ "test": 1,
223
+ "other": 1,
224
+ "invalidated": 1
225
+ },
226
+ "gn": {
227
+ "train": 1,
228
+ "dev": 1,
229
+ "test": 1,
230
+ "other": 1,
231
+ "invalidated": 1
232
+ },
233
+ "ha": {
234
+ "train": 1,
235
+ "dev": 1,
236
+ "test": 1,
237
+ "other": 1,
238
+ "invalidated": 1
239
+ },
240
+ "hi": {
241
+ "train": 1,
242
+ "dev": 1,
243
+ "test": 1,
244
+ "other": 1,
245
+ "invalidated": 1
246
+ },
247
+ "hsb": {
248
+ "train": 1,
249
+ "dev": 1,
250
+ "test": 1,
251
+ "other": 0,
252
+ "invalidated": 1
253
+ },
254
+ "hu": {
255
+ "train": 1,
256
+ "dev": 1,
257
+ "test": 1,
258
+ "other": 1,
259
+ "invalidated": 1
260
+ },
261
+ "hy-AM": {
262
+ "train": 1,
263
+ "dev": 1,
264
+ "test": 1,
265
+ "other": 1,
266
+ "invalidated": 1
267
+ },
268
+ "ia": {
269
+ "train": 1,
270
+ "dev": 1,
271
+ "test": 1,
272
+ "other": 1,
273
+ "invalidated": 1
274
+ },
275
+ "id": {
276
+ "train": 1,
277
+ "dev": 1,
278
+ "test": 1,
279
+ "other": 1,
280
+ "invalidated": 1
281
+ },
282
+ "ig": {
283
+ "train": 1,
284
+ "dev": 1,
285
+ "test": 1,
286
+ "other": 1,
287
+ "invalidated": 1
288
+ },
289
+ "it": {
290
+ "train": 4,
291
+ "dev": 1,
292
+ "test": 1,
293
+ "other": 1,
294
+ "invalidated": 1
295
+ },
296
+ "ja": {
297
+ "train": 1,
298
+ "dev": 1,
299
+ "test": 1,
300
+ "other": 1,
301
+ "invalidated": 1
302
+ },
303
+ "ka": {
304
+ "train": 1,
305
+ "dev": 1,
306
+ "test": 1,
307
+ "other": 1,
308
+ "invalidated": 1
309
+ },
310
+ "kab": {
311
+ "train": 4,
312
+ "dev": 1,
313
+ "test": 1,
314
+ "other": 3,
315
+ "invalidated": 1
316
+ },
317
+ "kk": {
318
+ "train": 1,
319
+ "dev": 1,
320
+ "test": 1,
321
+ "other": 0,
322
+ "invalidated": 1
323
+ },
324
+ "kmr": {
325
+ "train": 1,
326
+ "dev": 1,
327
+ "test": 1,
328
+ "other": 1,
329
+ "invalidated": 1
330
+ },
331
+ "ky": {
332
+ "train": 1,
333
+ "dev": 1,
334
+ "test": 1,
335
+ "other": 1,
336
+ "invalidated": 1
337
+ },
338
+ "lg": {
339
+ "train": 2,
340
+ "dev": 1,
341
+ "test": 1,
342
+ "other": 1,
343
+ "invalidated": 1
344
+ },
345
+ "lt": {
346
+ "train": 1,
347
+ "dev": 1,
348
+ "test": 1,
349
+ "other": 1,
350
+ "invalidated": 1
351
+ },
352
+ "lv": {
353
+ "train": 1,
354
+ "dev": 1,
355
+ "test": 1,
356
+ "other": 1,
357
+ "invalidated": 1
358
+ },
359
+ "mdf": {
360
+ "train": 1,
361
+ "dev": 1,
362
+ "test": 1,
363
+ "other": 1,
364
+ "invalidated": 1
365
+ },
366
+ "mhr": {
367
+ "train": 2,
368
+ "dev": 1,
369
+ "test": 1,
370
+ "other": 1,
371
+ "invalidated": 1
372
+ },
373
+ "mk": {
374
+ "train": 1,
375
+ "dev": 0,
376
+ "test": 1,
377
+ "other": 1,
378
+ "invalidated": 1
379
+ },
380
+ "ml": {
381
+ "train": 1,
382
+ "dev": 0,
383
+ "test": 1,
384
+ "other": 1,
385
+ "invalidated": 1
386
+ },
387
+ "mn": {
388
+ "train": 1,
389
+ "dev": 1,
390
+ "test": 1,
391
+ "other": 1,
392
+ "invalidated": 1
393
+ },
394
+ "mr": {
395
+ "train": 1,
396
+ "dev": 1,
397
+ "test": 1,
398
+ "other": 1,
399
+ "invalidated": 1
400
+ },
401
+ "mrj": {
402
+ "train": 1,
403
+ "dev": 1,
404
+ "test": 1,
405
+ "other": 1,
406
+ "invalidated": 1
407
+ },
408
+ "mt": {
409
+ "train": 1,
410
+ "dev": 1,
411
+ "test": 1,
412
+ "other": 1,
413
+ "invalidated": 1
414
+ },
415
+ "myv": {
416
+ "train": 1,
417
+ "dev": 1,
418
+ "test": 1,
419
+ "other": 1,
420
+ "invalidated": 1
421
+ },
422
+ "nan-tw": {
423
+ "train": 1,
424
+ "dev": 1,
425
+ "test": 1,
426
+ "other": 1,
427
+ "invalidated": 1
428
+ },
429
+ "ne-NP": {
430
+ "train": 1,
431
+ "dev": 1,
432
+ "test": 1,
433
+ "other": 1,
434
+ "invalidated": 1
435
+ },
436
+ "nl": {
437
+ "train": 1,
438
+ "dev": 1,
439
+ "test": 1,
440
+ "other": 1,
441
+ "invalidated": 1
442
+ },
443
+ "nn-NO": {
444
+ "train": 1,
445
+ "dev": 1,
446
+ "test": 1,
447
+ "other": 0,
448
+ "invalidated": 1
449
+ },
450
+ "or": {
451
+ "train": 1,
452
+ "dev": 1,
453
+ "test": 1,
454
+ "other": 1,
455
+ "invalidated": 1
456
+ },
457
+ "pa-IN": {
458
+ "train": 1,
459
+ "dev": 1,
460
+ "test": 1,
461
+ "other": 1,
462
+ "invalidated": 1
463
+ },
464
+ "pl": {
465
+ "train": 1,
466
+ "dev": 1,
467
+ "test": 1,
468
+ "other": 1,
469
+ "invalidated": 1
470
+ },
471
+ "pt": {
472
+ "train": 1,
473
+ "dev": 1,
474
+ "test": 1,
475
+ "other": 1,
476
+ "invalidated": 1
477
+ },
478
+ "rm-sursilv": {
479
+ "train": 1,
480
+ "dev": 1,
481
+ "test": 1,
482
+ "other": 1,
483
+ "invalidated": 1
484
+ },
485
+ "rm-vallader": {
486
+ "train": 1,
487
+ "dev": 1,
488
+ "test": 1,
489
+ "other": 1,
490
+ "invalidated": 1
491
+ },
492
+ "ro": {
493
+ "train": 1,
494
+ "dev": 1,
495
+ "test": 1,
496
+ "other": 1,
497
+ "invalidated": 1
498
+ },
499
+ "ru": {
500
+ "train": 1,
501
+ "dev": 1,
502
+ "test": 1,
503
+ "other": 1,
504
+ "invalidated": 1
505
+ },
506
+ "rw": {
507
+ "train": 26,
508
+ "dev": 1,
509
+ "test": 1,
510
+ "other": 2,
511
+ "invalidated": 6
512
+ },
513
+ "sah": {
514
+ "train": 1,
515
+ "dev": 1,
516
+ "test": 1,
517
+ "other": 1,
518
+ "invalidated": 1
519
+ },
520
+ "sat": {
521
+ "train": 1,
522
+ "dev": 0,
523
+ "test": 1,
524
+ "other": 1,
525
+ "invalidated": 1
526
+ },
527
+ "sc": {
528
+ "train": 1,
529
+ "dev": 1,
530
+ "test": 1,
531
+ "other": 1,
532
+ "invalidated": 1
533
+ },
534
+ "sk": {
535
+ "train": 1,
536
+ "dev": 1,
537
+ "test": 1,
538
+ "other": 1,
539
+ "invalidated": 1
540
+ },
541
+ "skr": {
542
+ "train": 1,
543
+ "dev": 1,
544
+ "test": 1,
545
+ "other": 1,
546
+ "invalidated": 1
547
+ },
548
+ "sl": {
549
+ "train": 1,
550
+ "dev": 1,
551
+ "test": 1,
552
+ "other": 1,
553
+ "invalidated": 1
554
+ },
555
+ "sr": {
556
+ "train": 1,
557
+ "dev": 1,
558
+ "test": 1,
559
+ "other": 1,
560
+ "invalidated": 1
561
+ },
562
+ "sv-SE": {
563
+ "train": 1,
564
+ "dev": 1,
565
+ "test": 1,
566
+ "other": 1,
567
+ "invalidated": 1
568
+ },
569
+ "sw": {
570
+ "train": 1,
571
+ "dev": 1,
572
+ "test": 1,
573
+ "other": 7,
574
+ "invalidated": 2
575
+ },
576
+ "ta": {
577
+ "train": 2,
578
+ "dev": 1,
579
+ "test": 1,
580
+ "other": 3,
581
+ "invalidated": 1
582
+ },
583
+ "th": {
584
+ "train": 1,
585
+ "dev": 1,
586
+ "test": 1,
587
+ "other": 5,
588
+ "invalidated": 1
589
+ },
590
+ "ti": {
591
+ "train": 1,
592
+ "dev": 0,
593
+ "test": 1,
594
+ "other": 1,
595
+ "invalidated": 0
596
+ },
597
+ "tig": {
598
+ "train": 1,
599
+ "dev": 0,
600
+ "test": 1,
601
+ "other": 0,
602
+ "invalidated": 1
603
+ },
604
+ "tok": {
605
+ "train": 1,
606
+ "dev": 1,
607
+ "test": 1,
608
+ "other": 1,
609
+ "invalidated": 1
610
+ },
611
+ "tr": {
612
+ "train": 1,
613
+ "dev": 1,
614
+ "test": 1,
615
+ "other": 1,
616
+ "invalidated": 1
617
+ },
618
+ "tt": {
619
+ "train": 1,
620
+ "dev": 1,
621
+ "test": 1,
622
+ "other": 1,
623
+ "invalidated": 1
624
+ },
625
+ "tw": {
626
+ "train": 1,
627
+ "dev": 0,
628
+ "test": 0,
629
+ "other": 0,
630
+ "invalidated": 0
631
+ },
632
+ "ug": {
633
+ "train": 1,
634
+ "dev": 1,
635
+ "test": 1,
636
+ "other": 1,
637
+ "invalidated": 1
638
+ },
639
+ "uk": {
640
+ "train": 1,
641
+ "dev": 1,
642
+ "test": 1,
643
+ "other": 1,
644
+ "invalidated": 1
645
+ },
646
+ "ur": {
647
+ "train": 1,
648
+ "dev": 1,
649
+ "test": 1,
650
+ "other": 3,
651
+ "invalidated": 1
652
+ },
653
+ "uz": {
654
+ "train": 2,
655
+ "dev": 1,
656
+ "test": 1,
657
+ "other": 4,
658
+ "invalidated": 1
659
+ },
660
+ "vi": {
661
+ "train": 1,
662
+ "dev": 1,
663
+ "test": 1,
664
+ "other": 1,
665
+ "invalidated": 1
666
+ },
667
+ "vot": {
668
+ "train": 1,
669
+ "dev": 0,
670
+ "test": 1,
671
+ "other": 0,
672
+ "invalidated": 1
673
+ },
674
+ "yue": {
675
+ "train": 1,
676
+ "dev": 1,
677
+ "test": 1,
678
+ "other": 1,
679
+ "invalidated": 1
680
+ },
681
+ "zh-CN": {
682
+ "train": 1,
683
+ "dev": 1,
684
+ "test": 1,
685
+ "other": 18,
686
+ "invalidated": 1
687
+ },
688
+ "zh-HK": {
689
+ "train": 1,
690
+ "dev": 1,
691
+ "test": 1,
692
+ "other": 1,
693
+ "invalidated": 1
694
+ },
695
+ "zh-TW": {
696
+ "train": 1,
697
+ "dev": 1,
698
+ "test": 1,
699
+ "other": 2,
700
+ "invalidated": 1
701
+ }
702
+ }