marinone94 committed on
Commit
4abd0b0
·
1 Parent(s): 2d00b9a

add script to upload nst sv

Browse files
Files changed (2) hide show
  1. eda.ipynb +1030 -9
  2. upload_nst_sv_to_hf_dataset.py +103 -0
eda.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "c9526c52",
7
  "metadata": {},
8
  "outputs": [],
@@ -23,12 +23,12 @@
23
  },
24
  {
25
  "cell_type": "code",
26
- "execution_count": 2,
27
  "id": "cc9f1c45",
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
31
- "dataset_name = \"mozilla-foundation/common_voice_8_0\"\n",
32
  "dataset_config_name = \"sv-SE\"\n",
33
  "train_split_name = \"train+validation\"\n",
34
  "use_auth_token = True"
@@ -36,7 +36,7 @@
36
  },
37
  {
38
  "cell_type": "code",
39
- "execution_count": 3,
40
  "id": "21fd7030",
41
  "metadata": {},
42
  "outputs": [],
@@ -159,7 +159,7 @@
159
  },
160
  {
161
  "cell_type": "code",
162
- "execution_count": 28,
163
  "id": "7945cada",
164
  "metadata": {},
165
  "outputs": [
@@ -167,7 +167,7 @@
167
  "name": "stderr",
168
  "output_type": "stream",
169
  "text": [
170
- "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
171
  ]
172
  }
173
  ],
@@ -192,7 +192,7 @@
192
  },
193
  {
194
  "cell_type": "code",
195
- "execution_count": 29,
196
  "id": "1aead6a1",
197
  "metadata": {},
198
  "outputs": [],
@@ -226,7 +226,7 @@
226
  },
227
  {
228
  "cell_type": "code",
229
- "execution_count": 30,
230
  "id": "fc794e39",
231
  "metadata": {},
232
  "outputs": [
@@ -239,7 +239,7 @@
239
  "})"
240
  ]
241
  },
242
- "execution_count": 30,
243
  "metadata": {},
244
  "output_type": "execute_result"
245
  }
@@ -248,6 +248,1027 @@
248
  "test_data"
249
  ]
250
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  {
252
  "cell_type": "code",
253
  "execution_count": 31,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 5,
6
  "id": "c9526c52",
7
  "metadata": {},
8
  "outputs": [],
 
23
  },
24
  {
25
  "cell_type": "code",
26
+ "execution_count": 6,
27
  "id": "cc9f1c45",
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
31
+ "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
32
  "dataset_config_name = \"sv-SE\"\n",
33
  "train_split_name = \"train+validation\"\n",
34
  "use_auth_token = True"
 
36
  },
37
  {
38
  "cell_type": "code",
39
+ "execution_count": 7,
40
  "id": "21fd7030",
41
  "metadata": {},
42
  "outputs": [],
 
159
  },
160
  {
161
  "cell_type": "code",
162
+ "execution_count": 8,
163
  "id": "7945cada",
164
  "metadata": {},
165
  "outputs": [
 
167
  "name": "stderr",
168
  "output_type": "stream",
169
  "text": [
170
+ "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n"
171
  ]
172
  }
173
  ],
 
192
  },
193
  {
194
  "cell_type": "code",
195
+ "execution_count": 10,
196
  "id": "1aead6a1",
197
  "metadata": {},
198
  "outputs": [],
 
226
  },
227
  {
228
  "cell_type": "code",
229
+ "execution_count": 18,
230
  "id": "fc794e39",
231
  "metadata": {},
232
  "outputs": [
 
239
  "})"
240
  ]
241
  },
242
+ "execution_count": 18,
243
  "metadata": {},
244
  "output_type": "execute_result"
245
  }
 
248
  "test_data"
249
  ]
250
  },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": 14,
254
+ "id": "900052ec",
255
+ "metadata": {},
256
+ "outputs": [
257
+ {
258
+ "data": {
259
+ "text/plain": [
260
+ "['',\n",
261
+ " '',\n",
262
+ " '',\n",
263
+ " '',\n",
264
+ " '',\n",
265
+ " '',\n",
266
+ " '',\n",
267
+ " '',\n",
268
+ " '',\n",
269
+ " '',\n",
270
+ " '',\n",
271
+ " '',\n",
272
+ " '',\n",
273
+ " '',\n",
274
+ " '',\n",
275
+ " '',\n",
276
+ " '',\n",
277
+ " '',\n",
278
+ " '',\n",
279
+ " '',\n",
280
+ " '',\n",
281
+ " '',\n",
282
+ " '',\n",
283
+ " '',\n",
284
+ " '',\n",
285
+ " '',\n",
286
+ " '',\n",
287
+ " '',\n",
288
+ " '',\n",
289
+ " '',\n",
290
+ " '',\n",
291
+ " '',\n",
292
+ " '',\n",
293
+ " '',\n",
294
+ " '',\n",
295
+ " '',\n",
296
+ " '',\n",
297
+ " '',\n",
298
+ " '',\n",
299
+ " '',\n",
300
+ " '',\n",
301
+ " '',\n",
302
+ " '',\n",
303
+ " '',\n",
304
+ " '',\n",
305
+ " '',\n",
306
+ " '',\n",
307
+ " '',\n",
308
+ " '',\n",
309
+ " '',\n",
310
+ " '',\n",
311
+ " '',\n",
312
+ " '',\n",
313
+ " '',\n",
314
+ " '',\n",
315
+ " '',\n",
316
+ " '',\n",
317
+ " '',\n",
318
+ " '',\n",
319
+ " '',\n",
320
+ " '',\n",
321
+ " '',\n",
322
+ " '',\n",
323
+ " '',\n",
324
+ " '',\n",
325
+ " '',\n",
326
+ " '',\n",
327
+ " '',\n",
328
+ " '',\n",
329
+ " '',\n",
330
+ " '',\n",
331
+ " '',\n",
332
+ " '',\n",
333
+ " '',\n",
334
+ " '',\n",
335
+ " '',\n",
336
+ " '',\n",
337
+ " '',\n",
338
+ " '',\n",
339
+ " '',\n",
340
+ " '',\n",
341
+ " '',\n",
342
+ " '',\n",
343
+ " '',\n",
344
+ " '',\n",
345
+ " '',\n",
346
+ " '',\n",
347
+ " '',\n",
348
+ " '',\n",
349
+ " '',\n",
350
+ " '',\n",
351
+ " '',\n",
352
+ " '',\n",
353
+ " '',\n",
354
+ " '',\n",
355
+ " '',\n",
356
+ " '',\n",
357
+ " '',\n",
358
+ " '',\n",
359
+ " '',\n",
360
+ " '',\n",
361
+ " '',\n",
362
+ " '',\n",
363
+ " '',\n",
364
+ " '',\n",
365
+ " '',\n",
366
+ " '',\n",
367
+ " '',\n",
368
+ " '',\n",
369
+ " '',\n",
370
+ " '',\n",
371
+ " '',\n",
372
+ " '',\n",
373
+ " '',\n",
374
+ " '',\n",
375
+ " '',\n",
376
+ " '',\n",
377
+ " '',\n",
378
+ " '',\n",
379
+ " '',\n",
380
+ " '',\n",
381
+ " '',\n",
382
+ " '',\n",
383
+ " '',\n",
384
+ " '',\n",
385
+ " '',\n",
386
+ " '',\n",
387
+ " '',\n",
388
+ " '',\n",
389
+ " '',\n",
390
+ " '',\n",
391
+ " '',\n",
392
+ " '',\n",
393
+ " '',\n",
394
+ " '',\n",
395
+ " '',\n",
396
+ " '',\n",
397
+ " '',\n",
398
+ " '',\n",
399
+ " '',\n",
400
+ " '',\n",
401
+ " '',\n",
402
+ " '',\n",
403
+ " '',\n",
404
+ " '',\n",
405
+ " '',\n",
406
+ " '',\n",
407
+ " '',\n",
408
+ " '',\n",
409
+ " '',\n",
410
+ " '',\n",
411
+ " '',\n",
412
+ " '',\n",
413
+ " '',\n",
414
+ " '',\n",
415
+ " '',\n",
416
+ " '',\n",
417
+ " '',\n",
418
+ " '',\n",
419
+ " '',\n",
420
+ " '',\n",
421
+ " '',\n",
422
+ " '',\n",
423
+ " '',\n",
424
+ " '',\n",
425
+ " '',\n",
426
+ " '',\n",
427
+ " '',\n",
428
+ " '',\n",
429
+ " '',\n",
430
+ " '',\n",
431
+ " '',\n",
432
+ " '',\n",
433
+ " '',\n",
434
+ " '',\n",
435
+ " '',\n",
436
+ " '',\n",
437
+ " '',\n",
438
+ " '',\n",
439
+ " '',\n",
440
+ " '',\n",
441
+ " '',\n",
442
+ " '',\n",
443
+ " '',\n",
444
+ " '',\n",
445
+ " '',\n",
446
+ " '',\n",
447
+ " '',\n",
448
+ " '',\n",
449
+ " '',\n",
450
+ " '',\n",
451
+ " '',\n",
452
+ " '',\n",
453
+ " '',\n",
454
+ " '',\n",
455
+ " '',\n",
456
+ " '',\n",
457
+ " '',\n",
458
+ " '',\n",
459
+ " '',\n",
460
+ " '',\n",
461
+ " '',\n",
462
+ " '',\n",
463
+ " '',\n",
464
+ " '',\n",
465
+ " '',\n",
466
+ " '',\n",
467
+ " '',\n",
468
+ " '',\n",
469
+ " '',\n",
470
+ " '',\n",
471
+ " '',\n",
472
+ " '',\n",
473
+ " '',\n",
474
+ " '',\n",
475
+ " '',\n",
476
+ " '',\n",
477
+ " '',\n",
478
+ " '',\n",
479
+ " '',\n",
480
+ " '',\n",
481
+ " '',\n",
482
+ " '',\n",
483
+ " '',\n",
484
+ " '',\n",
485
+ " '',\n",
486
+ " '',\n",
487
+ " '',\n",
488
+ " '',\n",
489
+ " '',\n",
490
+ " '',\n",
491
+ " '',\n",
492
+ " '',\n",
493
+ " '',\n",
494
+ " '',\n",
495
+ " '',\n",
496
+ " '',\n",
497
+ " '',\n",
498
+ " '',\n",
499
+ " '',\n",
500
+ " '',\n",
501
+ " '',\n",
502
+ " '',\n",
503
+ " '',\n",
504
+ " '',\n",
505
+ " '',\n",
506
+ " '',\n",
507
+ " '',\n",
508
+ " '',\n",
509
+ " '',\n",
510
+ " '',\n",
511
+ " '',\n",
512
+ " '',\n",
513
+ " '',\n",
514
+ " '',\n",
515
+ " '',\n",
516
+ " '',\n",
517
+ " '',\n",
518
+ " '',\n",
519
+ " '',\n",
520
+ " '',\n",
521
+ " '',\n",
522
+ " '',\n",
523
+ " '',\n",
524
+ " '',\n",
525
+ " '',\n",
526
+ " '',\n",
527
+ " '',\n",
528
+ " '',\n",
529
+ " '',\n",
530
+ " '',\n",
531
+ " '',\n",
532
+ " '',\n",
533
+ " '',\n",
534
+ " '',\n",
535
+ " '',\n",
536
+ " '',\n",
537
+ " '',\n",
538
+ " '',\n",
539
+ " '',\n",
540
+ " '',\n",
541
+ " '',\n",
542
+ " '',\n",
543
+ " '',\n",
544
+ " '',\n",
545
+ " '',\n",
546
+ " '',\n",
547
+ " '',\n",
548
+ " '',\n",
549
+ " '',\n",
550
+ " '',\n",
551
+ " '',\n",
552
+ " '',\n",
553
+ " '',\n",
554
+ " '',\n",
555
+ " '',\n",
556
+ " '',\n",
557
+ " '',\n",
558
+ " '',\n",
559
+ " '',\n",
560
+ " '',\n",
561
+ " '',\n",
562
+ " '',\n",
563
+ " '',\n",
564
+ " '',\n",
565
+ " '',\n",
566
+ " '',\n",
567
+ " '',\n",
568
+ " '',\n",
569
+ " '',\n",
570
+ " '',\n",
571
+ " '',\n",
572
+ " '',\n",
573
+ " '',\n",
574
+ " '',\n",
575
+ " '',\n",
576
+ " '',\n",
577
+ " '',\n",
578
+ " '',\n",
579
+ " '',\n",
580
+ " '',\n",
581
+ " '',\n",
582
+ " '',\n",
583
+ " '',\n",
584
+ " '',\n",
585
+ " '',\n",
586
+ " '',\n",
587
+ " '',\n",
588
+ " '',\n",
589
+ " '',\n",
590
+ " '',\n",
591
+ " '',\n",
592
+ " '',\n",
593
+ " '',\n",
594
+ " '',\n",
595
+ " '',\n",
596
+ " '',\n",
597
+ " '',\n",
598
+ " '',\n",
599
+ " '',\n",
600
+ " '',\n",
601
+ " '',\n",
602
+ " '',\n",
603
+ " '',\n",
604
+ " '',\n",
605
+ " '',\n",
606
+ " '',\n",
607
+ " '',\n",
608
+ " '',\n",
609
+ " '',\n",
610
+ " '',\n",
611
+ " '',\n",
612
+ " '',\n",
613
+ " '',\n",
614
+ " '',\n",
615
+ " '',\n",
616
+ " '',\n",
617
+ " '',\n",
618
+ " '',\n",
619
+ " '',\n",
620
+ " '',\n",
621
+ " '',\n",
622
+ " '',\n",
623
+ " '',\n",
624
+ " '',\n",
625
+ " '',\n",
626
+ " '',\n",
627
+ " '',\n",
628
+ " '',\n",
629
+ " '',\n",
630
+ " '',\n",
631
+ " '',\n",
632
+ " '',\n",
633
+ " '',\n",
634
+ " '',\n",
635
+ " '',\n",
636
+ " '',\n",
637
+ " '',\n",
638
+ " '',\n",
639
+ " '',\n",
640
+ " '',\n",
641
+ " '',\n",
642
+ " '',\n",
643
+ " '',\n",
644
+ " '',\n",
645
+ " '',\n",
646
+ " '',\n",
647
+ " '',\n",
648
+ " '',\n",
649
+ " '',\n",
650
+ " '',\n",
651
+ " '',\n",
652
+ " '',\n",
653
+ " '',\n",
654
+ " '',\n",
655
+ " '',\n",
656
+ " '',\n",
657
+ " '',\n",
658
+ " '',\n",
659
+ " '',\n",
660
+ " '',\n",
661
+ " '',\n",
662
+ " '',\n",
663
+ " '',\n",
664
+ " '',\n",
665
+ " '',\n",
666
+ " '',\n",
667
+ " '',\n",
668
+ " '',\n",
669
+ " '',\n",
670
+ " '',\n",
671
+ " '',\n",
672
+ " '',\n",
673
+ " '',\n",
674
+ " '',\n",
675
+ " '',\n",
676
+ " '',\n",
677
+ " '',\n",
678
+ " '',\n",
679
+ " '',\n",
680
+ " '',\n",
681
+ " '',\n",
682
+ " '',\n",
683
+ " '',\n",
684
+ " '',\n",
685
+ " '',\n",
686
+ " '',\n",
687
+ " '',\n",
688
+ " '',\n",
689
+ " '',\n",
690
+ " '',\n",
691
+ " '',\n",
692
+ " '',\n",
693
+ " '',\n",
694
+ " '',\n",
695
+ " '',\n",
696
+ " '',\n",
697
+ " '',\n",
698
+ " '',\n",
699
+ " '',\n",
700
+ " '',\n",
701
+ " '',\n",
702
+ " '',\n",
703
+ " '',\n",
704
+ " '',\n",
705
+ " '',\n",
706
+ " '',\n",
707
+ " '',\n",
708
+ " '',\n",
709
+ " '',\n",
710
+ " '',\n",
711
+ " '',\n",
712
+ " '',\n",
713
+ " '',\n",
714
+ " '',\n",
715
+ " '',\n",
716
+ " '',\n",
717
+ " '',\n",
718
+ " '',\n",
719
+ " '',\n",
720
+ " '',\n",
721
+ " '',\n",
722
+ " '',\n",
723
+ " '',\n",
724
+ " '',\n",
725
+ " '',\n",
726
+ " '',\n",
727
+ " '',\n",
728
+ " '',\n",
729
+ " '',\n",
730
+ " '',\n",
731
+ " '',\n",
732
+ " '',\n",
733
+ " '',\n",
734
+ " '',\n",
735
+ " '',\n",
736
+ " '',\n",
737
+ " '',\n",
738
+ " '',\n",
739
+ " '',\n",
740
+ " '',\n",
741
+ " '',\n",
742
+ " '',\n",
743
+ " '',\n",
744
+ " '',\n",
745
+ " '',\n",
746
+ " '',\n",
747
+ " '',\n",
748
+ " '',\n",
749
+ " '',\n",
750
+ " '',\n",
751
+ " '',\n",
752
+ " '',\n",
753
+ " '',\n",
754
+ " '',\n",
755
+ " '',\n",
756
+ " '',\n",
757
+ " '',\n",
758
+ " '',\n",
759
+ " '',\n",
760
+ " '',\n",
761
+ " '',\n",
762
+ " '',\n",
763
+ " '',\n",
764
+ " '',\n",
765
+ " '',\n",
766
+ " '',\n",
767
+ " '',\n",
768
+ " '',\n",
769
+ " '',\n",
770
+ " '',\n",
771
+ " '',\n",
772
+ " '',\n",
773
+ " '',\n",
774
+ " '',\n",
775
+ " '',\n",
776
+ " '',\n",
777
+ " '',\n",
778
+ " '',\n",
779
+ " '',\n",
780
+ " '',\n",
781
+ " '',\n",
782
+ " '',\n",
783
+ " '',\n",
784
+ " '',\n",
785
+ " '',\n",
786
+ " '',\n",
787
+ " '',\n",
788
+ " '',\n",
789
+ " '',\n",
790
+ " '',\n",
791
+ " '',\n",
792
+ " '',\n",
793
+ " '',\n",
794
+ " '',\n",
795
+ " '',\n",
796
+ " '',\n",
797
+ " '',\n",
798
+ " '',\n",
799
+ " '',\n",
800
+ " '',\n",
801
+ " '',\n",
802
+ " '',\n",
803
+ " '',\n",
804
+ " '',\n",
805
+ " '',\n",
806
+ " '',\n",
807
+ " '',\n",
808
+ " '',\n",
809
+ " '',\n",
810
+ " '',\n",
811
+ " '',\n",
812
+ " '',\n",
813
+ " '',\n",
814
+ " '',\n",
815
+ " '',\n",
816
+ " '',\n",
817
+ " '',\n",
818
+ " '',\n",
819
+ " '',\n",
820
+ " '',\n",
821
+ " '',\n",
822
+ " '',\n",
823
+ " '',\n",
824
+ " '',\n",
825
+ " '',\n",
826
+ " '',\n",
827
+ " '',\n",
828
+ " '',\n",
829
+ " '',\n",
830
+ " '',\n",
831
+ " '',\n",
832
+ " '',\n",
833
+ " '',\n",
834
+ " '',\n",
835
+ " '',\n",
836
+ " '',\n",
837
+ " '',\n",
838
+ " '',\n",
839
+ " '',\n",
840
+ " '',\n",
841
+ " '',\n",
842
+ " '',\n",
843
+ " '',\n",
844
+ " '',\n",
845
+ " '',\n",
846
+ " '',\n",
847
+ " '',\n",
848
+ " '',\n",
849
+ " '',\n",
850
+ " '',\n",
851
+ " '',\n",
852
+ " '',\n",
853
+ " '',\n",
854
+ " '',\n",
855
+ " '',\n",
856
+ " '',\n",
857
+ " '',\n",
858
+ " '',\n",
859
+ " '',\n",
860
+ " '',\n",
861
+ " '',\n",
862
+ " '',\n",
863
+ " '',\n",
864
+ " '',\n",
865
+ " '',\n",
866
+ " '',\n",
867
+ " '',\n",
868
+ " '',\n",
869
+ " '',\n",
870
+ " '',\n",
871
+ " '',\n",
872
+ " '',\n",
873
+ " '',\n",
874
+ " '',\n",
875
+ " '',\n",
876
+ " '',\n",
877
+ " '',\n",
878
+ " '',\n",
879
+ " '',\n",
880
+ " '',\n",
881
+ " '',\n",
882
+ " '',\n",
883
+ " '',\n",
884
+ " '',\n",
885
+ " '',\n",
886
+ " '',\n",
887
+ " '',\n",
888
+ " '',\n",
889
+ " '',\n",
890
+ " '',\n",
891
+ " '',\n",
892
+ " '',\n",
893
+ " '',\n",
894
+ " '',\n",
895
+ " '',\n",
896
+ " '',\n",
897
+ " '',\n",
898
+ " '',\n",
899
+ " '',\n",
900
+ " '',\n",
901
+ " '',\n",
902
+ " '',\n",
903
+ " '',\n",
904
+ " '',\n",
905
+ " '',\n",
906
+ " '',\n",
907
+ " '',\n",
908
+ " '',\n",
909
+ " '',\n",
910
+ " '',\n",
911
+ " '',\n",
912
+ " '',\n",
913
+ " '',\n",
914
+ " '',\n",
915
+ " '',\n",
916
+ " '',\n",
917
+ " '',\n",
918
+ " '',\n",
919
+ " '',\n",
920
+ " '',\n",
921
+ " '',\n",
922
+ " '',\n",
923
+ " '',\n",
924
+ " '',\n",
925
+ " '',\n",
926
+ " '',\n",
927
+ " '',\n",
928
+ " '',\n",
929
+ " '',\n",
930
+ " '',\n",
931
+ " '',\n",
932
+ " '',\n",
933
+ " '',\n",
934
+ " '',\n",
935
+ " '',\n",
936
+ " '',\n",
937
+ " '',\n",
938
+ " '',\n",
939
+ " '',\n",
940
+ " '',\n",
941
+ " '',\n",
942
+ " '',\n",
943
+ " '',\n",
944
+ " '',\n",
945
+ " '',\n",
946
+ " '',\n",
947
+ " '',\n",
948
+ " '',\n",
949
+ " '',\n",
950
+ " '',\n",
951
+ " '',\n",
952
+ " '',\n",
953
+ " '',\n",
954
+ " '',\n",
955
+ " '',\n",
956
+ " '',\n",
957
+ " '',\n",
958
+ " '',\n",
959
+ " '',\n",
960
+ " '',\n",
961
+ " '',\n",
962
+ " '',\n",
963
+ " '',\n",
964
+ " '',\n",
965
+ " '',\n",
966
+ " '',\n",
967
+ " '',\n",
968
+ " '',\n",
969
+ " '',\n",
970
+ " '',\n",
971
+ " '',\n",
972
+ " '',\n",
973
+ " '',\n",
974
+ " '',\n",
975
+ " '',\n",
976
+ " '',\n",
977
+ " '',\n",
978
+ " '',\n",
979
+ " '',\n",
980
+ " '',\n",
981
+ " '',\n",
982
+ " '',\n",
983
+ " '',\n",
984
+ " '',\n",
985
+ " '',\n",
986
+ " '',\n",
987
+ " '',\n",
988
+ " '',\n",
989
+ " '',\n",
990
+ " '',\n",
991
+ " '',\n",
992
+ " '',\n",
993
+ " '',\n",
994
+ " '',\n",
995
+ " '',\n",
996
+ " '',\n",
997
+ " '',\n",
998
+ " '',\n",
999
+ " '',\n",
1000
+ " '',\n",
1001
+ " '',\n",
1002
+ " '',\n",
1003
+ " '',\n",
1004
+ " '',\n",
1005
+ " '',\n",
1006
+ " '',\n",
1007
+ " '',\n",
1008
+ " '',\n",
1009
+ " '',\n",
1010
+ " '',\n",
1011
+ " '',\n",
1012
+ " '',\n",
1013
+ " '',\n",
1014
+ " '',\n",
1015
+ " '',\n",
1016
+ " '',\n",
1017
+ " '',\n",
1018
+ " '',\n",
1019
+ " '',\n",
1020
+ " '',\n",
1021
+ " '',\n",
1022
+ " '',\n",
1023
+ " '',\n",
1024
+ " '',\n",
1025
+ " '',\n",
1026
+ " '',\n",
1027
+ " '',\n",
1028
+ " '',\n",
1029
+ " '',\n",
1030
+ " '',\n",
1031
+ " '',\n",
1032
+ " '',\n",
1033
+ " '',\n",
1034
+ " '',\n",
1035
+ " '',\n",
1036
+ " '',\n",
1037
+ " '',\n",
1038
+ " '',\n",
1039
+ " '',\n",
1040
+ " '',\n",
1041
+ " '',\n",
1042
+ " '',\n",
1043
+ " '',\n",
1044
+ " '',\n",
1045
+ " '',\n",
1046
+ " '',\n",
1047
+ " '',\n",
1048
+ " '',\n",
1049
+ " '',\n",
1050
+ " '',\n",
1051
+ " '',\n",
1052
+ " '',\n",
1053
+ " '',\n",
1054
+ " '',\n",
1055
+ " '',\n",
1056
+ " '',\n",
1057
+ " '',\n",
1058
+ " '',\n",
1059
+ " '',\n",
1060
+ " '',\n",
1061
+ " '',\n",
1062
+ " '',\n",
1063
+ " '',\n",
1064
+ " '',\n",
1065
+ " '',\n",
1066
+ " '',\n",
1067
+ " '',\n",
1068
+ " '',\n",
1069
+ " '',\n",
1070
+ " '',\n",
1071
+ " '',\n",
1072
+ " '',\n",
1073
+ " '',\n",
1074
+ " '',\n",
1075
+ " '',\n",
1076
+ " '',\n",
1077
+ " '',\n",
1078
+ " '',\n",
1079
+ " '',\n",
1080
+ " '',\n",
1081
+ " '',\n",
1082
+ " '',\n",
1083
+ " '',\n",
1084
+ " '',\n",
1085
+ " '',\n",
1086
+ " '',\n",
1087
+ " '',\n",
1088
+ " '',\n",
1089
+ " '',\n",
1090
+ " '',\n",
1091
+ " '',\n",
1092
+ " '',\n",
1093
+ " '',\n",
1094
+ " '',\n",
1095
+ " '',\n",
1096
+ " '',\n",
1097
+ " '',\n",
1098
+ " '',\n",
1099
+ " '',\n",
1100
+ " '',\n",
1101
+ " '',\n",
1102
+ " '',\n",
1103
+ " '',\n",
1104
+ " '',\n",
1105
+ " '',\n",
1106
+ " '',\n",
1107
+ " '',\n",
1108
+ " '',\n",
1109
+ " '',\n",
1110
+ " '',\n",
1111
+ " '',\n",
1112
+ " '',\n",
1113
+ " '',\n",
1114
+ " '',\n",
1115
+ " '',\n",
1116
+ " '',\n",
1117
+ " '',\n",
1118
+ " '',\n",
1119
+ " '',\n",
1120
+ " '',\n",
1121
+ " '',\n",
1122
+ " '',\n",
1123
+ " '',\n",
1124
+ " '',\n",
1125
+ " '',\n",
1126
+ " '',\n",
1127
+ " '',\n",
1128
+ " '',\n",
1129
+ " '',\n",
1130
+ " '',\n",
1131
+ " '',\n",
1132
+ " '',\n",
1133
+ " '',\n",
1134
+ " '',\n",
1135
+ " '',\n",
1136
+ " '',\n",
1137
+ " '',\n",
1138
+ " '',\n",
1139
+ " '',\n",
1140
+ " '',\n",
1141
+ " '',\n",
1142
+ " '',\n",
1143
+ " '',\n",
1144
+ " '',\n",
1145
+ " '',\n",
1146
+ " '',\n",
1147
+ " '',\n",
1148
+ " '',\n",
1149
+ " '',\n",
1150
+ " '',\n",
1151
+ " '',\n",
1152
+ " '',\n",
1153
+ " '',\n",
1154
+ " '',\n",
1155
+ " '',\n",
1156
+ " '',\n",
1157
+ " '',\n",
1158
+ " '',\n",
1159
+ " '',\n",
1160
+ " '',\n",
1161
+ " '',\n",
1162
+ " '',\n",
1163
+ " '',\n",
1164
+ " '',\n",
1165
+ " '',\n",
1166
+ " '',\n",
1167
+ " '',\n",
1168
+ " '',\n",
1169
+ " '',\n",
1170
+ " '',\n",
1171
+ " '',\n",
1172
+ " '',\n",
1173
+ " '',\n",
1174
+ " '',\n",
1175
+ " '',\n",
1176
+ " '',\n",
1177
+ " '',\n",
1178
+ " '',\n",
1179
+ " '',\n",
1180
+ " '',\n",
1181
+ " '',\n",
1182
+ " '',\n",
1183
+ " '',\n",
1184
+ " '',\n",
1185
+ " '',\n",
1186
+ " '',\n",
1187
+ " '',\n",
1188
+ " '',\n",
1189
+ " '',\n",
1190
+ " '',\n",
1191
+ " '',\n",
1192
+ " '',\n",
1193
+ " '',\n",
1194
+ " '',\n",
1195
+ " '',\n",
1196
+ " '',\n",
1197
+ " '',\n",
1198
+ " '',\n",
1199
+ " '',\n",
1200
+ " '',\n",
1201
+ " '',\n",
1202
+ " '',\n",
1203
+ " '',\n",
1204
+ " '',\n",
1205
+ " '',\n",
1206
+ " '',\n",
1207
+ " '',\n",
1208
+ " '',\n",
1209
+ " '',\n",
1210
+ " '',\n",
1211
+ " '',\n",
1212
+ " '',\n",
1213
+ " '',\n",
1214
+ " '',\n",
1215
+ " '',\n",
1216
+ " '',\n",
1217
+ " '',\n",
1218
+ " '',\n",
1219
+ " '',\n",
1220
+ " '',\n",
1221
+ " '',\n",
1222
+ " '',\n",
1223
+ " '',\n",
1224
+ " '',\n",
1225
+ " '',\n",
1226
+ " '',\n",
1227
+ " '',\n",
1228
+ " '',\n",
1229
+ " '',\n",
1230
+ " '',\n",
1231
+ " '',\n",
1232
+ " '',\n",
1233
+ " '',\n",
1234
+ " '',\n",
1235
+ " '',\n",
1236
+ " '',\n",
1237
+ " '',\n",
1238
+ " '',\n",
1239
+ " '',\n",
1240
+ " '',\n",
1241
+ " '',\n",
1242
+ " '',\n",
1243
+ " '',\n",
1244
+ " '',\n",
1245
+ " '',\n",
1246
+ " '',\n",
1247
+ " '',\n",
1248
+ " '',\n",
1249
+ " '',\n",
1250
+ " '',\n",
1251
+ " '',\n",
1252
+ " '',\n",
1253
+ " '',\n",
1254
+ " '',\n",
1255
+ " '',\n",
1256
+ " '',\n",
1257
+ " '',\n",
1258
+ " '',\n",
1259
+ " '',\n",
1260
+ " ...]"
1261
+ ]
1262
+ },
1263
+ "execution_count": 14,
1264
+ "metadata": {},
1265
+ "output_type": "execute_result"
1266
+ }
1267
+ ],
1268
+ "source": [
1269
+ "test_data[\"segment\"]"
1270
+ ]
1271
+ },
1272
  {
1273
  "cell_type": "code",
1274
  "execution_count": 31,
upload_nst_sv_to_hf_dataset.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Script to load, transform and upload the Swedish NST dataset to 🤗 datasets.
2
+
3
+ Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
4
+
5
+ Procedure:
6
+ 1. Loop over annotations
7
+ 2. Decide whether to discard specific item
8
+ 3. Create DatasetDict = {
9
+ features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
10
+ num_rows: 11030
11
+ }
12
+ 3b. Mapping common_voice <---> NST
13
+ - 'client_id': info.Speaker_ID
14
+ - 'path': val_recording.file
15
+ - 'audio': wav file (binary)
16
+ - 'sentence': val_recording.text
17
+ - 'up_votes': 0
18
+ - 'down_votes': 0
19
+ - 'age': info.Age
20
+ - 'gender': info.Sex
21
+ - 'accent': ""
22
+ - 'locale': "sv"
23
+ - 'segment': ""
24
+ 4. Dump to parquet
25
+ 5. Upload to hub
26
+
27
+ Filter out:
28
+ - single words
29
+ - single characters
30
+ - words split into single characters
31
+
32
+ """
33
+
34
+ import json
35
+ import os
36
+
37
+ from datasets import DatasetDict
38
+
39
+
40
# Target dataset repository on the Hugging Face hub.
hf_dataset_repo = "marinone94/nst_sv"
# Local directories holding the NST audio files and their JSON annotations.
# NOTE: these are absolute, machine-specific paths; adjust before running elsewhere.
audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
43
+
44
+
45
def load_audio_file(filepath):
    """Placeholder for loading the audio file at *filepath*.

    Audio loading is not implemented yet: the argument is ignored and
    ``None`` is always returned, so every dataset row currently carries
    ``None`` in its ``audio`` field.
    """
    # TODO: return the audio content (the module docstring mentions the
    # binary wav file) once the loading strategy is decided.
    return None
47
+
48
def is_record_valid(text):
    """Return True when *text* is a usable multi-word transcription.

    A record is rejected when, after splitting on whitespace, it has fewer
    than two tokens (empty text, a single word or a single character), or
    when every token is exactly one character long (a word spelled out
    letter by letter).
    """
    tokens = text.split()

    # Empty strings, single words and single characters.
    if len(tokens) < 2:
        return False

    # Words split into single characters, e.g. "a b c".
    if all(len(token) == 1 for token in tokens):
        return False

    return True
63
+
64
+
65
def create_dataset_row(annotation_filename):
    """Build the dataset rows described by one annotation file.

    The JSON file ``annotation_filename`` inside ``annotations_path`` is
    loaded, and one row is created for every entry of its
    ``"val_recordings"`` list whose ``"text"`` passes ``is_record_valid``.
    Each row is a dict using the common_voice column names.

    Args:
        annotation_filename: name of a JSON file located in
            ``annotations_path``.

    Returns:
        A list of row dicts; empty when no recording is valid.
    """
    annotations_filepath = os.path.join(annotations_path, annotation_filename)
    # JSON is UTF-8 by specification; be explicit so decoding does not
    # depend on the platform's default locale encoding.
    with open(annotations_filepath, "r", encoding="utf-8") as f:
        annotation = json.load(f)

    dataset_rows = []
    for recording in annotation["val_recordings"]:
        if not is_record_valid(recording["text"]):
            continue

        pid = annotation["pid"]
        info = annotation["info"]
        recording_file = recording["file"]
        # Audio files live in a per-speaker folder and are prefixed with the pid.
        audio_filepath = f"{audio_files_path}/{pid}/{pid}_{recording_file}"
        dataset_rows.append(
            {
                "client_id": info["Speaker_ID"],
                "path": recording_file,
                "audio": load_audio_file(audio_filepath),
                "sentence": recording["text"],
                # Columns without an NST counterpart get neutral defaults.
                "up_votes": 0,
                "down_votes": 0,
                "age": info["Age"],
                "gender": info["Sex"],
                "accent": "",
                "locale": "sv",
                "segment": "",
            }
        )

    return dataset_rows
90
+
91
+
92
# Smoke test: build rows from a small sample of annotation files only.
# NOTE: os.listdir() returns entries in arbitrary order, so the sample is not
# guaranteed to be the same between runs.
dataset_rows = []
for i, filename in enumerate(os.listdir(annotations_path)):
    dataset_rows.extend(create_dataset_row(filename))
    if i == 5:  # stop after the first six annotation files
        break

from pprint import pformat

# pformat() only builds the string; print it so the sample is actually shown.
print(pformat(dataset_rows))

# TODO: build the dataset and upload it to the hub (steps 3-5 of the module docstring).
# NOTE(review): DatasetDict presumably expects a mapping of split name -> Dataset,
# not a list of row dicts -- confirm against the datasets documentation.
# dataset = DatasetDict(dataset_rows)
# with open("temp.json", "w") as f:
#     json.dump(dataset_rows, f)  # json.dump takes (obj, fp)