File size: 3,673 Bytes
3b17866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Lint as: python3
"""Simple, minimal ASR dataset template."""


import csv
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


# Citation text handed to `datasets.DatasetInfo`; intentionally empty.
_CITATION: str = ""

# Human-readable description handed to `datasets.DatasetInfo`.
_DESCRIPTION: str = "This is a private dataset\n"

# Homepage reported in the dataset info.
_URL: str = "https://localhost"
# Archive that the download manager fetches and extracts.
_DL_URL: str = "http://localhost:8000/data_simple.tgz"


class SimpleTplConfig(datasets.BuilderConfig):
    """BuilderConfig for the simple ASR dataset template."""

    def __init__(self, name, **kwargs):
        """Create a config with a fixed version and description.

        Args:
          name: `string`, name of this configuration.
          **kwargs: keyword arguments forwarded to super. `version` and
            `description` are set here, so passing either of them raises
            a `TypeError` for the duplicated keyword.
        """
        # Extra attribute carried by this config; nothing in the visible
        # source reads it.
        self.num_of_voice = 100

        super().__init__(
            name=name,
            version=datasets.Version("1.1.0", ""),
            description="Simple Dataset.",
            **kwargs,
        )

class SimpleTpl(datasets.GeneratorBasedBuilder):
    """Simple Speech dataset.

    Builds train, validation and test splits from the archive at `_DL_URL`.
    The archive is expected to contain a `data_simple` folder holding an
    `audio` directory plus `train.csv`, `valid.csv` and `test.csv`.
    """

    VERSION = datasets.Version("1.1.0")

    DEFAULT_WRITER_BATCH_SIZE = 1000
    # NOTE: the only registered config is a plain `datasets.BuilderConfig`;
    # `SimpleTplConfig` from this file is not used here.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            version=VERSION,
            description="The simple dataset",
        )
    ]

    def _info(self):
        """Return the dataset metadata: features, homepage, citation, task."""
        features = datasets.Features(
            {
                "audio": datasets.Audio(sampling_rate=16000),
                "path": datasets.Value("string"),
                "sentence": datasets.Value("string"),
            }
        )
        asr_task = AutomaticSpeechRecognition(
            audio_file_path_column="path",
            transcription_column="sentence",
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            task_templates=[asr_task],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then describe the three splits.

        Args:
          dl_manager: download manager supplied by `datasets`; only its
            `download_and_extract` method is used.

        Returns:
          A list of `datasets.SplitGenerator` for TRAIN, VALIDATION and TEST.
          Each one passes the shared audio directory and its own CSV file to
          `_generate_examples`.
        """
        extracted = dl_manager.download_and_extract(_DL_URL)
        root_path = os.path.join(extracted, "data_simple")
        wav_path = os.path.join(root_path, "audio")

        csv_by_split = (
            (datasets.Split.TRAIN, "train.csv"),
            (datasets.Split.VALIDATION, "valid.csv"),
            (datasets.Split.TEST, "test.csv"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "wav_path": wav_path,
                    "csv_path": os.path.join(root_path, csv_name),
                },
            )
            for split_name, csv_name in csv_by_split
        ]

    def _generate_examples(self, wav_path, csv_path):
        """Yield `(key, example)` pairs for one split.

        Args:
          wav_path: directory holding the extracted audio files. A path read
            from the CSV is joined onto this directory when the joined path
            is an existing file; otherwise the CSV value is used unchanged
            (absolute CSV paths are therefore always kept as they are).
          csv_path: UTF-8 CSV file made of a header row followed by
            `path,sentence` rows.

        Yields:
          `(audio_file, example)` where `example` has the keys `path`,
          `audio` and `sentence`; `path` and `audio` both hold `audio_file`.

        Raises:
          ValueError: if a non-empty row does not have exactly two columns.
        """
        # newline="" is the csv module's recommended way to open its input.
        with open(csv_path, encoding="utf-8", newline="") as csv_file:
            # quotechar=None turns quote handling off, so a comma inside a
            # sentence splits the row into extra columns (reported below).
            csv_reader = csv.reader(
                csv_file,
                delimiter=",",
                quotechar=None,
                skipinitialspace=True,
            )

            # The first row is the header and carries no example.
            next(csv_reader, None)

            for row in csv_reader:
                if not row:
                    # Blank lines come back as empty rows; ignore them.
                    continue
                if len(row) != 2:
                    raise ValueError(
                        f"{csv_path}, line {csv_reader.line_num}: expected 2 "
                        f"columns (path, sentence) but found {len(row)}"
                    )

                audio_ref, sentence = row

                # Prefer the file inside the extracted audio directory. Fall
                # back to the raw CSV value so CSVs that already hold usable
                # paths keep working.
                candidate = os.path.join(wav_path, audio_ref)
                audio_file = candidate if os.path.isfile(candidate) else audio_ref

                example = {
                    "path": audio_file,
                    "audio": audio_file,
                    "sentence": sentence,
                }

                yield audio_file, example