Sin2pi committed
Commit 22ef493 · verified · 1 parent: 5e4bd82

Upload librispeech_asr.py

Files changed (1): librispeech_asr.py (+132 −0)
librispeech_asr.py ADDED
@@ -0,0 +1,132 @@
"""Librispeech automatic speech recognition dataset."""

import os

import datasets

_CITATION = """\
@inproceedings{panayotov2015librispeech,
  title={Librispeech: an ASR corpus based on public domain audio books},
  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
  pages={5206--5210},
  year={2015},
  organization={IEEE}
}
"""

_DESCRIPTION = """\
LibriSpeech is a corpus of approximately 1000 hours of read English speech with a sampling rate of 16 kHz,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
audiobooks from the LibriVox project, and has been carefully segmented and aligned.
"""

_URL = "http://www.openslr.org/12"
_DL_URL = "http://www.openslr.org/resources/12/"

_DL_URLS = {
    "test": _DL_URL + "test-clean.tar.gz",
    "train.100": _DL_URL + "train-clean-100.tar.gz",
}


class LibrispeechASRConfig(datasets.BuilderConfig):
    """BuilderConfig for LibrispeechASR."""

    def __init__(self, **kwargs):
        """
        Args:
            data_dir: `string`, the path to the folder containing the files in the
                downloaded .tar
            citation: `string`, citation for the data set
            url: `string`, url for information about the data set
            **kwargs: keyword arguments forwarded to super.
        """
        super(LibrispeechASRConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)


class LibrispeechASR(datasets.GeneratorBasedBuilder):
    """Librispeech dataset."""

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "clean"
    BUILDER_CONFIGS = [LibrispeechASRConfig(name="clean", description="'Clean' speech.")]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_DL_URLS)
        # (Optional) In non-streaming mode, we can extract the archives locally to have actual local audio files:
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}

        train_split = [
            datasets.SplitGenerator(
                name="train.100",
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive.get("train.100"),
                    "files": dl_manager.iter_archive(archive_path["train.100"]),
                },
            ),
        ]
        test_split = [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive.get("test"),
                    "files": dl_manager.iter_archive(archive_path["test"]),
                },
            ),
        ]
        return train_split + test_split

    def _generate_examples(self, files, local_extracted_archive):
        """Generate examples from a LibriSpeech archive_path."""
        key = 0
        audio_data = {}
        transcripts = []
        for path, f in files:
            if path.endswith(".flac"):
                id_ = path.split("/")[-1][: -len(".flac")]
                audio_data[id_] = f.read()
            elif path.endswith(".trans.txt"):
                for line in f:
                    if line:
                        line = line.decode("utf-8").strip()
                        id_, transcript = line.split(" ", 1)
                        audio_file = f"{id_}.flac"
                        speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                        audio_file = (
                            os.path.join(local_extracted_archive, audio_file)
                            if local_extracted_archive
                            else audio_file
                        )
                        transcripts.append(
                            {
                                "id": id_,
                                "speaker_id": speaker_id,
                                "chapter_id": chapter_id,
                                "file": audio_file,
                                "text": transcript,
                            }
                        )
            # Once every buffered .flac has a matching transcript line, flush the batch of examples.
            if audio_data and len(audio_data) == len(transcripts):
                for transcript in transcripts:
                    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
                    yield key, {"audio": audio, **transcript}
                    key += 1
                audio_data = {}
                transcripts = []
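For reference, a minimal usage sketch (not part of the commit): it assumes the script above is saved locally as librispeech_asr.py and that the Hugging Face datasets library is installed. The config name "clean" and the split names "train.100" and "test" come from the builder defined above; the trust_remote_code flag is an assumption that applies to recent datasets releases which require it for dataset loading scripts.

    # Hypothetical usage sketch; file path and trust_remote_code flag are assumptions.
    from datasets import load_dataset

    # Regular mode: downloads and extracts the tar archives, yielding local audio paths.
    train = load_dataset("librispeech_asr.py", "clean", split="train.100", trust_remote_code=True)
    print(train[0]["text"], train[0]["audio"]["sampling_rate"])

    # Streaming mode: iterates examples directly out of the remote archives without extraction.
    test_stream = load_dataset("librispeech_asr.py", "clean", split="test", streaming=True, trust_remote_code=True)
    print(next(iter(test_stream))["id"])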