Upload librispeech_asr.py
Browse files- librispeech_asr.py +132 -0
librispeech_asr.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
"""Librispeech automatic speech recognition dataset."""
|
3 |
+
|
4 |
+
import os
|
5 |
+
|
6 |
+
import datasets
|
7 |
+
|
8 |
+
# BibTeX entry for the paper that introduced the corpus.
_CITATION = """\
@inproceedings{panayotov2015librispeech,
title={Librispeech: an ASR corpus based on public domain audio books},
author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
pages={5206--5210},
year={2015},
organization={IEEE}
}
"""

# Human-readable summary shown in the dataset card.
_DESCRIPTION = """\
LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
audiobooks from the LibriVox project, and has been carefully segmented and aligned.
"""

# Homepage of the corpus and base URL under which the archives are hosted.
_URL = "http://www.openslr.org/12"
_DL_URL = "http://www.openslr.org/resources/12/"

# Archive to download for each split key; the keys are the names that
# `_split_generators` uses to look up the downloaded/extracted archives.
_DL_URLS = {
    "test": _DL_URL + "test-clean.tar.gz",
    "train.100": _DL_URL + "train-clean-100.tar.gz",
}
|
31 |
+
|
32 |
+
class LibrispeechASRConfig(datasets.BuilderConfig):
    """Builder configuration for the LibriSpeech ASR dataset."""

    def __init__(self, **kwargs):
        """Create a configuration pinned to dataset version 2.1.0.

        Args:
            **kwargs: keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__`` (for example ``name``
                and ``description``).
        """
        # The version is fixed here, so callers must not pass `version` too.
        pinned_version = datasets.Version("2.1.0", "")
        super().__init__(version=pinned_version, **kwargs)
|
45 |
+
|
46 |
+
class LibrispeechASR(datasets.GeneratorBasedBuilder):
    """Dataset builder for the LibriSpeech archives listed in ``_DL_URLS``.

    Produces two splits: ``train.100`` and ``test``.
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    # `datasets` discovers configurations through the plural `BUILDER_CONFIGS`
    # list, and `DEFAULT_CONFIG_NAME` must name one of its entries.
    BUILDER_CONFIGS = [
        LibrispeechASRConfig(name="clean", description="'Clean' speech."),
    ]
    DEFAULT_CONFIG_NAME = "clean"
    # Alias kept for any code that reads the singular attribute name.
    BUILDER_CONFIG = BUILDER_CONFIGS[0]

    def _info(self):
        """Return the dataset metadata: description, feature schema, homepage, citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archives and describe the ``train.100`` and ``test`` splits.

        Args:
            dl_manager: download manager supplied by ``datasets``.

        Returns:
            A list of two ``datasets.SplitGenerator`` objects whose
            ``gen_kwargs`` match the parameters of ``_generate_examples``.
        """
        archive_path = dl_manager.download(_DL_URLS)
        # (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files:
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}

        train_split = [
            datasets.SplitGenerator(
                name="train.100",
                gen_kwargs={
                    # `.get` yields None when nothing was extracted (streaming).
                    "local_extracted_archive": local_extracted_archive.get("train.100"),
                    "files": dl_manager.iter_archive(archive_path["train.100"]),
                },
            ),
        ]
        test_split = [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive.get("test"),
                    "files": dl_manager.iter_archive(archive_path["test"]),
                },
            )
        ]
        return train_split + test_split

    def _generate_examples(self, files, local_extracted_archive):
        """Yield ``(key, example)`` pairs from a LibriSpeech archive.

        Args:
            files: iterable of ``(path, file_object)`` pairs for the archive
                members; the file objects are read as bytes.
            local_extracted_archive: directory the archive was extracted to,
                or a falsy value when there is no local extraction.

        Yields:
            ``(key, example)`` where ``key`` is a running integer and
            ``example`` holds ``audio`` (``path`` and raw ``bytes``), ``id``,
            ``speaker_id``, ``chapter_id``, ``file`` and ``text``.
        """
        key = 0
        audio_data = {}
        transcripts = []
        for path, f in files:
            if path.endswith(".flac"):
                # Utterance id is the file name without its extension.
                id_ = path.split("/")[-1][: -len(".flac")]
                audio_data[id_] = f.read()
            elif path.endswith(".trans.txt"):
                # NOTE(review): assumes each transcript file sits in the same
                # archive directory as the .flac files it describes (standard
                # LibriSpeech layout) - confirm against the archives.
                archive_dir = os.path.dirname(path)
                for raw_line in f:
                    # Strip before testing so whitespace-only lines are
                    # skipped instead of failing the two-way split below.
                    line = raw_line.decode("utf-8").strip()
                    if not line:
                        continue
                    id_, transcript = line.split(" ", 1)
                    # Ids look like "<speaker>-<chapter>-<utterance>".
                    speaker_id, chapter_id = (int(el) for el in id_.split("-")[:2])
                    audio_file = f"{id_}.flac"
                    if local_extracted_archive:
                        # Keep the in-archive directory so the path points at
                        # the extracted file rather than the extraction root.
                        audio_file = os.path.join(local_extracted_archive, archive_dir, audio_file)
                    transcripts.append(
                        {
                            "id": id_,
                            "speaker_id": speaker_id,
                            "chapter_id": chapter_id,
                            "file": audio_file,
                            "text": transcript,
                        }
                    )
            # Emit the buffered batch as soon as every buffered transcript has
            # a matching audio blob (counts are equal), then reset the buffers
            # so memory use stays bounded by one batch.
            if audio_data and len(audio_data) == len(transcripts):
                for transcript in transcripts:
                    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
                    yield key, {"audio": audio, **transcript}
                    key += 1
                audio_data = {}
                transcripts = []
|