import json
import logging
import os
import shutil
from pathlib import Path

import torchaudio
from speechbrain.utils.data_utils import download_file
from tqdm import tqdm

from hw_asr.base.base_dataset import BaseDataset
from hw_asr.utils import ROOT_PATH

logger = logging.getLogger(__name__)
URL_LINKS = {
    "dev-clean": "https://www.openslr.org/resources/12/dev-clean.tar.gz",
    "dev-other": "https://www.openslr.org/resources/12/dev-other.tar.gz",
    "test-clean": "https://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other": "https://www.openslr.org/resources/12/test-other.tar.gz",
    "train-clean-100": "https://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "train-clean-360": "https://www.openslr.org/resources/12/train-clean-360.tar.gz",
    "train-other-500": "https://www.openslr.org/resources/12/train-other-500.tar.gz",
}

class LibrispeechDataset(BaseDataset):
    def __init__(self, part, data_dir=None, *args, **kwargs):
        assert part in URL_LINKS or part == "train_all"

        if data_dir is None:
            data_dir = ROOT_PATH / "data" / "datasets" / "librispeech"
            data_dir.mkdir(exist_ok=True, parents=True)
        self._data_dir = data_dir
        if part == "train_all":
            # "train_all" concatenates the indices of every train-* part.
            index = sum([self._get_or_load_index(part)
                         for part in URL_LINKS if "train" in part], [])
        else:
            index = self._get_or_load_index(part)

        super().__init__(index, *args, **kwargs)

    def _load_part(self, part):
        arch_path = self._data_dir / f"{part}.tar.gz"
        logger.info(f"Loading part {part}")
        download_file(URL_LINKS[part], arch_path)
        shutil.unpack_archive(arch_path, self._data_dir)
        # The archive unpacks into a nested LibriSpeech/ folder; flatten it
        # into the dataset root, then remove the archive and the empty folder.
        for fpath in (self._data_dir / "LibriSpeech").iterdir():
            shutil.move(str(fpath), str(self._data_dir / fpath.name))
        os.remove(str(arch_path))
        shutil.rmtree(str(self._data_dir / "LibriSpeech"))

    def _get_or_load_index(self, part):
        # The index is cached as JSON so it is only built once per part.
        index_path = self._data_dir / f"{part}_index.json"
        if index_path.exists():
            with index_path.open() as f:
                index = json.load(f)
        else:
            index = self._create_index(part)
            with index_path.open("w") as f:
                json.dump(index, f, indent=2)
        return index

    def _create_index(self, part):
        index = []
        split_dir = self._data_dir / part
        if not split_dir.exists():
            self._load_part(part)

        # Collect every directory that contains .flac utterances.
        flac_dirs = set()
        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
            if any(f.endswith(".flac") for f in filenames):
                flac_dirs.add(dirpath)
        for flac_dir in tqdm(
                list(flac_dirs), desc=f"Preparing librispeech folders: {part}"
        ):
            flac_dir = Path(flac_dir)
            # Each directory holds a single *.trans.txt file that maps
            # utterance ids to their transcriptions, one per line.
            trans_path = list(flac_dir.glob("*.trans.txt"))[0]
            with trans_path.open() as f:
                for line in f:
                    f_id = line.split()[0]
                    f_text = " ".join(line.split()[1:]).strip()
                    flac_path = flac_dir / f"{f_id}.flac"
                    t_info = torchaudio.info(str(flac_path))
                    length = t_info.num_frames / t_info.sample_rate
                    index.append(
                        {
                            "path": str(flac_path.absolute().resolve()),
                            "text": f_text.lower(),
                            "audio_len": length,
                        }
                    )
        return index
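

if __name__ == "__main__":
    # Usage sketch (illustration only, not part of the dataset class):
    # inspect a cached index produced by this dataset. Assumes "dev-clean"
    # has already been downloaded and indexed at the default location used
    # in __init__ at least once.
    index_path = (
        ROOT_PATH / "data" / "datasets" / "librispeech" / "dev-clean_index.json"
    )
    with index_path.open() as f:
        index = json.load(f)
    print(f"dev-clean: {len(index)} utterances")
    print(index[0])  # {"path": ..., "text": ..., "audio_len": ...}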