|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import glob |
|
import json |
|
import logging |
|
import os |
|
import shutil |
|
import subprocess |
|
import urllib.request |
|
|
|
# Command-line interface.
# NOTE: the arguments are parsed at import time and exposed through the
# module-level ``args`` object that ``main()`` reads.
parser = argparse.ArgumentParser(description='DEMAND noise database download')
parser.add_argument(
    "--data_root",
    required=True,
    type=str,
    help="directory the archives are downloaded to and extracted in",
)
parser.add_argument(
    "--data_sets",
    default="ALL",
    type=str,
    help='comma-separated noise set names (keys of URLS), or "ALL" for every set',
)
parser.add_argument(
    '--log',
    dest='log',
    action='store_true',
    default=False,
    help="enable INFO-level logging",
)
args = parser.parse_args()
|
|
|
# Download location of every noise archive, keyed by set name. All archives
# live under the same Zenodo record and follow the "<NAME>_16k.zip" pattern.
URLS = {
    set_name: "https://zenodo.org/record/1227121/files/" + set_name + "_16k.zip"
    for set_name in (
        'DKITCHEN',
        'DLIVING',
        'DWASHING',
        'NFIELD',
        'NPARK',
        'NRIVER',
        'OHALLWAY',
        'OMEETING',
        'OOFFICE',
        'PCAFETER',
        'PRESTO',
        'PSTATION',
        'SPSQUARE',
        'STRAFFIC',
        'TBUS',
        'TCAR',
        'TMETRO',
    )
}
|
|
|
|
|
def __maybe_download_file(destination: str, source: str):
    """
    Fetch an archive unless a file already exists at the target path.

    The transfer is written to ``destination + '.tmp'`` and renamed to
    ``destination`` after it has finished.

    Args:
        destination: local filepath the archive should end up at
        source: key into ``URLS`` selecting the archive to fetch
    Returns:
        ``destination``, unchanged
    """
    url = URLS[source]

    # Nothing to do when the archive is already on disk.
    if os.path.exists(destination):
        logging.info("Destination {0} exists. Skipping.".format(destination))
        return destination

    logging.info("{0} does not exist. Downloading ...".format(destination))
    partial_path = destination + '.tmp'
    urllib.request.urlretrieve(url, filename=partial_path)
    os.rename(partial_path, destination)
    logging.info("Downloaded {0}.".format(destination))
    return destination
|
|
|
|
|
def __extract_file(filepath: str, data_dir: str):
    """
    Unpack an archive into a directory.

    Args:
        filepath: path of the archive to unpack
        data_dir: directory the archive contents are extracted into
    """
    shutil.unpack_archive(filename=filepath, extract_dir=data_dir)
|
|
|
|
|
def __create_manifest(dst_folder: str):
    """
    Write a JSON-lines manifest for the ``.wav`` files of one noise folder.

    The manifest is created at
    ``<parent of dst_folder>/manifests/<base name of dst_folder>.json``.
    It holds one JSON object per ``.wav`` file, in sorted path order, with the
    keys ``audio_filepath``, ``text`` (always an empty string) and ``duration``
    (the value printed by ``soxi -D``, converted to float).

    Args:
        dst_folder: folder holding the ``.wav`` files; its base name is used
            as the manifest file name
    Returns:
        None
    Raises:
        subprocess.CalledProcessError: if ``soxi`` exits with a non-zero status
        FileNotFoundError: if the ``soxi`` executable cannot be found
    """
    noise_name = os.path.basename(dst_folder)
    wav_files = sorted(glob.glob(dst_folder + "/*.wav"))

    manifest_dir = os.path.join(os.path.dirname(dst_folder), "manifests")
    os.makedirs(manifest_dir, exist_ok=True)

    with open(os.path.join(manifest_dir, noise_name + ".json"), "w") as mfst_f:
        for wav_f in wav_files:
            # Pass the command as an argv list (no shell) so that paths with
            # spaces or shell metacharacters reach soxi unmodified.
            dur = subprocess.check_output(["soxi", "-D", wav_f])
            row = {"audio_filepath": wav_f, "text": "", "duration": float(dur)}
            mfst_f.write(json.dumps(row) + "\n")
|
|
|
|
|
def main():
    """
    Download, extract and index the requested noise sets.

    Reads ``data_root``, ``data_sets`` and ``log`` from the module-level
    ``args``. For each requested set the archive is downloaded into
    ``data_root`` (skipped when the file already exists), unpacked there, and
    a manifest is written for the folder ``<data_root>/<set name>``.

    Raises:
        ValueError: if a requested set name is not a key of ``URLS``
    """
    data_root = args.data_root
    data_sets = args.data_sets

    if args.log:
        logging.basicConfig(level=logging.INFO)
    os.makedirs(data_root, exist_ok=True)

    if data_sets == "ALL":
        data_sets = list(URLS)
    else:
        # Normalise once up front so that validation, the archive file name and
        # the URL lookup all agree ("dkitchen, npark" == "DKITCHEN,NPARK").
        data_sets = [name.strip().upper() for name in data_sets.split(',')]

    for data_set in data_sets:
        if data_set not in URLS:
            # Report the offending name, not the whole request.
            raise ValueError(f"{data_set} is not part of demand noise database")
        logging.info("\n\nWorking on: {0}".format(data_set))
        filepath = os.path.join(data_root, data_set + "_16k.zip")
        logging.info("Getting {0}".format(data_set))
        __maybe_download_file(filepath, data_set)
        logging.info("Extracting {0}".format(data_set))
        __extract_file(filepath, data_root)
        logging.info("Processing {0}".format(data_set))
        __create_manifest(os.path.join(data_root, data_set))
    logging.info('Done!')
|
|
|
|
|
# Script entry point: run the download pipeline when executed directly.
if __name__ == '__main__':
    main()
|
|