File size: 5,458 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# USAGE: python get_demand_data.py --data_root=<where to put data>
# --data_sets=<datasets_to_download>
# where <datasets_to_download> can be: one or more of the 16 kHz noise profiles
# listed at https://zenodo.org/record/1227121#.Ygb4avXMKJk ,
# or ALL
# You can put more than one data_set comma-separated:
# --data_sets=DKITCHEN,DLIVING,NRIVER
import argparse
import glob
import json
import logging
import os
import shutil
import subprocess
import urllib.request
# Command-line interface for downloading the 16 kHz DEMAND noise profiles.
# NOTE: description fixed — this script downloads DEMAND noise data, not
# LibriSpeech (the original text was a copy-paste leftover).
parser = argparse.ArgumentParser(description='DEMAND noise data download')
# Root directory where archives are downloaded and extracted.
# (`default=None` dropped: it is ignored when required=True.)
parser.add_argument("--data_root", required=True, type=str)
# Comma-separated noise profile names (keys of URLS below), or ALL.
parser.add_argument("--data_sets", default="ALL", type=str)
# Enable INFO-level logging output.
parser.add_argument('--log', dest='log', action='store_true', default=False)
args = parser.parse_args()

# Download URL for each 16 kHz DEMAND noise profile
# (https://zenodo.org/record/1227121).
URLS = {
    'DKITCHEN': "https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip",
    'DLIVING': "https://zenodo.org/record/1227121/files/DLIVING_16k.zip",
    'DWASHING': "https://zenodo.org/record/1227121/files/DWASHING_16k.zip",
    'NFIELD': "https://zenodo.org/record/1227121/files/NFIELD_16k.zip",
    'NPARK': "https://zenodo.org/record/1227121/files/NPARK_16k.zip",
    'NRIVER': "https://zenodo.org/record/1227121/files/NRIVER_16k.zip",
    'OHALLWAY': "https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip",
    'OMEETING': "https://zenodo.org/record/1227121/files/OMEETING_16k.zip",
    'OOFFICE': "https://zenodo.org/record/1227121/files/OOFFICE_16k.zip",
    'PCAFETER': "https://zenodo.org/record/1227121/files/PCAFETER_16k.zip",
    'PRESTO': "https://zenodo.org/record/1227121/files/PRESTO_16k.zip",
    'PSTATION': "https://zenodo.org/record/1227121/files/PSTATION_16k.zip",
    'SPSQUARE': "https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip",
    'STRAFFIC': "https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip",
    'TBUS': "https://zenodo.org/record/1227121/files/TBUS_16k.zip",
    'TCAR': "https://zenodo.org/record/1227121/files/TCAR_16k.zip",
    'TMETRO': "https://zenodo.org/record/1227121/files/TMETRO_16k.zip",
}
def __maybe_download_file(destination: str, source: str):
    """
    Download the archive for one noise profile, unless already present.

    Args:
        destination: local filepath to download to
        source: key into URLS naming the noise profile (not a URL itself)
    Returns:
        the destination filepath
    """
    url = URLS[source]
    if os.path.exists(destination):
        logging.info("Destination {0} exists. Skipping.".format(destination))
        return destination
    logging.info("{0} does not exist. Downloading ...".format(destination))
    # Download to a temp name first so a partial download never
    # masquerades as a finished file on a later run.
    tmp_path = destination + '.tmp'
    urllib.request.urlretrieve(url, filename=tmp_path)
    os.rename(tmp_path, destination)
    logging.info("Downloaded {0}.".format(destination))
    return destination
def __extract_file(filepath: str, data_dir: str):
shutil.unpack_archive(filepath, data_dir)
def __create_manifest(dst_folder: str):
"""
Create manifests for the noise files
Args:
file_path: path to a source transcript with flac sources
dst_folder: path where manifests will be created
Returns:
a list of metadata entries for processed files.
"""
# Read directory
# Get all wav file names
# create line per wav file in manifest
noise_name = os.path.basename(dst_folder)
wav_files = glob.glob(dst_folder + "/*.wav")
wav_files.sort()
os.makedirs(os.path.join(os.path.dirname(dst_folder), "manifests"), exist_ok=True)
with open(os.path.join(os.path.dirname(dst_folder), "manifests", noise_name + ".json"), "w") as mfst_f:
for wav_f in wav_files:
dur = subprocess.check_output("soxi -D {0}".format(wav_f), shell=True)
row = {"audio_filepath": wav_f, "text": "", "duration": float(dur)}
mfst_f.write(json.dumps(row) + "\n")
def main():
    """Download, extract, and build manifests for the requested DEMAND sets."""
    data_root = args.data_root
    data_sets = args.data_sets
    if args.log:
        logging.basicConfig(level=logging.INFO)
    os.makedirs(data_root, exist_ok=True)
    if data_sets == "ALL":
        data_sets = URLS.keys()
    else:
        # Normalize user input once: the original validated the raw name
        # against the uppercase URLS keys but downloaded with .upper(),
        # so lowercase names were rejected inconsistently.
        data_sets = [name.strip().upper() for name in data_sets.split(',')]
    for data_set in data_sets:
        if data_set not in URLS:
            # Report the single offending entry (the original f-string
            # mistakenly interpolated the whole list, `data_sets`).
            raise ValueError(f"{data_set} is not part of demand noise database")
        logging.info("\n\nWorking on: {0}".format(data_set))
        filepath = os.path.join(data_root, data_set + "_16k.zip")
        logging.info("Getting {0}".format(data_set))
        __maybe_download_file(filepath, data_set)
        logging.info("Extracting {0}".format(data_set))
        __extract_file(filepath, data_root)
        logging.info("Processing {0}".format(data_set))
        __create_manifest(os.path.join(data_root, data_set))
    logging.info('Done!')
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|