File size: 8,758 Bytes
813828b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# coding=utf-8
# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Only support eager mode and TF>=2.0.0
# pylint: disable=no-member, invalid-name, relative-beyond-top-level
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
""" voxceleb 1 & 2 """
import hashlib
import os
import shlex
import shutil
import subprocess
import sys
import zipfile

import pandas
import soundfile as sf
from absl import logging
# Every archive (VoxCeleb 1 and VoxCeleb 2 alike) is listed under this prefix.
_VOX_URL_ROOT = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/"

# Subset name -> ordered list of urls to download. The "dev" subsets are split
# into "_parta<letter>" pieces that are concatenated into one zip after
# download; the "test" subsets are a single zip each.
SUBSETS = {
    "vox1_dev_wav": [_VOX_URL_ROOT + "vox1_dev_wav_parta" + letter for letter in "abcd"],
    "vox1_test_wav": [_VOX_URL_ROOT + "vox1_test_wav.zip"],
    "vox2_dev_aac": [_VOX_URL_ROOT + "vox2_dev_aac_parta" + letter for letter in "abcdefgh"],
    "vox2_test_aac": [_VOX_URL_ROOT + "vox2_test_aac.zip"],
}

# Subset name -> expected MD5 hex digest of the (re-assembled) zip archive.
MD5SUM = dict(
    vox1_dev_wav="ae63e55b951748cc486645f532ba230b",
    vox2_dev_aac="bbc063c46078a602ca71605645c2a402",
    vox1_test_wav="185fdc63c3c739954633d50379a3d102",
    vox2_test_aac="0d2b3ea430a821c33263b5ea37ede312",
)

# Credentials handed to wget; the script entry point fills them from argv.
USER = dict(user="", password="")

# Speaker name -> integer id, shared by every subset processed in this run.
speaker_id_dict = dict()
def download_and_extract(directory, subset, urls):
    """Download, verify and extract the given split of dataset.

    Each url is fetched with ``wget`` (credentials are read from the
    module-level ``USER`` dict) unless its target file already exists in
    ``directory``. When the last downloaded file is not a ``.zip``, the files
    are treated as parts of one archive and concatenated into
    ``<name>.zip``. The archive's MD5 digest is compared with
    ``MD5SUM[subset]``, the archive is extracted into ``directory`` and the
    top-level entry it contains is moved to ``<name>`` (the archive path
    without its ``.zip`` suffix). Downloaded files are kept on disk, so a
    later run skips the download step.

    Args:
        directory: the directory where to put the downloaded data.
        subset: subset name of the corpus; used to look up the expected MD5.
        urls: the list of urls to download the data file.

    Raises:
        ValueError: if ``urls`` is empty or the archive's MD5 does not match.
        RuntimeError: if ``wget`` cannot be started or exits with an error.
    """
    if not urls:
        raise ValueError("no download urls given for subset %s" % subset)
    os.makedirs(directory, exist_ok=True)

    part_paths = []
    for url in urls:
        part_path = os.path.join(directory, url.split("/")[-1])
        part_paths.append(part_path)
        if os.path.exists(part_path):
            # Already fetched by an earlier run; do not download it again.
            continue
        logging.info("Downloading %s to %s", url, part_path)
        # Argument list instead of a shell string: credentials or paths that
        # contain spaces or shell metacharacters are passed through verbatim.
        wget_cmd = ["wget", url, "--user", USER["user"], "--password", USER["password"], "-O", part_path]
        try:
            retcode = subprocess.call(wget_cmd)
        except OSError as err:
            raise RuntimeError("Failed to run wget: %s" % err) from err
        if retcode != 0:
            # A failed transfer may leave an empty or partial file behind;
            # remove it so the next run retries instead of skipping this url.
            if os.path.exists(part_path):
                os.remove(part_path)
            raise RuntimeError("wget exited with status %d for %s" % (retcode, url))
        logging.info("Successfully downloaded %s, size(bytes): %d", url, os.stat(part_path).st_size)

    last_path = part_paths[-1]
    if last_path.endswith(".zip"):
        zip_filepath = last_path
    else:
        # Multi-part download ("<name>_partaa", "<name>_partab", ...): join the
        # parts, in name order, into "<name>.zip". Only the files downloaded
        # above are read, never an older zip or an extracted directory.
        folder, last_name = os.path.split(last_path)
        zip_filepath = os.path.join(folder, last_name.rsplit("_", 1)[0] + ".zip")
        with open(zip_filepath, "wb") as f_zip:
            for part_path in sorted(part_paths):
                with open(part_path, "rb") as f_part:
                    shutil.copyfileobj(f_part, f_zip)
    # Drop exactly the ".zip" suffix. str.strip(".zip") would instead remove
    # any of the characters ".", "z", "i", "p" from BOTH ends of the path and
    # so corrupt relative directories such as "./data".
    extract_path = zip_filepath[: -len(".zip")]

    # check zip file md5sum, hashing in chunks so that a multi-gigabyte
    # archive is never held in memory at once
    digest = hashlib.md5()
    with open(zip_filepath, "rb") as f_zip:
        for chunk in iter(lambda: f_zip.read(1 << 20), b""):
            digest.update(chunk)
    if digest.hexdigest() != MD5SUM[subset]:
        raise ValueError("md5sum of %s mismatch" % zip_filepath)

    with zipfile.ZipFile(zip_filepath, "r") as zfile:
        zfile.extractall(directory)
        first_entry = zfile.infolist()[0].filename
    # Member names inside a zip always use "/"; keep only the top-level
    # component so the whole extracted tree is moved, not a nested entry.
    extract_path_ori = os.path.join(directory, first_entry.split("/")[0])
    if os.path.abspath(extract_path_ori) != os.path.abspath(extract_path):
        if subprocess.call(["mv", extract_path_ori, extract_path]) != 0:
            logging.warning("Could not move %s to %s", extract_path_ori, extract_path)
def exec_cmd(cmd):
    """Run a command in a subprocess.

    The command line is executed through the shell.

    Args:
        cmd: command line to be executed.

    Return:
        int, the return code. A negative value ``-N`` means the child was
        terminated by signal ``N``; -999 means the subprocess call itself
        raised ``OSError``.
    """
    try:
        retcode = subprocess.call(cmd, shell=True)
    except OSError as e:
        logging.info(f"Execution failed: {e}")
        return -999
    if retcode < 0:
        # subprocess reports death-by-signal as the NEGATED signal number,
        # so flip the sign to log the real signal.
        logging.info(f"Child was terminated by signal {-retcode}")
    return retcode
def decode_aac_with_ffmpeg(aac_file, wav_file):
    """Decode a given AAC file into WAV using ffmpeg.

    Args:
        aac_file: file path to input AAC file.
        wav_file: file path to output WAV file.

    Return:
        bool, True if success (the ffmpeg command returned 0).
    """
    # The command is run through a shell by exec_cmd, so quote both paths:
    # unquoted spaces or shell metacharacters would split or alter the command.
    # Paths made only of safe characters are left unchanged by shlex.quote.
    cmd = f"ffmpeg -i {shlex.quote(aac_file)} {shlex.quote(wav_file)}"
    logging.info(f"Decoding aac file using command line: {cmd}")
    ret = exec_cmd(cmd)
    if ret != 0:
        logging.error(f"Failed to decode aac file with retcode {ret}")
        logging.error("Please check your ffmpeg installation.")
        return False
    return True
def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.

    Walks ``<input_dir>/<subset>``, decodes every ``.m4a`` file to
    ``<file>.m4a.wav`` (unless that wav already exists) and writes one
    tab-separated csv row per wav file. The speaker name is the
    second-to-last component of the directory that holds the file; integer
    speaker ids are assigned in order of first appearance through the
    module-level ``speaker_id_dict``.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv

    Raises:
        RuntimeError: if an AAC file cannot be decoded.
    """
    logging.info("Preprocessing audio and label for subset %s", subset)
    source_dir = os.path.join(input_dir, subset)

    files = []
    # Convert all AAC file into WAV format. At the same time, generate the csv
    for root, dirnames, filenames in os.walk(source_dir):
        # os.walk yields entries in arbitrary order; sort so that speaker ids
        # and csv rows are reproducible from run to run.
        dirnames.sort()
        for filename in sorted(filenames):
            name, ext = os.path.splitext(filename)
            if ext.lower() == ".wav":
                # "<x>.m4a.wav" is a decoded copy; it is accounted for when
                # its ".m4a" source is visited, so skip it here.
                if os.path.splitext(name)[1]:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext.lower() == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not os.path.exists(wav_file) and not decode_aac_with_ffmpeg(aac_file, wav_file):
                    # Do not leave a partial wav behind: a later run would
                    # find it and skip decoding.
                    if os.path.exists(wav_file):
                        os.remove(wav_file)
                    raise RuntimeError("Audio decoding failed.")
            else:
                continue
            speaker_name = root.split(os.path.sep)[-2]
            speaker_id = speaker_id_dict.setdefault(speaker_name, len(speaker_id_dict))
            # Read the frame count from the file header instead of decoding
            # the whole signal just to measure its length.
            wav_length = sf.info(wav_file).frames
            files.append((os.path.abspath(wav_file), wav_length, speaker_id, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    # NOTE: despite its name, "wav_length_ms" holds the frame count reported
    # by soundfile, not milliseconds; the column name is kept unchanged
    # because the consumers of this csv are outside this file.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file %s", csv_file_path)
def processor(directory, subset, force_process):
    """Download and process one subset of voxceleb.

    Args:
        directory: the directory that holds the downloads, the extracted
            audio and the generated csv file.
        subset: the subset name; must be a key of ``SUBSETS``.
        force_process: when false and ``<directory>/<subset>.csv`` already
            exists, nothing is done and the existing csv path is returned.

    Returns:
        str, the path of the csv file of the subset.

    Raises:
        ValueError: if ``subset`` is not a known subset name.
    """
    if subset not in SUBSETS:
        # One formatted message; ValueError(subset, "...") would store a
        # tuple of args and print as "('name', 'is not in voxceleb')".
        raise ValueError("%s is not in voxceleb" % subset)

    csv_name = subset + ".csv"
    subset_csv = os.path.join(directory, csv_name)
    if not force_process and os.path.exists(subset_csv):
        return subset_csv

    logging.info("Downloading and process the voxceleb in %s", directory)
    logging.info("Preparing subset %s", subset)
    download_and_extract(directory, subset, SUBSETS[subset])
    convert_audio_and_make_label(directory, subset, directory, csv_name)
    logging.info("Finished downloading and processing")
    return subset_csv
if __name__ == "__main__":
    # Script entry point: argv carries the save directory and the VoxCeleb
    # download credentials; every subset in SUBSETS is then prepared in turn.
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) != 4:
        print("Usage: python prepare_data.py save_directory user password")
        # A usage error is a failure: exit non-zero so calling scripts notice.
        sys.exit(1)
    DIR, USER["user"], USER["password"] = sys.argv[1:4]
    for SUBSET in SUBSETS:
        # force_process=False: subsets whose csv already exists are skipped.
        processor(DIR, SUBSET, False)
|