|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import csv |
|
import json |
|
import logging |
|
import multiprocessing |
|
import os |
|
import subprocess |
|
import sys |
|
import tarfile |
|
from multiprocessing.pool import ThreadPool |
|
from pathlib import Path |
|
from typing import List |
|
|
|
import sox |
|
from sox import Transformer |
|
from tqdm import tqdm |
|
|
|
# Command-line interface. The arguments are parsed at import time and the
# resulting module-level ``args`` namespace is read by the functions below.
parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.')
parser.add_argument("--data_root", default='CommonVoice_dataset/', type=str, help="Directory to store the dataset.")
parser.add_argument('--manifest_dir', default='./', type=str, help='Output directory for manifests')
parser.add_argument("--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--n_channels', default=1, type=int, help='Number of channels for output wav files')
parser.add_argument(
    "--log", dest="log", action="store_true", default=False, help="Enable INFO-level logging.",
)
parser.add_argument(
    "--cleanup",
    dest="cleanup",
    action="store_true",
    default=False,
    help="Remove the downloaded tar archive after unpacking.",
)
parser.add_argument(
    '--files_to_process',
    nargs='+',
    default=['test.tsv', 'dev.tsv', 'train.tsv'],
    type=str,
    # The description files are tab-separated (*.tsv), not *.csv.
    help='list of *.tsv file names to process',
)
parser.add_argument(
    '--version',
    default='cv-corpus-5.1-2020-06-22',
    type=str,
    help='Version of the dataset (obtainable via https://commonvoice.mozilla.org/en/datasets)',
)
parser.add_argument(
    '--language',
    default='en',
    type=str,
    help='Which language to download (default: English; '
    'check https://commonvoice.mozilla.org/en/datasets for more language codes)',
)
args = parser.parse_args()

# Archive location for the selected dataset version and language.
COMMON_VOICE_URL = (
    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/"
    f"{args.version}/{args.language}.tar.gz"
)
|
|
|
|
|
def create_manifest(data: List[tuple], output_name: str, manifest_path: str):
    """Write a JSON-lines manifest, one object per converted utterance.

    Args:
        data: ``(wav_path, duration, text)`` tuples; entries whose
            ``wav_path`` is the empty string are left out of the manifest.
        output_name: file name of the manifest to create.
        manifest_path: directory that receives the manifest; it is created
            (parents included) when it does not exist yet.
    """
    manifest_file = Path(manifest_path) / output_name
    manifest_file.parent.mkdir(exist_ok=True, parents=True)

    with manifest_file.open(mode='w') as manifest:
        for wav_path, duration, text in tqdm(data, total=len(data)):
            # An empty path marks an entry that was skipped during conversion.
            if wav_path == '':
                continue
            entry = {'audio_filepath': os.path.abspath(wav_path), "duration": duration, 'text': text}
            manifest.write(json.dumps(entry) + '\n')
|
|
|
|
|
def process_files(csv_file, data_root, num_workers):
    """ Read *.tsv file description, convert mp3 to wav, process text.
        Save results to data_root.

    Args:
        csv_file: str, path to the tab-separated file with data description;
            it must provide the 'path' and 'sentence' columns, and the clips
            are looked up in the 'clips' directory next to it
        data_root: str, path to dir to save results; wav/ dir will be created
        num_workers: int, number of threads used to convert the clips

    Returns:
        list of (wav_path, duration, text) tuples in the order of the rows of
        csv_file; a row whose clip is an empty file yields ('', '', '').
    """
    wav_dir = os.path.join(data_root, 'wav/')
    os.makedirs(wav_dir, exist_ok=True)
    # os.path.join keeps the path relative when csv_file has no directory
    # part; plain concatenation with '/clips/' would point at the root.
    audio_clips_path = os.path.join(os.path.dirname(csv_file), 'clips')

    def process(x):
        """Convert one (clip file name, sentence) pair; runs in a worker thread."""
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.lower().strip()
        audio_path = os.path.join(audio_clips_path, file_path)
        if os.path.getsize(audio_path) == 0:
            logging.warning(f'Skipping empty audio file {audio_path}')
            return '', '', ''

        output_wav_path = os.path.join(wav_dir, file_name + '.wav')

        # Existing wav files are reused so that a re-run does not convert again.
        if not os.path.exists(output_wav_path):
            tfm = Transformer()
            # Output format comes from the module-level command-line arguments.
            tfm.rate(samplerate=args.sample_rate)
            tfm.channels(n_channels=args.n_channels)
            tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)

        duration = sox.file_info.duration(output_wav_path)
        return output_wav_path, duration, text

    logging.info('Converting mp3 to wav for {}.'.format(csv_file))
    data = []
    # Explicit UTF-8 keeps decoding independent of the platform locale;
    # newline='' is what the csv module expects from its input file.
    with open(csv_file, encoding='utf-8', newline='') as csvfile:
        # DictReader consumes the header row by itself, so no extra next()
        # call is needed (it would silently drop the first utterance).
        # QUOTE_NONE: sentences may contain literal double quotes, which must
        # not be interpreted as CSV field quoting.
        reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            file_name = row['path']
            # Add the mp3 extension when the entry does not carry it.
            if not file_name.endswith('.mp3'):
                file_name += '.mp3'
            data.append((file_name, row['sentence']))

    with ThreadPool(num_workers) as pool:
        # imap yields results in input order, so the output lines up with the rows.
        data = list(tqdm(pool.imap(process, data), total=len(data)))
    return data
|
|
|
|
|
def main():
    """Download and unpack Common Voice if needed, convert it, write manifests.

    Steps, all driven by the module-level ``args``:
      1. Unless ``<data_root>/CV_unpacked`` already exists, fetch the archive
         with ``wget`` and unpack it there (optionally deleting the archive).
      2. Locate the directory holding the description files inside the
         unpacked tree.
      3. For every entry of ``args.files_to_process``, convert the clips and
         write ``commonvoice_<name>_manifest.json`` into ``args.manifest_dir``.

    Raises:
        subprocess.CalledProcessError: if the download command fails.
        SystemExit: with status 1 if the unpacked files cannot be located.
    """
    if args.log:
        logging.basicConfig(level=logging.INFO)

    data_root = args.data_root
    os.makedirs(data_root, exist_ok=True)

    target_unpacked_dir = os.path.join(data_root, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        logging.info('Find existing folder {}'.format(target_unpacked_dir))
    else:
        logging.info("Could not find Common Voice, Downloading corpus...")

        target_file = os.path.join(data_root, args.language + '.tar.gz')

        # The command is passed as an argument list (no shell), so paths with
        # spaces or shell metacharacters are handed to wget verbatim and the
        # user agent needs no embedded quoting.
        command = [
            'wget',
            '--user-agent',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            '-O',
            target_file,
            COMMON_VOICE_URL,
        ]
        # check=True: stop here on a failed download instead of trying to
        # unpack a missing or truncated archive.
        subprocess.run(command, stderr=sys.stderr, stdout=sys.stdout, check=True)

        os.makedirs(target_unpacked_dir, exist_ok=True)
        logging.info("Unpacking corpus to {} ...".format(target_unpacked_dir))
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(target_file) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Where extraction filters are available, refuse members that
                # would escape the target directory.
                tar.extractall(target_unpacked_dir, filter='data')
            else:
                tar.extractall(target_unpacked_dir)
        if args.cleanup:
            logging.info("removing tar archive to save space")
            os.remove(target_file)

    # The archive layout differs between releases: try the most specific
    # location first and fall back towards the unpack root.
    candidates = (
        os.path.join(target_unpacked_dir, args.version, args.language),
        os.path.join(target_unpacked_dir, args.version),
        target_unpacked_dir,
    )
    folder_path = next((path for path in candidates if os.path.isdir(path)), None)
    if folder_path is None:
        logging.error(f'unable to locate unpacked files in {target_unpacked_dir}')
        # Non-zero status so that callers can detect the failure.
        sys.exit(1)

    for csv_file in args.files_to_process:
        split_name = os.path.splitext(csv_file)[0]
        data = process_files(
            csv_file=os.path.join(folder_path, csv_file),
            data_root=os.path.join(data_root, split_name),
            num_workers=args.num_workers,
        )
        logging.info('Creating manifests...')
        create_manifest(
            data=data,
            output_name=f'commonvoice_{split_name}_manifest.json',
            manifest_path=args.manifest_dir,
        )
|
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|