File size: 13,287 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 |
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
from pathlib import Path
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest
from nemo.utils import logging
def get_unaligned_files(unaligned_path):
    """
    Collect the utterance IDs listed in an "unaligned" file so they can be filtered out
    (utterances without alignments cannot be used for data simulation).

    Every non-blank line starts with the utterance ID, optionally followed by a comment
    giving the reason the utterance could not be aligned.

    Example: unaligned.txt

        <utterance_id> <comment>
        1272-128104-0000 (no such file)
        2289-152257-0025 (no such file)
        2289-152257-0026 (mapping failed)
        ...

    Args:
        unaligned_path (str): Path to the file containing unaligned examples

    Returns:
        skip_files (list): Unaligned file names to skip, in file order
    """
    with open(unaligned_path, 'r', encoding='utf-8') as handle:
        # Blank / whitespace-only lines carry no utterance ID and are dropped.
        stripped_lines = (raw_line.strip() for raw_line in handle)
        # Only the first whitespace-separated token is the ID; the rest is the comment.
        return [entry.split()[0] for entry in stripped_lines if entry]
def create_new_ctm_entry(session_name, speaker_id, wordlist, alignments, output_precision=3):
    """
    Create new CTM entries (to write to the output ctm file).

    ``alignments[i]`` is the end time of ``wordlist[i]``; the start time of a word is the
    end time of the previous token. Empty strings in ``wordlist`` are silences and produce
    no CTM line, but their end times still act as the start of the following word.

    Args:
        session_name (str): Current session name.
        speaker_id (str or int): LibriSpeech speaker ID for the current entry.
        wordlist (list): List of words (empty strings mark silence).
        alignments (list): List of end times, one per item of ``wordlist``.
        output_precision (int): Number of decimal places for CTM outputs.

    Returns:
        arr (list): List of ``(start_time, ctm_line)`` tuples, one per non-empty word.
    """
    arr = []
    for i, word in enumerate(wordlist):
        if word == "":
            continue
        # The first token has no predecessor, so it starts at 0.0. Indexing with i - 1
        # here would wrap around to the *last* end time and yield a negative duration.
        prev_end = alignments[i - 1] if i > 0 else 0.0
        align1 = float(round(prev_end, output_precision))
        align2 = float(round(alignments[i] - prev_end, output_precision))
        text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n"
        arr.append((align1, text))
    return arr
def load_librispeech_alignment(alignment_filepath: str) -> dict:
    """
    Load alignment data for librispeech.

    Every non-blank line of the file must consist of exactly three whitespace-separated
    fields: ``<file_id> <words> <timestamps>``. The ``words`` and ``timestamps`` fields
    are returned as the raw strings found in the file (no unquoting or splitting is
    done here).

    Args:
        alignment_filepath (str): Path to the file containing alignments

    Returns:
        alignments (dict): Maps each file ID to a ``(words, timestamps)`` tuple of strings

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    alignments = {}
    # Explicit UTF-8 keeps decoding independent of the platform's default locale and
    # matches how the unaligned-file list is read elsewhere in this script.
    with open(alignment_filepath, "r", encoding="utf-8") as fin:
        for line_number, line in enumerate(fin, start=1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 3:
                raise ValueError(
                    f"Malformed alignment line {line_number} in {alignment_filepath}: "
                    f"expected 3 fields (file_id, words, timestamps), got {len(fields)}"
                )
            file_id, words, timestamps = fields
            alignments[file_id] = (words, timestamps)
    return alignments
def create_librispeech_ctm_alignments(
    input_manifest_filepath, base_alignment_path, ctm_output_directory, libri_dataset_split
):
    """
    Create new CTM alignments using input LibriSpeech word alignments.

    One ``<file_id>.ctm`` file is written into ``ctm_output_directory`` for every manifest
    entry that has alignment data. Entries listed in ``<base_alignment_path>/unaligned.txt``
    (if that file exists) and entries missing from their alignment file are skipped.

    NOTE: an existing ``ctm_output_directory`` is deleted and recreated.

    Args:
        input_manifest_filepath (str): Path to the input LibriSpeech manifest file
        base_alignment_path (str): Path to the base directory containing the LibriSpeech word alignments
        ctm_output_directory (str): Directory to write the CTM files to
        libri_dataset_split (str): Which split of the LibriSpeech dataset is being used
            (underscores are converted to dashes to build the directory name)

    Raises:
        ValueError: If the input manifest contains no entries.
    """
    manifest = read_manifest(input_manifest_filepath)
    # Validate the input before touching the output directory, so that a wrong manifest
    # path does not wipe previously generated CTM files.
    if len(manifest) == 0:
        raise ValueError(f"Input manifest is empty: {input_manifest_filepath}")

    unaligned_path = os.path.join(base_alignment_path, "unaligned.txt")
    if os.path.exists(unaligned_path):
        unaligned_file_ids = set(get_unaligned_files(unaligned_path))
    else:
        unaligned_file_ids = set()

    libri_dataset_split = libri_dataset_split.replace("_", "-")

    # Start from a clean output directory: remove a stale one, then (re)create it.
    if os.path.isdir(ctm_output_directory):
        logging.info(f"Removing existing output directory: {ctm_output_directory}")
        shutil.rmtree(ctm_output_directory)
    if not os.path.exists(ctm_output_directory):
        logging.info(f"Creating output directory: {ctm_output_directory}")
        # makedirs (rather than mkdir) so that nested output paths work as well.
        os.makedirs(ctm_output_directory)

    # All utterances of a speaker/book pair share one alignment file; cache the parsed
    # content so each file is read once instead of once per utterance.
    alignment_cache = {}

    for entry in manifest:
        audio_file = entry['audio_filepath']
        file_id = Path(audio_file).stem

        if file_id in unaligned_file_ids:
            continue

        # File IDs look like <speaker_id>-<book_id>-<utterance_index>.
        id_parts = file_id.split('-')
        speaker_id = id_parts[0]
        book_id = id_parts[1]
        book_dir = os.path.join(base_alignment_path, "LibriSpeech", libri_dataset_split, speaker_id, book_id)
        alignment_filepath = os.path.join(book_dir, f"{speaker_id}-{book_id}.alignment.txt")

        if alignment_filepath not in alignment_cache:
            alignment_cache[alignment_filepath] = load_librispeech_alignment(alignment_filepath)
        alignment_data = alignment_cache[alignment_filepath]

        if file_id not in alignment_data:
            logging.warning(f"Cannot find alignment data for {audio_file} in {alignment_filepath}")
            continue

        # Both fields are quoted, comma-separated lists; strip the quotes before splitting.
        words, end_times = alignment_data[file_id]
        words = words.replace('"', '').lower().split(',')
        end_times = [float(e) for e in end_times.replace('"', '').split(',')]

        ctm_list = create_new_ctm_entry(file_id, speaker_id, words, end_times)
        write_ctm(os.path.join(ctm_output_directory, file_id + '.ctm'), ctm_list)
def _parse_ctm_lines(ctm_lines, silence_dur_threshold, output_precision):
    """
    Convert the lines of one CTM file into parallel word / end-time lists.

    Each line is expected to hold at least five whitespace-separated fields:
    ``<session> <speaker_id> <start> <duration> <word> ...``.

    Silence is represented by an empty-string word:
      * a gap before the first word (start > 0) always becomes a silence token;
      * a gap between two words longer than ``silence_dur_threshold`` becomes a silence token;
      * a shorter gap is absorbed by extending the previous word's end time to the
        start of the current word.

    Returns:
        (words, end_times, speaker_id, last_end): the token list, the end time of every
        token, the speaker ID read from the last line, and the end time of the last word.
    """
    words = []
    end_times = []
    speaker_id = None
    prev_end = 0
    for idx, line in enumerate(ctm_lines):
        # split() (not split(' ')) tolerates tabs, repeated spaces and the trailing newline.
        ctm = line.split()
        speaker_id = ctm[1]
        start = round(float(ctm[2]), output_precision)
        end = round(float(ctm[2]) + float(ctm[3]), output_precision)
        interval = start - prev_end

        if (idx == 0 and interval > 0) or (idx > 0 and interval > silence_dur_threshold):
            words.append("")
            end_times.append(start)
        elif idx > 0:
            end_times[-1] = start

        words.append(ctm[4])
        end_times.append(end)
        prev_end = end
    return words, end_times, speaker_id, prev_end


def create_manifest_with_alignments(
    input_manifest_filepath,
    ctm_source_dir,
    output_manifest_filepath,
    data_format_style,
    silence_dur_threshold=0.1,
    output_precision=3,
):
    """
    Create new manifest file with word alignments using CTM files.

    For every input manifest entry the matching ``<name>.ctm`` file is looked up in
    ``ctm_source_dir``. Entries without a CTM file, and entries whose CTM file holds
    at most one word, are dropped. Every remaining entry is written with the added
    keys ``words``, ``alignments`` (end time of each word/silence) and ``speaker_id``.

    Args:
        input_manifest_filepath (str): Path to the input manifest file
        ctm_source_dir (str): Directory to read the CTM files from
        output_manifest_filepath (str): Path to the output manifest file containing word alignments
        data_format_style (str): If it contains "voxceleb", the CTM file name is built as
            ``<grandparent dir>-<parent dir>-<file stem>`` of the audio path; otherwise the
            audio file stem alone is used.
        silence_dur_threshold (float): Gaps between consecutive words longer than this
            (in the CTM's time unit) are emitted as explicit silence tokens
        output_precision (int): How many decimal places to keep in the manifest file
    """
    manifest = read_manifest(input_manifest_filepath)
    target_manifest = []

    for entry in manifest:
        audio_path = Path(entry['audio_filepath'])
        # Use the stem (everything before the final extension) so the name matches the
        # CTM files produced from Path(...).stem, even if the name contains extra periods.
        filename = audio_path.stem
        if "voxceleb" in data_format_style:
            filename = f"{audio_path.parent.parent.name}-{audio_path.parent.name}-{audio_path.stem}"
        ctm_filepath = os.path.join(ctm_source_dir, filename + '.ctm')

        if not os.path.isfile(ctm_filepath):
            logging.info(f"Skipping {filename}.wav as there is no corresponding CTM file")
            continue

        with open(ctm_filepath, 'r') as ctm_file:
            # Ignore blank lines (e.g. a trailing newline at the end of the file).
            lines = [line for line in ctm_file if line.strip()]

        # One-word samples should be filtered out.
        if len(lines) <= 1:
            continue

        words, end_times, speaker_id, last_end = _parse_ctm_lines(lines, silence_dur_threshold, output_precision)

        # Pad with trailing silence up to the end of the audio.
        if entry['duration'] > last_end:
            words.append("")
            end_times.append(entry['duration'])

        target_manifest.append(
            {
                'audio_filepath': entry['audio_filepath'],
                'duration': entry['duration'],
                'text': entry['text'],
                'words': words,
                'alignments': end_times,
                'speaker_id': speaker_id,
            }
        )

    logging.info(f"Writing output manifest file to {output_manifest_filepath}")
    write_manifest(output_manifest_filepath, target_manifest)
def main():
    """
    Create a combined manifest file including word alignments and speaker IDs.

    Reads its configuration from the module-level ``args`` namespace produced by the
    argument parser in the ``__main__`` section of this script.

    Raises:
        ValueError: If ``--base_alignment_path`` was not provided; it is needed both as
            the LibriSpeech alignment root and as the CTM source directory.
    """
    input_manifest_filepath = args.input_manifest_filepath
    base_alignment_path = args.base_alignment_path
    output_manifest_filepath = args.output_manifest_filepath
    ctm_output_directory = args.ctm_output_directory
    libri_dataset_split = args.libri_dataset_split
    use_ctm_alignment_source = args.use_ctm_alignment_source
    output_precision = args.output_precision

    # Fail early with a clear message instead of a TypeError from os.path.join(None, ...)
    # deep inside the processing functions.
    if base_alignment_path is None:
        raise ValueError(
            "--base_alignment_path is required: it must point to the LibriSpeech alignment directory, "
            "or to a directory of CTM files when --use_ctm_alignment_source is set"
        )

    if use_ctm_alignment_source:
        # Case 1: base_alignment_path already contains the CTM files.
        ctm_source_dir = base_alignment_path
    else:
        # Case 2: base_alignment_path contains the LibriSpeech word alignments
        # (*.alignment.txt); convert them into CTM files first.
        create_librispeech_ctm_alignments(
            input_manifest_filepath, base_alignment_path, ctm_output_directory, libri_dataset_split
        )
        ctm_source_dir = ctm_output_directory

    create_manifest_with_alignments(
        input_manifest_filepath,
        ctm_source_dir,
        output_manifest_filepath,
        data_format_style=args.data_format_style,
        silence_dur_threshold=args.silence_dur_threshold,
        output_precision=output_precision,
    )
if __name__ == "__main__":
    # This script creates a manifest file to be used for generating synthetic
    # multispeaker audio sessions. The script takes in the default manifest file
    # for a LibriSpeech dataset and corresponding word alignments and produces
    # a combined manifest file that contains word alignments and speaker IDs
    # per example. It can also be used to produce a manifest file for a different
    # dataset if alignments are passed in CTM files.
    #
    # The alignments are obtained from: https://github.com/CorentinJ/librispeech-alignments
    #
    # Args:
    #     input_manifest_filepath (str): Path to input manifest file
    #     base_alignment_path (str): Path to the base directory for the LibriSpeech alignment dataset
    #                                (specifically to the LibriSpeech-Alignments directory containing
    #                                both the LibriSpeech folder as well as the unaligned.txt file)
    #                                or to a directory containing the requisite CTM files
    #     output_manifest_filepath (str): Path to output manifest file
    #     ctm_output_directory (str): Path to output CTM directory (only used for LibriSpeech)
    #     libri_dataset_split (str): Which dataset split to create a combined manifest file for
    #     use_ctm_alignment_source (bool): If true, base_alignment_path points to a directory containing ctm files
    #     data_format_style (str): Naming scheme for speaker/utterance IDs (e.g. 'voxceleb')
    #     output_precision (int): Number of decimal places for the output alignments
    #     silence_dur_threshold (float): Threshold for inserting silence between words
    parser = argparse.ArgumentParser(description="LibriSpeech Alignment Manifest Creator")
    parser.add_argument("--input_manifest_filepath", help="path to input manifest file", type=str, required=True)
    # Required: it is read both in LibriSpeech mode (alignment root) and in CTM mode
    # (CTM source directory), so omitting it can only lead to a crash later on.
    parser.add_argument("--base_alignment_path", help="path to alignments (LibriSpeech)", type=str, required=True)
    parser.add_argument("--output_manifest_filepath", help="path to output manifest file", type=str, required=True)
    parser.add_argument(
        "--ctm_output_directory",
        help="path to output ctm directory for LibriSpeech (or to input CTM directory)",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--libri_dataset_split",
        help="which test/dev/training set to create a manifest for (only used for LibriSpeech)",
        type=str,
        required=False,
        default="",
    )
    parser.add_argument(
        "--use_ctm_alignment_source",
        help="if true, base_alignment_path points to a directory containing ctm files",
        action='store_true',
        required=False,
    )
    parser.add_argument(
        "--data_format_style",
        help="Use specific format for speaker IDs and utterance IDs. e.g. 'voxceleb', 'librispeech', 'swbd'",
        default="",
        type=str,
        required=False,
    )
    parser.add_argument(
        "--output_precision", help="precision for output alignments", type=int, required=False, default=3
    )
    parser.add_argument(
        "--silence_dur_threshold", help="threshold for inserting silence", type=float, required=False, default=0.1
    )
    # main() reads this module-level namespace directly.
    args = parser.parse_args()
    main()
|