|
import librosa |
|
import numpy |
|
import scipy |
|
import soundfile |
|
|
|
from espnet.utils.io_utils import SoundHDF5File |
|
|
|
|
|
class SpeedPerturbation(object):
    """Change the speed of a waveform by resampling it.

    The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
    and sox-speed just resamples the input,
    i.e. pitch and tempo are both changed.

    "Why use speed option instead of tempo -s in SoX for speed perturbation"
    https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8

    Warning:
        This function is very slow because of resampling.
        I recommend applying speed-perturb outside the training using sox.

    Args:
        lower: Lower bound of the uniformly sampled speed ratio.
        upper: Upper bound of the uniformly sampled speed ratio.
        utt2ratio: Optional path of a text file holding one
            "<uttid> <ratio>" pair per line. When given, the ratio is looked
            up by utterance id and ``lower``/``upper`` are set to None.
        keep_length: If True, the output is center-cropped or zero-padded
            along the first axis so that it has the length of the input.
        res_type: Resampling method forwarded to ``librosa.resample``.
        seed: Seed of the private ``numpy.random.RandomState``.
    """

    def __init__(
        self,
        lower=0.9,
        upper=1.1,
        utt2ratio=None,
        keep_length=True,
        res_type="kaiser_best",
        seed=None,
    ):
        self.res_type = res_type
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)

        if utt2ratio is not None:
            self.utt2ratio = {}
            # Use the scheduled ratio for each utterance
            self.utt2ratio_file = utt2ratio
            self.lower = None
            self.upper = None
            self.accept_uttid = True

            with open(utt2ratio, "r") as f:
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    utt, ratio = line.rstrip().split(None, 1)
                    self.utt2ratio[utt] = float(ratio)
        else:
            self.utt2ratio = None
            # The ratio is drawn randomly at call time
            self.lower = lower
            self.upper = upper

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, keep_length={}, res_type={})".format(
                self.__class__.__name__,
                self.lower,
                self.upper,
                self.keep_length,
                self.res_type,
            )
        else:
            return "{}({}, res_type={})".format(
                self.__class__.__name__, self.utt2ratio_file, self.res_type
            )

    def __call__(self, x, uttid=None, train=True):
        """Return the speed-perturbed copy of ``x``.

        Args:
            x: Input waveform (numpy array); it is cast to float32.
            uttid: Utterance id, required when a ratio file was given.
            train: If False, ``x`` is returned untouched.

        Returns:
            The resampled waveform, length-matched to ``x`` when
            ``keep_length`` is True.

        Raises:
            KeyError: If a ratio file was given and ``uttid`` is not in it.
        """
        if not train:
            return x

        x = x.astype(numpy.float32)
        # Branch on the table itself: `accept_uttid` only exists when a ratio
        # file was given, so reading it in random mode raised AttributeError.
        if self.utt2ratio is not None:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        # resample takes the input and output sampling rates, but only their
        # ratio matters. They are passed by keyword because recent librosa
        # releases no longer accept them positionally.
        y = librosa.resample(
            x, orig_sr=ratio, target_sr=1, res_type=self.res_type
        )

        if self.keep_length:
            # NOTE(review): the length fix-up below works on the first axis;
            # for multi-channel input confirm librosa resamples that axis too.
            diff = abs(len(x) - len(y))
            if len(y) > len(x):
                # Center-crop the surplus samples
                y = y[diff // 2 : -((diff + 1) // 2)]
            elif len(y) < len(x):
                # Assume the time-axis is the first: (Time, Channel)
                pad_width = [(diff // 2, (diff + 1) // 2)] + [
                    (0, 0) for _ in range(y.ndim - 1)
                ]
                y = numpy.pad(
                    y, pad_width=pad_width, constant_values=0, mode="constant"
                )
        return y
|
|
|
|
|
class BandpassPerturbation(object):
    """BandpassPerturbation

    Randomly dropout along the frequency axis.

    The original idea comes from the following:
        "randomly-selected frequency band was cut off under the constraint of
         leaving at least 1,000 Hz band within the range of less than 4,000Hz."
        (The Hitachi/JHU CHiME-5 system: Advances in speech recognition for
         everyday home environments using multiple microphone arrays;
         http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_kanda.pdf)

    Args:
        lower: Lower bound of the uniformly sampled mask threshold.
        upper: Upper bound of the uniformly sampled mask threshold.
        seed: Seed of the private ``numpy.random.RandomState``.
        axes: Axes of the input along which the mask varies; the mask is
            broadcast over every other axis. Negative values count from the
            last axis.
    """

    def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1,)):
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)
        # x_stft: (Time, Channel, Freq) or (Time, Freq)
        self.axes = axes

    def __repr__(self):
        return "{}(lower={}, upper={})".format(
            self.__class__.__name__, self.lower, self.upper
        )

    def __call__(self, x_stft, uttid=None, train=True):
        """Return ``x_stft`` with randomly selected bins zeroed.

        Args:
            x_stft: Time-frequency array with at least two dimensions.
            uttid: Unused; accepted for a uniform transform signature.
            train: If False, ``x_stft`` is returned untouched.

        Returns:
            A new array of the same shape; the input is not modified.

        Raises:
            RuntimeError: If ``x_stft`` is one dimensional.
        """
        if not train:
            return x_stft

        if x_stft.ndim == 1:
            raise RuntimeError(
                "Input in time-freq domain: (Time, Channel, Freq) or (Time, Freq)"
            )

        ratio = self.state.uniform(self.lower, self.upper)
        # A negative axis -k refers to position ndim - k, i.e. ndim + i.
        # (Subtracting i produced an out-of-range index, so the mask
        # degenerated to a single scalar for the default axes=(-1,).)
        axes = [i if i >= 0 else x_stft.ndim + i for i in self.axes]
        shape = [s if i in axes else 1 for i, s in enumerate(x_stft.shape)]

        # NOTE(review): the threshold is compared against standard-normal
        # samples, so `ratio` is not the dropped fraction itself -- confirm
        # whether uniform samples were intended.
        mask = self.state.randn(*shape) > ratio
        # Multiply out-of-place so the caller's array is left intact.
        return x_stft * mask
|
|
|
|
|
class VolumePerturbation(object):
    """Scale the amplitude of a waveform by a random or scheduled gain.

    Args:
        lower: Lower bound of the uniformly sampled gain.
        upper: Upper bound of the uniformly sampled gain.
        utt2ratio: Optional path of a text file holding one
            "<uttid> <ratio>" pair per line. When given, the gain is looked
            up by utterance id and ``lower``/``upper`` are set to None.
        dbunit: If True, the gain is interpreted in decibels and converted
            with ``10 ** (ratio / 20)``; otherwise it is used as a plain
            multiplier.
        seed: Seed of the private ``numpy.random.RandomState``.
    """

    def __init__(self, lower=-1.6, upper=1.6, utt2ratio=None, dbunit=True, seed=None):
        self.dbunit = dbunit
        self.utt2ratio_file = utt2ratio
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)

        if utt2ratio is not None:
            # Use the scheduled ratio for each utterance
            self.utt2ratio = {}
            self.lower = None
            self.upper = None
            self.accept_uttid = True

            with open(utt2ratio, "r") as f:
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    utt, ratio = line.rstrip().split(None, 1)
                    self.utt2ratio[utt] = float(ratio)
        else:
            # The ratio is drawn randomly at call time
            self.utt2ratio = None

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
                self.__class__.__name__, self.lower, self.upper, self.dbunit
            )
        else:
            return '{}("{}", dbunit={})'.format(
                self.__class__.__name__, self.utt2ratio_file, self.dbunit
            )

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` multiplied by the selected gain.

        Args:
            x: Input waveform (numpy array); it is cast to float32.
            uttid: Utterance id, required when a ratio file was given.
            train: If False, ``x`` is returned untouched.

        Raises:
            KeyError: If a ratio file was given and ``uttid`` is not in it.
        """
        if not train:
            return x

        x = x.astype(numpy.float32)

        # Branch on the table itself: `accept_uttid` only exists when a ratio
        # file was given, so reading it in random mode raised AttributeError.
        if self.utt2ratio is not None:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)
        if self.dbunit:
            # dB -> amplitude factor
            ratio = 10 ** (ratio / 20)
        return x * ratio
|
|
|
|
|
class NoiseInjection(object):
    """Add isotropic noise

    The noise is either white Gaussian noise or, when ``utt2noise`` is given,
    a recorded noise signal. It is scaled relative to the RMS of the input.

    Args:
        utt2noise: Optional path describing the noise signals. With
            ``filetype="list"`` it is a text file of "<uttid> <audio path>"
            lines; with ``filetype="sound.hdf5"`` it is opened with
            ``SoundHDF5File``.
        lower: Lower bound of the uniformly sampled noise level.
        upper: Upper bound of the uniformly sampled noise level.
        utt2ratio: Optional path of a text file holding one
            "<uttid> <level>" pair per line.
        filetype: "list" or "sound.hdf5".
        dbunit: If True, the level is interpreted in decibels and converted
            with ``10 ** (ratio / 20)``.
        seed: Seed of the private ``numpy.random.RandomState``.

    Raises:
        ValueError: If ``filetype`` is not supported.
        RuntimeError: If both tables are given and their utterance ids differ.
    """

    def __init__(
        self,
        utt2noise=None,
        lower=-20,
        upper=-5,
        utt2ratio=None,
        filetype="list",
        dbunit=True,
        seed=None,
    ):
        self.utt2noise_file = utt2noise
        self.utt2ratio_file = utt2ratio
        self.filetype = filetype
        self.dbunit = dbunit
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)

        if utt2ratio is not None:
            # Use the scheduled ratio for each utterance
            self.utt2ratio = {}
            # The levels live in the utt2ratio file (not in utt2noise).
            with open(utt2ratio, "r") as f:
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    utt, snr = line.rstrip().split(None, 1)
                    self.utt2ratio[utt] = float(snr)
        else:
            # The ratio is drawn randomly at call time
            self.utt2ratio = None

        if utt2noise is not None:
            self.utt2noise = {}
            if filetype == "list":
                with open(utt2noise, "r") as f:
                    for line in f:
                        utt, filename = line.rstrip().split(None, 1)
                        signal, rate = soundfile.read(filename, dtype="int16")
                        # Load all files in memory
                        self.utt2noise[utt] = (signal, rate)

            elif filetype == "sound.hdf5":
                self.utt2noise = SoundHDF5File(utt2noise, "r")
            else:
                raise ValueError(filetype)
        else:
            self.utt2noise = None

        if utt2noise is not None and utt2ratio is not None:
            if set(self.utt2ratio) != set(self.utt2noise):
                raise RuntimeError(
                    "The uttids mismatch between {} and {}".format(utt2ratio, utt2noise)
                )

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
                self.__class__.__name__, self.lower, self.upper, self.dbunit
            )
        else:
            return '{}("{}", dbunit={})'.format(
                self.__class__.__name__, self.utt2ratio_file, self.dbunit
            )

    def _pick_noise(self, uttid):
        """Return the RMS-normalised float32 noise for ``uttid`` (random if None)."""
        if uttid is None:
            # Draw an index rather than calling choice() on the
            # (signal, rate) tuples, which numpy cannot sample from.
            keys = list(self.utt2noise)
            uttid = keys[self.state.randint(len(keys))]
        # The sampling rate is not used: the noise is mixed as-is.
        noise, _ = self.utt2noise[uttid]

        # Work on a float copy: the stored signal may be int16, where squaring
        # overflows and in-place true division is rejected by numpy, and the
        # cached signal must not be altered between calls.
        noise = noise.astype(numpy.float32)
        rms = numpy.sqrt((noise ** 2).mean())
        if rms > 0:
            noise = noise / rms
        return noise

    def _fit_length(self, noise, length):
        """Crop or wrap-pad ``noise`` along its first axis to ``length`` samples."""
        diff = abs(length - len(noise))
        # diff + 1 keeps randint valid when the lengths already match.
        offset = self.state.randint(0, diff + 1)
        if len(noise) >= length:
            # Truncate noise
            return noise[offset : offset + length]
        # Repeat noise; only the time axis (the first one) is padded
        pad_width = [(offset, diff - offset)] + [(0, 0)] * (noise.ndim - 1)
        return numpy.pad(noise, pad_width=pad_width, mode="wrap")

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` with scaled noise added.

        Args:
            x: Input waveform (numpy array); it is cast to float32.
            uttid: Utterance id used to look up the level and/or the noise
                signal; when None both are chosen randomly.
            train: If False, ``x`` is returned untouched.
        """
        if not train:
            return x
        x = x.astype(numpy.float32)

        # 1. Get ratio of noise to signal in sound pressure level
        if uttid is not None and self.utt2ratio is not None:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        if self.dbunit:
            ratio = 10 ** (ratio / 20)
        scale = ratio * numpy.sqrt((x ** 2).mean())

        # 2. Get noise
        if self.utt2noise is not None:
            noise = self._fit_length(self._pick_noise(uttid), len(x))
        else:
            # Generate white noise
            noise = self.state.normal(0, 1, x.shape)

        # 3. Add noise to signal
        return x + noise * scale
|
|
|
|
|
class RIRConvolve(object):
    """Convolve a waveform with a room impulse response (RIR).

    Args:
        utt2rir: Path describing the impulse responses. With
            ``filetype="list"`` it is a text file of "<uttid> <audio path>"
            lines; with ``filetype="sound.hdf5"`` it is opened with
            ``SoundHDF5File``.
        filetype: "list" or "sound.hdf5".

    Raises:
        NotImplementedError: If ``filetype`` is not supported.
    """

    def __init__(self, utt2rir, filetype="list"):
        self.utt2rir_file = utt2rir
        self.filetype = filetype

        self.utt2rir = {}
        if filetype == "list":
            with open(utt2rir, "r") as f:
                for line in f:
                    utt, filename = line.rstrip().split(None, 1)
                    signal, rate = soundfile.read(filename, dtype="int16")
                    # Load all files in memory
                    self.utt2rir[utt] = (signal, rate)

        elif filetype == "sound.hdf5":
            self.utt2rir = SoundHDF5File(utt2rir, "r")
        else:
            raise NotImplementedError(filetype)

    def __repr__(self):
        return '{}("{}")'.format(self.__class__.__name__, self.utt2rir_file)

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` convolved with the impulse response of ``uttid``.

        Args:
            x: One dimensional input waveform; it is cast to float32.
            uttid: Utterance id used to look up the impulse response.
            train: If False, ``x`` is returned untouched.

        Returns:
            A (Time,) array for a one dimensional RIR, or a (Time, Channel)
            array for a two dimensional one.

        Raises:
            RuntimeError: If ``x`` is not one dimensional.
        """
        if not train:
            return x

        x = x.astype(numpy.float32)

        if x.ndim != 1:
            # Must be single channel
            raise RuntimeError(
                "Input x must be one dimensional array, but got {}".format(x.shape)
            )

        # The sampling rate of the RIR is not used here.
        rir, rate = self.utt2rir[uttid]
        if rir.ndim == 2:
            # soundfile.read yields multi-channel audio as (frames, channels),
            # so iterate the transpose to get one impulse response per
            # channel (iterating `rir` itself would walk over time frames).
            # numpy.convolve is used directly: scipy.convolve was only a
            # deprecated alias of it.
            # return [Time, Channel]
            return numpy.stack(
                [numpy.convolve(x, r, mode="same") for r in rir.T], axis=-1
            )
        else:
            return numpy.convolve(x, rir, mode="same")
|
|