|
import copy |
|
from contextlib import contextmanager |
|
from inspect import signature |
|
from typing import List |
|
|
|
import numpy as np |
|
import torch |
|
from flatten_dict import flatten |
|
from flatten_dict import unflatten |
|
from numpy.random import RandomState |
|
|
|
from .. import ml |
|
from ..core import AudioSignal |
|
from ..core import util |
|
from .datasets import AudioLoader |
|
|
|
tt = torch.tensor |
|
"""Shorthand for converting things to torch.tensor.""" |
|
|
|
|
|
class BaseTransform: |
|
"""This is the base class for all transforms that are implemented |
|
in this library. Transforms have two main operations: ``transform`` |
|
and ``instantiate``. |
|
|
|
``instantiate`` sets the parameters randomly |
|
from distribution tuples for each parameter. For example, for the |
|
``BackgroundNoise`` transform, the signal-to-noise ratio (``snr``) |
|
    is chosen randomly by ``instantiate``. By default, it is chosen uniformly
|
between 10.0 and 30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``). |
|
|
|
``transform`` applies the transform using the instantiated parameters. |
|
A simple example is as follows: |
|
|
|
>>> seed = 0 |
|
>>> signal = ... |
|
    >>> transform = transforms.NoiseFloor(db=("uniform", -50.0, -30.0))
|
>>> kwargs = transform.instantiate() |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
    By separating the instantiation of parameters from the actual audio
    processing of the transform, we make things more reproducible, while
    also allowing the transform to be applied efficiently to batches of
    data on GPU, rather than to individual audio samples.
|
|
|
.. note:: |
|
We call ``signal.clone()`` for the input to the ``transform`` function |
|
because signals are modified in-place! If you don't clone the signal, |
|
you will lose the original data. |
|
|
|
Parameters |
|
---------- |
|
keys : list, optional |
|
Keys that the transform looks for when |
|
calling ``self.transform``, by default []. In general this is |
|
set automatically, and you won't need to manipulate this argument. |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
|
|
Examples |
|
-------- |
|
|
|
>>> seed = 0 |
|
>>> |
|
>>> audio_path = "tests/audio/spk/f10_script4_produced.wav" |
|
>>> signal = AudioSignal(audio_path, offset=10, duration=2) |
|
>>> transform = tfm.Compose( |
|
>>> [ |
|
>>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]), |
|
>>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]), |
|
>>> ], |
|
>>> ) |
|
>>> |
|
>>> kwargs = transform.instantiate(seed, signal) |
|
>>> output = transform(signal, **kwargs) |
|
|
|
""" |
|
|
|
def __init__(self, keys: list = [], name: str = None, prob: float = 1.0): |
|
|
|
        # Infer the keys this transform expects from the keyword arguments
        # of ``self._transform``, ignoring ``signal`` and ``kwargs``.
        tfm_keys = list(signature(self._transform).parameters.keys())
        ignore_keys = ["signal", "kwargs"]
        tfm_keys = [k for k in tfm_keys if k not in ignore_keys]
|
|
|
|
|
|
|
self.keys = keys + tfm_keys + ["mask"] |
|
|
|
self.prob = prob |
|
|
|
if name is None: |
|
name = self.__class__.__name__ |
|
self.name = name |
|
|
|
def _prepare(self, batch: dict): |
|
sub_batch = batch[self.name] |
|
|
|
for k in self.keys: |
|
assert k in sub_batch.keys(), f"{k} not in batch" |
|
|
|
return sub_batch |
|
|
|
def _transform(self, signal): |
|
return signal |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
return {} |
|
|
|
@staticmethod |
|
def apply_mask(batch: dict, mask: torch.Tensor): |
|
"""Applies a mask to the batch. |
|
|
|
Parameters |
|
---------- |
|
batch : dict |
|
Batch whose values will be masked in the ``transform`` pass. |
|
mask : torch.Tensor |
|
Mask to apply to batch. |
|
|
|
Returns |
|
------- |
|
dict |
|
A dictionary that contains values only where ``mask = True``. |
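
        Examples
        --------
        A minimal sketch; the transform name and values here are hypothetical:

        >>> batch = {"VolumeChange": {"db": torch.tensor([0.0, -6.0, -12.0])}}
        >>> mask = torch.tensor([True, False, True])
        >>> masked = BaseTransform.apply_mask(batch, mask)
        >>> masked["VolumeChange"]["db"]  # keeps only the 1st and 3rd entries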
|
""" |
|
masked_batch = {k: v[mask] for k, v in flatten(batch).items()} |
|
return unflatten(masked_batch) |
|
|
|
def transform(self, signal: AudioSignal, **kwargs): |
|
"""Apply the transform to the audio signal, |
|
with given keyword arguments. |
|
|
|
Parameters |
|
---------- |
|
signal : AudioSignal |
|
Signal that will be modified by the transforms in-place. |
|
kwargs: dict |
|
Keyword arguments to the specific transforms ``self._transform`` |
|
function. |
|
|
|
Returns |
|
------- |
|
AudioSignal |
|
Transformed AudioSignal. |
|
|
|
Examples |
|
-------- |
|
|
|
>>> for seed in range(10): |
|
>>> kwargs = transform.instantiate(seed, signal) |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
""" |
|
tfm_kwargs = self._prepare(kwargs) |
|
mask = tfm_kwargs["mask"] |
|
|
|
if torch.any(mask): |
|
tfm_kwargs = self.apply_mask(tfm_kwargs, mask) |
|
tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"} |
|
signal[mask] = self._transform(signal[mask], **tfm_kwargs) |
|
|
|
return signal |
|
|
|
def __call__(self, *args, **kwargs): |
|
return self.transform(*args, **kwargs) |
|
|
|
def instantiate( |
|
self, |
|
state: RandomState = None, |
|
signal: AudioSignal = None, |
|
): |
|
"""Instantiates parameters for the transform. |
|
|
|
Parameters |
|
---------- |
|
state : RandomState, optional |
|
            Random state, integer seed, or None, used to sample parameters
            (converted via ``util.random_state``), by default None
|
signal : AudioSignal, optional |
|
            AudioSignal to pass to ``self._instantiate`` if it is needed
            for this transform, by default None
|
|
|
Returns |
|
------- |
|
dict |
|
Dictionary containing instantiated arguments for every keyword |
|
argument to ``self._transform``. |
|
|
|
Examples |
|
-------- |
|
|
|
>>> for seed in range(10): |
|
>>> kwargs = transform.instantiate(seed, signal) |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
""" |
|
state = util.random_state(state) |
|
|
|
|
|
|
|
|
|
|
|
needs_signal = "signal" in set(signature(self._instantiate).parameters.keys()) |
|
kwargs = {} |
|
if needs_signal: |
|
kwargs = {"signal": signal} |
|
|
|
|
|
params = self._instantiate(state, **kwargs) |
|
        # Leave rich objects as-is; convert plain values to tensors.
        for k, v in params.items():
            if not isinstance(v, (AudioSignal, torch.Tensor, dict)):
                params[k] = tt(v)
|
        # Apply this transform to the item with probability ``self.prob``.
        mask = state.rand() <= self.prob
        params["mask"] = tt(mask)
|
|
|
|
|
|
|
|
|
params = {self.name: params} |
|
|
|
return params |
|
|
|
def batch_instantiate( |
|
self, |
|
states: list = None, |
|
signal: AudioSignal = None, |
|
): |
|
"""Instantiates arguments for every item in a batch, |
|
given a list of states. Each state in the list |
|
corresponds to one item in the batch. |
|
|
|
Parameters |
|
---------- |
|
states : list, optional |
|
List of states, by default None |
|
signal : AudioSignal, optional |
|
AudioSignal to pass to the ``self.instantiate`` section |
|
if it is needed for this transform, by default None |
|
|
|
Returns |
|
------- |
|
dict |
|
Collated dictionary of arguments. |
|
|
|
Examples |
|
-------- |
|
|
|
>>> batch_size = 4 |
|
>>> signal = AudioSignal(audio_path, offset=10, duration=2) |
|
>>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)]) |
|
>>> |
|
    >>> states = [seed + idx for idx in range(batch_size)]
|
>>> kwargs = transform.batch_instantiate(states, signal_batch) |
|
>>> batch_output = transform(signal_batch, **kwargs) |
|
""" |
|
kwargs = [] |
|
for state in states: |
|
kwargs.append(self.instantiate(state, signal)) |
|
kwargs = util.collate(kwargs) |
|
return kwargs |
|
|
|
|
|
class Identity(BaseTransform): |
|
"""This transform just returns the original signal.""" |
|
|
|
pass |
|
|
|
|
|
class SpectralTransform(BaseTransform): |
|
"""Spectral transforms require STFT data to exist, since manipulations |
|
of the STFT require the spectrogram. This just calls ``stft`` before |
|
the transform is called, and calls ``istft`` after the transform is |
|
called so that the audio data is written to after the spectral |
|
manipulation. |
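
    A usage sketch, with ``ShiftPhase`` standing in for any spectral
    transform:

    >>> transform = tfm.ShiftPhase()
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)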
|
""" |
|
|
|
def transform(self, signal, **kwargs): |
|
signal.stft() |
|
super().transform(signal, **kwargs) |
|
signal.istft() |
|
return signal |
|
|
|
|
|
class Compose(BaseTransform): |
|
"""Compose applies transforms in sequence, one after the other. The |
|
transforms are passed in as positional arguments or as a list like so: |
|
|
|
>>> transform = tfm.Compose( |
|
>>> [ |
|
>>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]), |
|
>>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]), |
|
>>> ], |
|
>>> ) |
|
|
|
    This will convolve the signal with a room impulse response, and then
    add background noise to the signal. Calling ``instantiate`` instantiates
    the parameters of every transform in the list, so the interface for
    using ``Compose`` is the same as for any other transform:
|
|
|
>>> kwargs = transform.instantiate() |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
    Under the hood, ``Compose`` maps each transform to a unique name of
    the form ``{position}.{name}``, where ``position`` is the index of the
    transform in the list. ``Compose`` transforms can nest within other
    ``Compose`` transforms, like so:
|
|
|
>>> preprocess = transforms.Compose( |
|
>>> tfm.GlobalVolumeNorm(), |
|
>>> tfm.CrossTalk(), |
|
>>> name="preprocess", |
|
>>> ) |
|
>>> augment = transforms.Compose( |
|
>>> tfm.RoomImpulseResponse(), |
|
>>> tfm.BackgroundNoise(), |
|
>>> name="augment", |
|
>>> ) |
|
>>> postprocess = transforms.Compose( |
|
>>> tfm.VolumeChange(), |
|
>>> tfm.RescaleAudio(), |
|
>>> tfm.ShiftPhase(), |
|
>>> name="postprocess", |
|
>>> ) |
|
    >>> transform = transforms.Compose(preprocess, augment, postprocess)
|
|
|
This defines 3 composed transforms, and then composes them in sequence |
|
with one another. |
|
|
|
Parameters |
|
---------- |
|
*transforms : list |
|
List of transforms to apply |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__(self, *transforms: list, name: str = None, prob: float = 1.0): |
|
if isinstance(transforms[0], list): |
|
transforms = transforms[0] |
|
|
|
for i, tfm in enumerate(transforms): |
|
tfm.name = f"{i}.{tfm.name}" |
|
|
|
keys = [tfm.name for tfm in transforms] |
|
super().__init__(keys=keys, name=name, prob=prob) |
|
|
|
self.transforms = transforms |
|
self.transforms_to_apply = keys |
|
|
|
@contextmanager |
|
def filter(self, *names: list): |
|
"""This can be used to skip transforms entirely when applying |
|
the sequence of transforms to a signal. For example, take |
|
the following transforms with the names ``preprocess, augment, postprocess``. |
|
|
|
>>> preprocess = transforms.Compose( |
|
>>> tfm.GlobalVolumeNorm(), |
|
>>> tfm.CrossTalk(), |
|
>>> name="preprocess", |
|
>>> ) |
|
>>> augment = transforms.Compose( |
|
>>> tfm.RoomImpulseResponse(), |
|
>>> tfm.BackgroundNoise(), |
|
>>> name="augment", |
|
>>> ) |
|
>>> postprocess = transforms.Compose( |
|
>>> tfm.VolumeChange(), |
|
>>> tfm.RescaleAudio(), |
|
>>> tfm.ShiftPhase(), |
|
>>> name="postprocess", |
|
>>> ) |
|
>>> transform = transforms.Compose(preprocess, augment, postprocess) |
|
|
|
If we wanted to apply all 3 to a signal, we do: |
|
|
|
>>> kwargs = transform.instantiate() |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
But if we only wanted to apply the ``preprocess`` and ``postprocess`` |
|
transforms to the signal, we do: |
|
|
|
>>> with transform_fn.filter("preprocess", "postprocess"): |
|
>>> output = transform(signal.clone(), **kwargs) |
|
|
|
Parameters |
|
---------- |
|
*names : list |
|
List of transforms, identified by name, to apply to signal. |
|
""" |
|
        old_transforms = self.transforms_to_apply
        self.transforms_to_apply = names
        try:
            yield
        finally:
            # Restore the full list even if the body raises.
            self.transforms_to_apply = old_transforms
|
|
|
def _transform(self, signal, **kwargs): |
|
for transform in self.transforms: |
|
            if any(x in transform.name for x in self.transforms_to_apply):
|
signal = transform(signal, **kwargs) |
|
return signal |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
parameters = {} |
|
for transform in self.transforms: |
|
parameters.update(transform.instantiate(state, signal=signal)) |
|
return parameters |
|
|
|
def __getitem__(self, idx): |
|
return self.transforms[idx] |
|
|
|
def __len__(self): |
|
return len(self.transforms) |
|
|
|
def __iter__(self): |
|
for transform in self.transforms: |
|
yield transform |
|
|
|
|
|
class Choose(Compose): |
|
"""Choose logic is the same as :py:func:`audiotools.data.transforms.Compose`, |
|
but instead of applying all the transforms in sequence, it applies just a single transform, |
|
which is chosen for each item in the batch. |
|
|
|
Parameters |
|
---------- |
|
*transforms : list |
|
List of transforms to apply |
|
weights : list |
|
Probability of choosing any specific transform. |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
|
|
Examples |
|
-------- |
|
|
|
>>> transforms.Choose(tfm.LowPass(), tfm.HighPass()) |
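    >>>
    >>> # Optionally, weight the choice; here, LowPass is chosen ~90% of the time.
    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass(), weights=[0.9, 0.1])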
|
""" |
|
|
|
def __init__( |
|
self, |
|
*transforms: list, |
|
weights: list = None, |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(*transforms, name=name, prob=prob) |
|
|
|
if weights is None: |
|
_len = len(self.transforms) |
|
weights = [1 / _len for _ in range(_len)] |
|
self.weights = np.array(weights) |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
kwargs = super()._instantiate(state, signal) |
|
tfm_idx = list(range(len(self.transforms))) |
|
tfm_idx = state.choice(tfm_idx, p=self.weights) |
|
        # Keep the mask on only for the chosen transform, so the per-transform
        # masks form a one-hot encoding of the choice.
        one_hot = []
        for i, t in enumerate(self.transforms):
            mask = kwargs[t.name]["mask"]
            if mask.item():
                kwargs[t.name]["mask"] = tt(i == tfm_idx)
            one_hot.append(kwargs[t.name]["mask"])
        kwargs["one_hot"] = one_hot
|
return kwargs |
|
|
|
|
|
class Repeat(Compose): |
|
"""Repeatedly applies a given transform ``n_repeat`` times." |
|
|
|
Parameters |
|
---------- |
|
transform : BaseTransform |
|
Transform to repeat. |
|
n_repeat : int, optional |
|
Number of times to repeat transform, by default 1 |
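
    Examples
    --------
    A usage sketch, applying a random volume change three times in a row:

    >>> transform = tfm.Repeat(tfm.VolumeChange(), n_repeat=3)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)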
|
""" |
|
|
|
def __init__( |
|
self, |
|
transform, |
|
n_repeat: int = 1, |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
transforms = [copy.copy(transform) for _ in range(n_repeat)] |
|
super().__init__(transforms, name=name, prob=prob) |
|
|
|
self.n_repeat = n_repeat |
|
|
|
|
|
class RepeatUpTo(Choose): |
|
"""Repeatedly applies a given transform up to ``max_repeat`` times." |
|
|
|
Parameters |
|
---------- |
|
transform : BaseTransform |
|
Transform to repeat. |
|
max_repeat : int, optional |
|
Max number of times to repeat transform, by default 1 |
|
weights : list |
|
Probability of choosing any specific number up to ``max_repeat``. |
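
    Examples
    --------
    A usage sketch; the number of repeats is sampled at instantiation time:

    >>> transform = tfm.RepeatUpTo(tfm.VolumeChange(), max_repeat=5)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)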
|
""" |
|
|
|
def __init__( |
|
self, |
|
transform, |
|
max_repeat: int = 5, |
|
weights: list = None, |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
        transforms = []
        # Each candidate repeats the transform between 1 and ``max_repeat - 1`` times.
        for n in range(1, max_repeat):
            transforms.append(Repeat(transform, n_repeat=n))
|
super().__init__(transforms, name=name, prob=prob, weights=weights) |
|
|
|
self.max_repeat = max_repeat |
|
|
|
|
|
class ClippingDistortion(BaseTransform): |
|
"""Adds clipping distortion to signal. Corresponds |
|
to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`. |
|
|
|
Parameters |
|
---------- |
|
perc : tuple, optional |
|
        Clipping percentile. Values are between 0.0 and 1.0.
|
Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
perc: tuple = ("uniform", 0.0, 0.1), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.perc = perc |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"perc": util.sample_from_dist(self.perc, state)} |
|
|
|
def _transform(self, signal, perc): |
|
return signal.clip_distortion(perc) |
|
|
|
|
|
class Equalizer(BaseTransform): |
|
"""Applies an equalization curve to the audio signal. Corresponds |
|
to :py:func:`audiotools.core.effects.EffectMixin.equalizer`. |
|
|
|
Parameters |
|
---------- |
|
eq_amount : tuple, optional |
|
The maximum dB cut to apply to the audio in any band, |
|
by default ("const", 1.0 dB) |
|
n_bands : int, optional |
|
Number of bands in EQ, by default 6 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
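
    Examples
    --------
    A usage sketch, cutting each of 6 bands by up to 3 dB:

    >>> transform = tfm.Equalizer(eq_amount=("const", 3.0), n_bands=6)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)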
|
""" |
|
|
|
def __init__( |
|
self, |
|
eq_amount: tuple = ("const", 1.0), |
|
n_bands: int = 6, |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.eq_amount = eq_amount |
|
self.n_bands = n_bands |
|
|
|
def _instantiate(self, state: RandomState): |
|
eq_amount = util.sample_from_dist(self.eq_amount, state) |
|
eq = -eq_amount * state.rand(self.n_bands) |
|
return {"eq": eq} |
|
|
|
def _transform(self, signal, eq): |
|
return signal.equalizer(eq) |
|
|
|
|
|
class Quantization(BaseTransform): |
|
"""Applies quantization to the input waveform. Corresponds |
|
to :py:func:`audiotools.core.effects.EffectMixin.quantization`. |
|
|
|
Parameters |
|
---------- |
|
channels : tuple, optional |
|
Number of evenly spaced quantization channels to quantize |
|
to, by default ("choice", [8, 32, 128, 256, 1024]) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
channels: tuple = ("choice", [8, 32, 128, 256, 1024]), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.channels = channels |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"channels": util.sample_from_dist(self.channels, state)} |
|
|
|
def _transform(self, signal, channels): |
|
return signal.quantization(channels) |
|
|
|
|
|
class MuLawQuantization(BaseTransform): |
|
"""Applies mu-law quantization to the input waveform. Corresponds |
|
to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`. |
|
|
|
Parameters |
|
---------- |
|
channels : tuple, optional |
|
Number of mu-law spaced quantization channels to quantize |
|
to, by default ("choice", [8, 32, 128, 256, 1024]) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
channels: tuple = ("choice", [8, 32, 128, 256, 1024]), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.channels = channels |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"channels": util.sample_from_dist(self.channels, state)} |
|
|
|
def _transform(self, signal, channels): |
|
return signal.mulaw_quantization(channels) |
|
|
|
|
|
class NoiseFloor(BaseTransform): |
|
"""Adds a noise floor of Gaussian noise to the signal at a specified |
|
dB. |
|
|
|
Parameters |
|
---------- |
|
db : tuple, optional |
|
Level of noise to add to signal, by default ("const", -50.0) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
db: tuple = ("const", -50.0), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.db = db |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
db = util.sample_from_dist(self.db, state) |
|
audio_data = state.randn(signal.num_channels, signal.signal_length) |
|
nz_signal = AudioSignal(audio_data, signal.sample_rate) |
|
nz_signal.normalize(db) |
|
return {"nz_signal": nz_signal} |
|
|
|
def _transform(self, signal, nz_signal): |
|
|
|
|
|
return signal + nz_signal |
|
|
|
|
|
class BackgroundNoise(BaseTransform): |
|
"""Adds background noise from audio specified by a set of CSV files. |
|
    A valid CSV file looks like the following, and is typically generated by
|
:py:func:`audiotools.data.preprocess.create_csv`: |
|
|
|
.. csv-table:: |
|
:header: path |
|
|
|
room_tone/m6_script2_clean.wav |
|
room_tone/m6_script2_cleanraw.wav |
|
room_tone/m6_script2_ipad_balcony1.wav |
|
room_tone/m6_script2_ipad_bedroom1.wav |
|
room_tone/m6_script2_ipad_confroom1.wav |
|
room_tone/m6_script2_ipad_confroom2.wav |
|
room_tone/m6_script2_ipad_livingroom1.wav |
|
room_tone/m6_script2_ipad_office1.wav |
|
|
|
.. note:: |
|
All paths are relative to an environment variable called ``PATH_TO_DATA``, |
|
so that CSV files are portable across machines where data may be |
|
located in different places. |
|
|
|
This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix` |
|
and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the |
|
hood. |
|
|
|
Parameters |
|
---------- |
|
snr : tuple, optional |
|
Signal-to-noise ratio, by default ("uniform", 10.0, 30.0) |
|
sources : List[str], optional |
|
Sources containing folders, or CSVs with paths to audio files, |
|
by default None |
|
weights : List[float], optional |
|
Weights to sample audio files from each source, by default None |
|
eq_amount : tuple, optional |
|
Amount of equalization to apply, by default ("const", 1.0) |
|
n_bands : int, optional |
|
Number of bands in equalizer, by default 3 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
loudness_cutoff : float, optional |
|
Loudness cutoff when loading from audio files, by default None |
|
""" |
|
|
|
def __init__( |
|
self, |
|
snr: tuple = ("uniform", 10.0, 30.0), |
|
sources: List[str] = None, |
|
weights: List[float] = None, |
|
eq_amount: tuple = ("const", 1.0), |
|
n_bands: int = 3, |
|
name: str = None, |
|
prob: float = 1.0, |
|
loudness_cutoff: float = None, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.snr = snr |
|
self.eq_amount = eq_amount |
|
self.n_bands = n_bands |
|
self.loader = AudioLoader(sources, weights) |
|
self.loudness_cutoff = loudness_cutoff |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
eq_amount = util.sample_from_dist(self.eq_amount, state) |
|
eq = -eq_amount * state.rand(self.n_bands) |
|
snr = util.sample_from_dist(self.snr, state) |
|
|
|
bg_signal = self.loader( |
|
state, |
|
signal.sample_rate, |
|
duration=signal.signal_duration, |
|
loudness_cutoff=self.loudness_cutoff, |
|
num_channels=signal.num_channels, |
|
)["signal"] |
|
|
|
return {"eq": eq, "bg_signal": bg_signal, "snr": snr} |
|
|
|
def _transform(self, signal, bg_signal, snr, eq): |
|
|
|
|
|
return signal.mix(bg_signal.clone(), snr, eq) |
|
|
|
|
|
class CrossTalk(BaseTransform): |
|
"""Adds crosstalk between speakers, whose audio is drawn from a CSV file |
|
that was produced via :py:func:`audiotools.data.preprocess.create_csv`. |
|
|
|
This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix` |
|
under the hood. |
|
|
|
Parameters |
|
---------- |
|
snr : tuple, optional |
|
        How loud the cross-talk speaker is relative to the original signal, in dB,
|
by default ("uniform", 0.0, 10.0) |
|
sources : List[str], optional |
|
Sources containing folders, or CSVs with paths to audio files, |
|
by default None |
|
weights : List[float], optional |
|
Weights to sample audio files from each source, by default None |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
loudness_cutoff : float, optional |
|
Loudness cutoff when loading from audio files, by default -40 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
snr: tuple = ("uniform", 0.0, 10.0), |
|
sources: List[str] = None, |
|
weights: List[float] = None, |
|
name: str = None, |
|
prob: float = 1.0, |
|
loudness_cutoff: float = -40, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.snr = snr |
|
self.loader = AudioLoader(sources, weights) |
|
self.loudness_cutoff = loudness_cutoff |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
snr = util.sample_from_dist(self.snr, state) |
|
crosstalk_signal = self.loader( |
|
state, |
|
signal.sample_rate, |
|
duration=signal.signal_duration, |
|
loudness_cutoff=self.loudness_cutoff, |
|
num_channels=signal.num_channels, |
|
)["signal"] |
|
|
|
return {"crosstalk_signal": crosstalk_signal, "snr": snr} |
|
|
|
def _transform(self, signal, crosstalk_signal, snr): |
|
|
|
|
|
        # Mix in the cross-talk signal, then restore the original loudness.
        loudness = signal.loudness()
|
mix = signal.mix(crosstalk_signal.clone(), snr) |
|
mix.normalize(loudness) |
|
return mix |
|
|
|
|
|
class RoomImpulseResponse(BaseTransform): |
|
"""Convolves signal with a room impulse response, at a specified |
|
direct-to-reverberant ratio, with equalization applied. Room impulse |
|
response data is drawn from a CSV file that was produced via |
|
:py:func:`audiotools.data.preprocess.create_csv`. |
|
|
|
This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir` |
|
under the hood. |
|
|
|
Parameters |
|
---------- |
|
drr : tuple, optional |
|
_description_, by default ("uniform", 0.0, 30.0) |
|
sources : List[str], optional |
|
Sources containing folders, or CSVs with paths to audio files, |
|
by default None |
|
weights : List[float], optional |
|
Weights to sample audio files from each source, by default None |
|
eq_amount : tuple, optional |
|
Amount of equalization to apply, by default ("const", 1.0) |
|
n_bands : int, optional |
|
Number of bands in equalizer, by default 6 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
use_original_phase : bool, optional |
|
Whether or not to use the original phase, by default False |
|
offset : float, optional |
|
Offset from each impulse response file to use, by default 0.0 |
|
duration : float, optional |
|
Duration of each impulse response, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
drr: tuple = ("uniform", 0.0, 30.0), |
|
sources: List[str] = None, |
|
weights: List[float] = None, |
|
eq_amount: tuple = ("const", 1.0), |
|
n_bands: int = 6, |
|
name: str = None, |
|
prob: float = 1.0, |
|
use_original_phase: bool = False, |
|
offset: float = 0.0, |
|
duration: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.drr = drr |
|
self.eq_amount = eq_amount |
|
self.n_bands = n_bands |
|
self.use_original_phase = use_original_phase |
|
|
|
self.loader = AudioLoader(sources, weights) |
|
self.offset = offset |
|
self.duration = duration |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
eq_amount = util.sample_from_dist(self.eq_amount, state) |
|
eq = -eq_amount * state.rand(self.n_bands) |
|
drr = util.sample_from_dist(self.drr, state) |
|
|
|
ir_signal = self.loader( |
|
state, |
|
signal.sample_rate, |
|
offset=self.offset, |
|
duration=self.duration, |
|
loudness_cutoff=None, |
|
num_channels=signal.num_channels, |
|
)["signal"] |
|
        # Pad the impulse response to 1 second (``sample_rate`` samples).
        ir_signal.zero_pad_to(signal.sample_rate)
|
|
|
return {"eq": eq, "ir_signal": ir_signal, "drr": drr} |
|
|
|
def _transform(self, signal, ir_signal, drr, eq): |
|
|
|
|
|
return signal.apply_ir( |
|
ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase |
|
) |
|
|
|
|
|
class VolumeChange(BaseTransform): |
|
"""Changes the volume of the input signal. |
|
|
|
Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`. |
|
|
|
Parameters |
|
---------- |
|
db : tuple, optional |
|
Change in volume in decibels, by default ("uniform", -12.0, 0.0) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
db: tuple = ("uniform", -12.0, 0.0), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.db = db |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"db": util.sample_from_dist(self.db, state)} |
|
|
|
def _transform(self, signal, db): |
|
return signal.volume_change(db) |
|
|
|
|
|
class VolumeNorm(BaseTransform): |
|
"""Normalizes the volume of the excerpt to a specified decibel. |
|
|
|
Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`. |
|
|
|
Parameters |
|
---------- |
|
db : tuple, optional |
|
dB to normalize signal to, by default ("const", -24) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
db: tuple = ("const", -24), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.db = db |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"db": util.sample_from_dist(self.db, state)} |
|
|
|
def _transform(self, signal, db): |
|
return signal.normalize(db) |
|
|
|
|
|
class GlobalVolumeNorm(BaseTransform): |
|
"""Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this |
|
transform also normalizes the volume of a signal, but it uses |
|
the volume of the entire audio file the loaded excerpt comes from, |
|
rather than the volume of just the excerpt. The volume of the |
|
entire audio file is expected in ``signal.metadata["loudness"]``. |
|
If loading audio from a CSV generated by :py:func:`audiotools.data.preprocess.create_csv` |
|
with ``loudness = True``, like the following: |
|
|
|
.. csv-table:: |
|
:header: path,loudness |
|
|
|
daps/produced/f1_script1_produced.wav,-16.299999237060547 |
|
daps/produced/f1_script2_produced.wav,-16.600000381469727 |
|
daps/produced/f1_script3_produced.wav,-17.299999237060547 |
|
daps/produced/f1_script4_produced.wav,-16.100000381469727 |
|
daps/produced/f1_script5_produced.wav,-16.700000762939453 |
|
daps/produced/f3_script1_produced.wav,-16.5 |
|
|
|
The ``AudioLoader`` will automatically load the loudness column into |
|
the metadata of the signal. |
|
|
|
Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`. |
|
|
|
Parameters |
|
---------- |
|
db : tuple, optional |
|
dB to normalize signal to, by default ("const", -24) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
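
    Examples
    --------
    A sketch, assuming the loudness of the full source file is known
    (the value here is hypothetical):

    >>> signal.metadata["loudness"] = -16.3  # loudness of the whole file, in dB
    >>> transform = tfm.GlobalVolumeNorm(db=("const", -24))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)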
|
""" |
|
|
|
def __init__( |
|
self, |
|
db: tuple = ("const", -24), |
|
name: str = None, |
|
prob: float = 1.0, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.db = db |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
if "loudness" not in signal.metadata: |
|
db_change = 0.0 |
|
elif float(signal.metadata["loudness"]) == float("-inf"): |
|
db_change = 0.0 |
|
else: |
|
db = util.sample_from_dist(self.db, state) |
|
db_change = db - float(signal.metadata["loudness"]) |
|
|
|
return {"db": db_change} |
|
|
|
def _transform(self, signal, db): |
|
return signal.volume_change(db) |
|
|
|
|
|
class Silence(BaseTransform): |
|
"""Zeros out the signal with some probability. |
|
|
|
Parameters |
|
---------- |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 0.1 |
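
    Examples
    --------
    A usage sketch; with ``prob=0.1``, roughly 1 in 10 instantiations
    zeroes out the signal:

    >>> transform = tfm.Silence(prob=0.1)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)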
|
""" |
|
|
|
def __init__(self, name: str = None, prob: float = 0.1): |
|
super().__init__(name=name, prob=prob) |
|
|
|
def _transform(self, signal): |
|
        # Preserve the cached loudness of the original signal.
        _loudness = signal._loudness
|
signal = AudioSignal( |
|
torch.zeros_like(signal.audio_data), |
|
sample_rate=signal.sample_rate, |
|
stft_params=signal.stft_params, |
|
) |
|
|
|
|
|
signal._loudness = _loudness |
|
|
|
return signal |
|
|
|
|
|
class LowPass(BaseTransform): |
|
"""Applies a LowPass filter. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`. |
|
|
|
Parameters |
|
---------- |
|
cutoff : tuple, optional |
|
Cutoff frequency distribution, |
|
by default ``("choice", [4000, 8000, 16000])`` |
|
zeros : int, optional |
|
Number of zero-crossings in filter, argument to |
|
``julius.LowPassFilters``, by default 51 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
cutoff: tuple = ("choice", [4000, 8000, 16000]), |
|
zeros: int = 51, |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.cutoff = cutoff |
|
self.zeros = zeros |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"cutoff": util.sample_from_dist(self.cutoff, state)} |
|
|
|
def _transform(self, signal, cutoff): |
|
return signal.low_pass(cutoff, zeros=self.zeros) |
|
|
|
|
|
class HighPass(BaseTransform): |
|
"""Applies a HighPass filter. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`. |
|
|
|
Parameters |
|
---------- |
|
cutoff : tuple, optional |
|
Cutoff frequency distribution, |
|
by default ``("choice", [50, 100, 250, 500, 1000])`` |
|
zeros : int, optional |
|
Number of zero-crossings in filter, argument to |
|
``julius.LowPassFilters``, by default 51 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]), |
|
zeros: int = 51, |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.cutoff = cutoff |
|
self.zeros = zeros |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"cutoff": util.sample_from_dist(self.cutoff, state)} |
|
|
|
def _transform(self, signal, cutoff): |
|
return signal.high_pass(cutoff, zeros=self.zeros) |
|
|
|
|
|
class RescaleAudio(BaseTransform): |
|
"""Rescales the audio so it is in between ``-val`` and ``val`` |
|
only if the original audio exceeds those bounds. Useful if |
|
transforms have caused the audio to clip. |
|
|
|
Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`. |
|
|
|
Parameters |
|
---------- |
|
val : float, optional |
|
Max absolute value of signal, by default 1.0 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__(self, val: float = 1.0, name: str = None, prob: float = 1): |
|
super().__init__(name=name, prob=prob) |
|
|
|
self.val = val |
|
|
|
def _transform(self, signal): |
|
return signal.ensure_max_of_audio(self.val) |
|
|
|
|
|
class ShiftPhase(SpectralTransform): |
|
"""Shifts the phase of the audio. |
|
|
|
    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.
|
|
|
Parameters |
|
---------- |
|
shift : tuple, optional |
|
How much to shift phase by, by default ("uniform", -np.pi, np.pi) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
shift: tuple = ("uniform", -np.pi, np.pi), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.shift = shift |
|
|
|
def _instantiate(self, state: RandomState): |
|
return {"shift": util.sample_from_dist(self.shift, state)} |
|
|
|
def _transform(self, signal, shift): |
|
return signal.shift_phase(shift) |
|
|
|
|
|
class InvertPhase(ShiftPhase): |
|
"""Inverts the phase of the audio. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`. |
|
|
|
Parameters |
|
---------- |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__(self, name: str = None, prob: float = 1): |
|
super().__init__(shift=("const", np.pi), name=name, prob=prob) |
|
|
|
|
|
class CorruptPhase(SpectralTransform): |
|
"""Corrupts the phase of the audio. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`. |
|
|
|
Parameters |
|
---------- |
|
scale : tuple, optional |
|
How much to corrupt phase by, by default ("uniform", 0, np.pi) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1 |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.scale = scale |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
scale = util.sample_from_dist(self.scale, state) |
|
corruption = state.normal(scale=scale, size=signal.phase.shape[1:]) |
|
return {"corruption": corruption.astype("float32")} |
|
|
|
def _transform(self, signal, corruption): |
|
return signal.shift_phase(shift=corruption) |
|
|
|
|
|
class FrequencyMask(SpectralTransform): |
|
"""Masks a band of frequencies at a center frequency |
|
from the audio. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`. |
|
|
|
Parameters |
|
---------- |
|
f_center : tuple, optional |
|
Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0) |
|
f_width : tuple, optional |
|
Width of zero'd out band, by default ("const", 0.1) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
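
    Examples
    --------
    A sketch: at a 44.1 kHz sample rate, ``f_center=0.5`` and ``f_width=0.1``
    mask roughly 9.9 kHz to 12.1 kHz (0.45 to 0.55 of the 22.05 kHz Nyquist):

    >>> transform = tfm.FrequencyMask(
    >>>     f_center=("const", 0.5),
    >>>     f_width=("const", 0.1),
    >>> )
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)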
|
""" |
|
|
|
def __init__( |
|
self, |
|
f_center: tuple = ("uniform", 0.0, 1.0), |
|
f_width: tuple = ("const", 0.1), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.f_center = f_center |
|
self.f_width = f_width |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
f_center = util.sample_from_dist(self.f_center, state) |
|
f_width = util.sample_from_dist(self.f_width, state) |
|
|
|
fmin = max(f_center - (f_width / 2), 0.0) |
|
fmax = min(f_center + (f_width / 2), 1.0) |
|
|
|
fmin_hz = (signal.sample_rate / 2) * fmin |
|
fmax_hz = (signal.sample_rate / 2) * fmax |
|
|
|
return {"fmin_hz": fmin_hz, "fmax_hz": fmax_hz} |
|
|
|
def _transform(self, signal, fmin_hz: float, fmax_hz: float): |
|
return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz) |
|
|
|
|
|
class TimeMask(SpectralTransform): |
|
"""Masks out contiguous time-steps from signal. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`. |
|
|
|
Parameters |
|
---------- |
|
t_center : tuple, optional |
|
        Center time, as a fraction of the signal duration between 0.0 and 1.0,
|
by default ("uniform", 0.0, 1.0) |
|
t_width : tuple, optional |
|
Width of dropped out portion, by default ("const", 0.025) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
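
    Examples
    --------
    A sketch: for a 2-second signal, ``t_center=0.5`` and ``t_width=0.025``
    mask roughly 0.975 s to 1.025 s:

    >>> transform = tfm.TimeMask(
    >>>     t_center=("const", 0.5),
    >>>     t_width=("const", 0.025),
    >>> )
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)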
|
""" |
|
|
|
def __init__( |
|
self, |
|
t_center: tuple = ("uniform", 0.0, 1.0), |
|
t_width: tuple = ("const", 0.025), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.t_center = t_center |
|
self.t_width = t_width |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal): |
|
t_center = util.sample_from_dist(self.t_center, state) |
|
t_width = util.sample_from_dist(self.t_width, state) |
|
|
|
tmin = max(t_center - (t_width / 2), 0.0) |
|
tmax = min(t_center + (t_width / 2), 1.0) |
|
|
|
tmin_s = signal.signal_duration * tmin |
|
tmax_s = signal.signal_duration * tmax |
|
return {"tmin_s": tmin_s, "tmax_s": tmax_s} |
|
|
|
def _transform(self, signal, tmin_s: float, tmax_s: float): |
|
return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s) |
|
|
|
|
|
class MaskLowMagnitudes(SpectralTransform): |
|
"""Masks low magnitude regions out of signal. |
|
|
|
Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`. |
|
|
|
Parameters |
|
---------- |
|
db_cutoff : tuple, optional |
|
        Decibel cutoff; magnitudes below this value are masked away,
|
by default ("uniform", -10, 10) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
db_cutoff: tuple = ("uniform", -10, 10), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.db_cutoff = db_cutoff |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
return {"db_cutoff": util.sample_from_dist(self.db_cutoff, state)} |
|
|
|
def _transform(self, signal, db_cutoff: float): |
|
return signal.mask_low_magnitudes(db_cutoff) |
|
|
|
|
|
class Smoothing(BaseTransform): |
|
"""Convolves the signal with a smoothing window. |
|
|
|
Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`. |
|
|
|
Parameters |
|
---------- |
|
window_type : tuple, optional |
|
Type of window to use, by default ("const", "average") |
|
window_length : tuple, optional |
|
Length of smoothing window, by |
|
default ("choice", [8, 16, 32, 64, 128, 256, 512]) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
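
    Examples
    --------
    A usage sketch, smoothing with a fixed 64-sample average window:

    >>> transform = tfm.Smoothing(window_length=("const", 64))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)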
|
""" |
|
|
|
def __init__( |
|
self, |
|
window_type: tuple = ("const", "average"), |
|
window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(name=name, prob=prob) |
|
self.window_type = window_type |
|
self.window_length = window_length |
|
|
|
def _instantiate(self, state: RandomState, signal: AudioSignal = None): |
|
window_type = util.sample_from_dist(self.window_type, state) |
|
window_length = util.sample_from_dist(self.window_length, state) |
|
window = signal.get_window( |
|
window_type=window_type, window_length=window_length, device="cpu" |
|
) |
|
return {"window": AudioSignal(window, signal.sample_rate)} |
|
|
|
def _transform(self, signal, window): |
|
        # Record the per-item peak of the input; guard against division
        # by zero for silent items.
        sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
        sscale[sscale == 0.0] = 1.0

        out = signal.convolve(window)

        # Rescale the output so its peak matches the input's peak.
        oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
        oscale[oscale == 0.0] = 1.0

        out = out * (sscale / oscale)
        return out
|
|
|
|
|
class TimeNoise(TimeMask): |
|
"""Similar to :py:func:`audiotools.data.transforms.TimeMask`, but |
|
replaces with noise instead of zeros. |
|
|
|
Parameters |
|
---------- |
|
t_center : tuple, optional |
|
        Center time, as a fraction of the signal duration between 0.0 and 1.0,
|
by default ("uniform", 0.0, 1.0) |
|
t_width : tuple, optional |
|
Width of dropped out portion, by default ("const", 0.025) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
t_center: tuple = ("uniform", 0.0, 1.0), |
|
t_width: tuple = ("const", 0.025), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob) |
|
|
|
def _transform(self, signal, tmin_s: float, tmax_s: float): |
|
signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0) |
|
mag, phase = signal.magnitude, signal.phase |
|
|
|
mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase) |
|
        # Bins zeroed out by the mask are identified and refilled with noise.
        mask = (mag == 0.0) * (phase == 0.0)
|
|
|
mag[mask] = mag_r[mask] |
|
phase[mask] = phase_r[mask] |
|
|
|
signal.magnitude = mag |
|
signal.phase = phase |
|
return signal |
|
|
|
|
|
class FrequencyNoise(FrequencyMask): |
|
"""Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but |
|
replaces with noise instead of zeros. |
|
|
|
Parameters |
|
---------- |
|
f_center : tuple, optional |
|
Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0) |
|
f_width : tuple, optional |
|
Width of zero'd out band, by default ("const", 0.1) |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
f_center: tuple = ("uniform", 0.0, 1.0), |
|
f_width: tuple = ("const", 0.1), |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob) |
|
|
|
def _transform(self, signal, fmin_hz: float, fmax_hz: float): |
|
signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz) |
|
mag, phase = signal.magnitude, signal.phase |
|
|
|
mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase) |
|
mask = (mag == 0.0) * (phase == 0.0) |
|
|
|
mag[mask] = mag_r[mask] |
|
phase[mask] = phase_r[mask] |
|
|
|
signal.magnitude = mag |
|
signal.phase = phase |
|
return signal |
|
|
|
|
|
class SpectralDenoising(Equalizer): |
|
"""Applies denoising algorithm detailed in |
|
:py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`, |
|
using a randomly generated noise signal for denoising. |
|
|
|
Parameters |
|
---------- |
|
eq_amount : tuple, optional |
|
Amount of eq to apply to noise signal, by default ("const", 1.0) |
|
denoise_amount : tuple, optional |
|
Amount to denoise by, by default ("uniform", 0.8, 1.0) |
|
nz_volume : float, optional |
|
Volume of noise to denoise with, by default -40 |
|
n_bands : int, optional |
|
Number of bands in equalizer, by default 6 |
|
n_freq : int, optional |
|
Number of frequency bins to smooth by, by default 3 |
|
n_time : int, optional |
|
Number of time bins to smooth by, by default 5 |
|
name : str, optional |
|
Name of this transform, used to identify it in the dictionary |
|
produced by ``self.instantiate``, by default None |
|
prob : float, optional |
|
Probability of applying this transform, by default 1.0 |
|
""" |
|
|
|
def __init__( |
|
self, |
|
eq_amount: tuple = ("const", 1.0), |
|
denoise_amount: tuple = ("uniform", 0.8, 1.0), |
|
nz_volume: float = -40, |
|
n_bands: int = 6, |
|
n_freq: int = 3, |
|
n_time: int = 5, |
|
name: str = None, |
|
prob: float = 1, |
|
): |
|
super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob) |
|
|
|
self.nz_volume = nz_volume |
|
self.denoise_amount = denoise_amount |
|
self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time) |
|
|
|
def _transform(self, signal, nz, eq, denoise_amount): |
|
nz = nz.normalize(self.nz_volume).equalizer(eq) |
|
self.spectral_gate = self.spectral_gate.to(signal.device) |
|
signal = self.spectral_gate(signal, nz, denoise_amount) |
|
return signal |
|
|
|
def _instantiate(self, state: RandomState): |
|
kwargs = super()._instantiate(state) |
|
kwargs["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state) |
|
kwargs["nz"] = AudioSignal(state.randn(22050), 44100) |
|
return kwargs |
|
|