import copy
from contextlib import contextmanager
from inspect import signature
from typing import List

import numpy as np
import torch
from flatten_dict import flatten
from flatten_dict import unflatten
from numpy.random import RandomState

from .. import ml
from ..core import AudioSignal
from ..core import util
from .datasets import AudioLoader

tt = torch.tensor
"""Shorthand for converting things to torch.tensor."""


class BaseTransform:
    """This is the base class for all transforms that are implemented
    in this library. Transforms have two main operations: ``transform``
    and ``instantiate``.

    ``instantiate`` sets the parameters randomly
    from distribution tuples for each parameter. For example, for the
    ``BackgroundNoise`` transform, the signal-to-noise ratio (``snr``)
    is chosen randomly by instantiate. By default, it is chosen uniformly
    between 10.0 and 30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``).

    ``transform`` applies the transform using the instantiated parameters.
    A simple example is as follows:

    >>> seed = 0
    >>> signal = ...
    >>> transform = transforms.NoiseFloor(db=("uniform", -50.0, -30.0))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)

    By breaking apart the instantiation of parameters from the actual audio
    processing of the transform, we can make things more reproducible, while
    also applying the transform on batches of data efficiently on GPU,
    rather than on individual audio samples.

    .. note::
        We call ``signal.clone()`` for the input to the ``transform`` function
        because signals are modified in-place! If you don't clone the signal,
        you will lose the original data.

    Parameters
    ----------
    keys : list, optional
        Keys that the transform looks for when
        calling ``self.transform``, by default []. In general this is
        set automatically, and you won't need to manipulate this argument.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------
    >>> seed = 0
    >>>
    >>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
    >>> signal = AudioSignal(audio_path, offset=10, duration=2)
    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )
    >>>
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal, **kwargs)
    """

    def __init__(self, keys: list = [], name: str = None, prob: float = 1.0):
        # Get keys from the _transform signature.
        tfm_keys = list(signature(self._transform).parameters.keys())

        # Filter out signal and kwargs keys.
        ignore_keys = ["signal", "kwargs"]
        tfm_keys = [k for k in tfm_keys if k not in ignore_keys]

        # Combine keys specified by the child class, the keys found in
        # the _transform signature, and the mask key.
        self.keys = keys + tfm_keys + ["mask"]

        self.prob = prob

        if name is None:
            name = self.__class__.__name__
        self.name = name

    def _prepare(self, batch: dict):
        sub_batch = batch[self.name]

        for k in self.keys:
            assert k in sub_batch.keys(), f"{k} not in batch"

        return sub_batch

    def _transform(self, signal):
        return signal

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        return {}

    @staticmethod
    def apply_mask(batch: dict, mask: torch.Tensor):
        """Applies a mask to the batch.

        Parameters
        ----------
        batch : dict
            Batch whose values will be masked in the ``transform`` pass.
        mask : torch.Tensor
            Mask to apply to batch.

        Returns
        -------
        dict
            A dictionary that contains values only where ``mask = True``.
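
        Examples
        --------
        A minimal sketch of the masking semantics; the ``batch`` and
        ``mask`` values here are hypothetical:

        >>> batch = {"tfm": {"db": torch.tensor([-10.0, -20.0])}}
        >>> mask = torch.tensor([True, False])
        >>> BaseTransform.apply_mask(batch, mask)
        {'tfm': {'db': tensor([-10.])}}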
""" | |
masked_batch = {k: v[mask] for k, v in flatten(batch).items()} | |
return unflatten(masked_batch) | |

    def transform(self, signal: AudioSignal, **kwargs):
        """Apply the transform to the audio signal,
        with given keyword arguments.

        Parameters
        ----------
        signal : AudioSignal
            Signal that will be modified by the transforms in-place.
        kwargs : dict
            Keyword arguments to the specific transform's ``self._transform``
            function.

        Returns
        -------
        AudioSignal
            Transformed AudioSignal.

        Examples
        --------
        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)
        """
        tfm_kwargs = self._prepare(kwargs)
        mask = tfm_kwargs["mask"]

        if torch.any(mask):
            tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
            tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
            signal[mask] = self._transform(signal[mask], **tfm_kwargs)

        return signal

    def __call__(self, *args, **kwargs):
        return self.transform(*args, **kwargs)

    def instantiate(
        self,
        state: RandomState = None,
        signal: AudioSignal = None,
    ):
        """Instantiates parameters for the transform.

        Parameters
        ----------
        state : RandomState, optional
            Random state used to sample the transform's parameters,
            by default None
        signal : AudioSignal, optional
            AudioSignal to pass to ``self._instantiate``, if the transform
            needs it to compute its parameters, by default None

        Returns
        -------
        dict
            Dictionary containing instantiated arguments for every keyword
            argument to ``self._transform``.

        Examples
        --------
        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)
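
        The returned dictionary is nested under the transform's name, and
        a ``mask`` entry is always added. A small illustrative check,
        using the ``VolumeChange`` transform defined later in this module:

        >>> kwargs = VolumeChange().instantiate(0)
        >>> sorted(kwargs["VolumeChange"].keys())
        ['db', 'mask']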
""" | |
state = util.random_state(state) | |
# Not all instantiates need the signal. Check if signal | |
# is needed before passing it in, so that the end-user | |
# doesn't need to have variables they're not using flowing | |
# into their function. | |
needs_signal = "signal" in set(signature(self._instantiate).parameters.keys()) | |
kwargs = {} | |
if needs_signal: | |
kwargs = {"signal": signal} | |
# Instantiate the parameters for the transform. | |
params = self._instantiate(state, **kwargs) | |
for k in list(params.keys()): | |
v = params[k] | |
if isinstance(v, (AudioSignal, torch.Tensor, dict)): | |
params[k] = v | |
else: | |
params[k] = tt(v) | |
mask = state.rand() <= self.prob | |
params[f"mask"] = tt(mask) | |
# Put the params into a nested dictionary that will be | |
# used later when calling the transform. This is to avoid | |
# collisions in the dictionary. | |
params = {self.name: params} | |
return params | |

    def batch_instantiate(
        self,
        states: list = None,
        signal: AudioSignal = None,
    ):
        """Instantiates arguments for every item in a batch,
        given a list of states. Each state in the list
        corresponds to one item in the batch.

        Parameters
        ----------
        states : list, optional
            List of states, by default None
        signal : AudioSignal, optional
            AudioSignal to pass to the ``self.instantiate`` section
            if it is needed for this transform, by default None

        Returns
        -------
        dict
            Collated dictionary of arguments.

        Examples
        --------
        >>> batch_size = 4
        >>> signal = AudioSignal(audio_path, offset=10, duration=2)
        >>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
        >>>
        >>> states = [seed + idx for idx in list(range(batch_size))]
        >>> kwargs = transform.batch_instantiate(states, signal_batch)
        >>> batch_output = transform(signal_batch, **kwargs)
        """
        kwargs = []
        for state in states:
            kwargs.append(self.instantiate(state, signal))
        kwargs = util.collate(kwargs)
        return kwargs


class Identity(BaseTransform):
    """This transform just returns the original signal."""

    pass


class SpectralTransform(BaseTransform):
    """Spectral transforms require STFT data to exist, since they
    manipulate the spectrogram directly. This base class calls ``stft``
    before the transform is applied, and ``istft`` afterwards, so that
    the manipulated spectrogram is written back to the audio data.
    """

    def transform(self, signal, **kwargs):
        signal.stft()
        super().transform(signal, **kwargs)
        signal.istft()
        return signal


class Compose(BaseTransform):
    """Compose applies transforms in sequence, one after the other. The
    transforms are passed in as positional arguments or as a list like so:

    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )

    This will convolve the signal with a room impulse response, and then
    add background noise to the signal. ``instantiate`` instantiates
    all the parameters for every transform in the transform list, so the
    interface for using the Compose transform is the same as everything
    else:

    >>> kwargs = transform.instantiate()
    >>> output = transform(signal.clone(), **kwargs)

    Under the hood, Compose maps each transform to a unique name
    of the form ``{position}.{name}``, where ``position`` is the index
    of the transform in the list. ``Compose`` can nest
    within other ``Compose`` transforms, like so:

    >>> preprocess = transforms.Compose(
    >>>     tfm.GlobalVolumeNorm(),
    >>>     tfm.CrossTalk(),
    >>>     name="preprocess",
    >>> )
    >>> augment = transforms.Compose(
    >>>     tfm.RoomImpulseResponse(),
    >>>     tfm.BackgroundNoise(),
    >>>     name="augment",
    >>> )
    >>> postprocess = transforms.Compose(
    >>>     tfm.VolumeChange(),
    >>>     tfm.RescaleAudio(),
    >>>     tfm.ShiftPhase(),
    >>>     name="postprocess",
    >>> )
    >>> transform = transforms.Compose(preprocess, augment, postprocess)

    This defines 3 composed transforms, and then composes them in sequence
    with one another.
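
    The generated per-transform names can be inspected; a sketch, assuming
    the first ``transform`` defined above:

    >>> [t.name for t in transform]
    ['0.RoomImpulseResponse', '1.BackgroundNoise']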

    Parameters
    ----------
    *transforms : list
        List of transforms to apply
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
        if isinstance(transforms[0], list):
            transforms = transforms[0]

        for i, tfm in enumerate(transforms):
            tfm.name = f"{i}.{tfm.name}"

        keys = [tfm.name for tfm in transforms]
        super().__init__(keys=keys, name=name, prob=prob)

        self.transforms = transforms
        self.transforms_to_apply = keys

    @contextmanager
    def filter(self, *names: list):
        """This can be used to skip transforms entirely when applying
        the sequence of transforms to a signal. For example, take
        the following transforms with the names ``preprocess, augment, postprocess``.

        >>> preprocess = transforms.Compose(
        >>>     tfm.GlobalVolumeNorm(),
        >>>     tfm.CrossTalk(),
        >>>     name="preprocess",
        >>> )
        >>> augment = transforms.Compose(
        >>>     tfm.RoomImpulseResponse(),
        >>>     tfm.BackgroundNoise(),
        >>>     name="augment",
        >>> )
        >>> postprocess = transforms.Compose(
        >>>     tfm.VolumeChange(),
        >>>     tfm.RescaleAudio(),
        >>>     tfm.ShiftPhase(),
        >>>     name="postprocess",
        >>> )
        >>> transform = transforms.Compose(preprocess, augment, postprocess)

        If we wanted to apply all 3 to a signal, we do:

        >>> kwargs = transform.instantiate()
        >>> output = transform(signal.clone(), **kwargs)

        But if we only wanted to apply the ``preprocess`` and ``postprocess``
        transforms to the signal, we do:

        >>> with transform.filter("preprocess", "postprocess"):
        >>>     output = transform(signal.clone(), **kwargs)

        Parameters
        ----------
        *names : list
            List of transforms, identified by name, to apply to signal.
        """
        old_transforms = self.transforms_to_apply
        self.transforms_to_apply = names
        yield
        self.transforms_to_apply = old_transforms

    def _transform(self, signal, **kwargs):
        for transform in self.transforms:
            if any([x in transform.name for x in self.transforms_to_apply]):
                signal = transform(signal, **kwargs)
        return signal

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        parameters = {}
        for transform in self.transforms:
            parameters.update(transform.instantiate(state, signal=signal))
        return parameters

    def __getitem__(self, idx):
        return self.transforms[idx]

    def __len__(self):
        return len(self.transforms)

    def __iter__(self):
        for transform in self.transforms:
            yield transform


class Choose(Compose):
    """Choose logic is the same as :py:func:`audiotools.data.transforms.Compose`,
    but instead of applying all the transforms in sequence, it applies just a single transform,
    which is chosen for each item in the batch.

    Parameters
    ----------
    *transforms : list
        List of transforms to apply
    weights : list
        Probability of choosing any specific transform.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------
    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
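
    Weights are uniform by default; a sketch with explicit weights,
    favoring the low-pass filter 9 times out of 10:

    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass(), weights=[0.9, 0.1])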
""" | |
def __init__( | |
self, | |
*transforms: list, | |
weights: list = None, | |
name: str = None, | |
prob: float = 1.0, | |
): | |
super().__init__(*transforms, name=name, prob=prob) | |
if weights is None: | |
_len = len(self.transforms) | |
weights = [1 / _len for _ in range(_len)] | |
self.weights = np.array(weights) | |
def _instantiate(self, state: RandomState, signal: AudioSignal = None): | |
kwargs = super()._instantiate(state, signal) | |
tfm_idx = list(range(len(self.transforms))) | |
tfm_idx = state.choice(tfm_idx, p=self.weights) | |
one_hot = [] | |
for i, t in enumerate(self.transforms): | |
mask = kwargs[t.name]["mask"] | |
if mask.item(): | |
kwargs[t.name]["mask"] = tt(i == tfm_idx) | |
one_hot.append(kwargs[t.name]["mask"]) | |
kwargs["one_hot"] = one_hot | |
return kwargs | |


class Repeat(Compose):
    """Repeatedly applies a given transform ``n_repeat`` times.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    n_repeat : int, optional
        Number of times to repeat transform, by default 1
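
    Examples
    --------
    A sketch, applying phase shifts three times in a row (assuming
    ``signal`` is an existing ``AudioSignal``):

    >>> transform = Repeat(tfm.ShiftPhase(), n_repeat=3)
    >>> kwargs = transform.instantiate(0, signal)
    >>> output = transform(signal.clone(), **kwargs)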
""" | |
def __init__( | |
self, | |
transform, | |
n_repeat: int = 1, | |
name: str = None, | |
prob: float = 1.0, | |
): | |
transforms = [copy.copy(transform) for _ in range(n_repeat)] | |
super().__init__(transforms, name=name, prob=prob) | |
self.n_repeat = n_repeat | |


class RepeatUpTo(Choose):
    """Repeatedly applies a given transform up to ``max_repeat`` times.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    max_repeat : int, optional
        Max number of times to repeat transform, by default 5
    weights : list
        Probability of choosing any specific number up to ``max_repeat``.
    """

    def __init__(
        self,
        transform,
        max_repeat: int = 5,
        weights: list = None,
        name: str = None,
        prob: float = 1.0,
    ):
        transforms = []
        for n in range(1, max_repeat):
            transforms.append(Repeat(transform, n_repeat=n))
        super().__init__(transforms, name=name, prob=prob, weights=weights)

        self.max_repeat = max_repeat


class ClippingDistortion(BaseTransform):
    """Adds clipping distortion to signal. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.

    Parameters
    ----------
    perc : tuple, optional
        Clipping percentile. Values are between 0.0 and 1.0.
        Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        perc: tuple = ("uniform", 0.0, 0.1),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.perc = perc

    def _instantiate(self, state: RandomState):
        return {"perc": util.sample_from_dist(self.perc, state)}

    def _transform(self, signal, perc):
        return signal.clip_distortion(perc)


class Equalizer(BaseTransform):
    """Applies an equalization curve to the audio signal. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.

    Parameters
    ----------
    eq_amount : tuple, optional
        The maximum dB cut to apply to the audio in any band,
        by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in EQ, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.eq_amount = eq_amount
        self.n_bands = n_bands

    def _instantiate(self, state: RandomState):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        eq = -eq_amount * state.rand(self.n_bands)
        return {"eq": eq}

    def _transform(self, signal, eq):
        return signal.equalizer(eq)


class Quantization(BaseTransform):
    """Applies quantization to the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Number of evenly spaced quantization channels to quantize
        to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        return {"channels": util.sample_from_dist(self.channels, state)}

    def _transform(self, signal, channels):
        return signal.quantization(channels)


class MuLawQuantization(BaseTransform):
    """Applies mu-law quantization to the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Number of mu-law spaced quantization channels to quantize
        to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        return {"channels": util.sample_from_dist(self.channels, state)}

    def _transform(self, signal, channels):
        return signal.mulaw_quantization(channels)


class NoiseFloor(BaseTransform):
    """Adds a noise floor of Gaussian noise to the signal at a specified
    dB.

    Parameters
    ----------
    db : tuple, optional
        Level of noise to add to signal, by default ("const", -50.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
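
    Examples
    --------
    A usage sketch (assuming ``signal`` is an existing ``AudioSignal``;
    the signal is needed at instantiation so the noise can match its
    shape and sample rate):

    >>> transform = NoiseFloor(db=("uniform", -50.0, -30.0))
    >>> kwargs = transform.instantiate(0, signal)
    >>> output = transform(signal.clone(), **kwargs)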
""" | |
def __init__( | |
self, | |
db: tuple = ("const", -50.0), | |
name: str = None, | |
prob: float = 1.0, | |
): | |
super().__init__(name=name, prob=prob) | |
self.db = db | |
def _instantiate(self, state: RandomState, signal: AudioSignal): | |
db = util.sample_from_dist(self.db, state) | |
audio_data = state.randn(signal.num_channels, signal.signal_length) | |
nz_signal = AudioSignal(audio_data, signal.sample_rate) | |
nz_signal.normalize(db) | |
return {"nz_signal": nz_signal} | |
def _transform(self, signal, nz_signal): | |
# Clone bg_signal so that transform can be repeatedly applied | |
# to different signals with the same effect. | |
return signal + nz_signal | |


class BackgroundNoise(BaseTransform):
    """Adds background noise from audio specified by a set of CSV files.
    A valid CSV file looks like, and is typically generated by
    :py:func:`audiotools.data.preprocess.create_csv`:

    .. csv-table::
        :header: path

        room_tone/m6_script2_clean.wav
        room_tone/m6_script2_cleanraw.wav
        room_tone/m6_script2_ipad_balcony1.wav
        room_tone/m6_script2_ipad_bedroom1.wav
        room_tone/m6_script2_ipad_confroom1.wav
        room_tone/m6_script2_ipad_confroom2.wav
        room_tone/m6_script2_ipad_livingroom1.wav
        room_tone/m6_script2_ipad_office1.wav

    .. note::
        All paths are relative to an environment variable called ``PATH_TO_DATA``,
        so that CSV files are portable across machines where data may be
        located in different places.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
    hood.

    Parameters
    ----------
    snr : tuple, optional
        Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 3
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default None
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 10.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 3,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = None,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        eq = -eq_amount * state.rand(self.n_bands)
        snr = util.sample_from_dist(self.snr, state)

        bg_signal = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )["signal"]

        return {"eq": eq, "bg_signal": bg_signal, "snr": snr}

    def _transform(self, signal, bg_signal, snr, eq):
        # Clone bg_signal so that transform can be repeatedly applied
        # to different signals with the same effect.
        return signal.mix(bg_signal.clone(), snr, eq)


class CrossTalk(BaseTransform):
    """Adds crosstalk between speakers, whose audio is drawn from a CSV file
    that was produced via :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    under the hood.

    Parameters
    ----------
    snr : tuple, optional
        How loud cross-talk speaker is relative to original signal in dB,
        by default ("uniform", 0.0, 10.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default -40
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 0.0, 10.0),
        sources: List[str] = None,
        weights: List[float] = None,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = -40,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        snr = util.sample_from_dist(self.snr, state)
        crosstalk_signal = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )["signal"]

        return {"crosstalk_signal": crosstalk_signal, "snr": snr}

    def _transform(self, signal, crosstalk_signal, snr):
        # Clone crosstalk_signal so that transform can be repeatedly applied
        # to different signals with the same effect. The mix is re-normalized
        # to the original signal's loudness.
        loudness = signal.loudness()
        mix = signal.mix(crosstalk_signal.clone(), snr)
        mix.normalize(loudness)
        return mix


class RoomImpulseResponse(BaseTransform):
    """Convolves signal with a room impulse response, at a specified
    direct-to-reverberant ratio, with equalization applied. Room impulse
    response data is drawn from a CSV file that was produced via
    :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
    under the hood.

    Parameters
    ----------
    drr : tuple, optional
        Direct-to-reverberant ratio of the impulse response in dB,
        by default ("uniform", 0.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    use_original_phase : bool, optional
        Whether or not to use the original phase, by default False
    offset : float, optional
        Offset from each impulse response file to use, by default 0.0
    duration : float, optional
        Duration of each impulse response, by default 1.0
    """

    def __init__(
        self,
        drr: tuple = ("uniform", 0.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
        use_original_phase: bool = False,
        offset: float = 0.0,
        duration: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.drr = drr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.use_original_phase = use_original_phase

        self.loader = AudioLoader(sources, weights)
        self.offset = offset
        self.duration = duration

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        eq = -eq_amount * state.rand(self.n_bands)
        drr = util.sample_from_dist(self.drr, state)

        ir_signal = self.loader(
            state,
            signal.sample_rate,
            offset=self.offset,
            duration=self.duration,
            loudness_cutoff=None,
            num_channels=signal.num_channels,
        )["signal"]
        # Pad the impulse response to one second's worth of samples.
        ir_signal.zero_pad_to(signal.sample_rate)

        return {"eq": eq, "ir_signal": ir_signal, "drr": drr}

    def _transform(self, signal, ir_signal, drr, eq):
        # Clone ir_signal so that transform can be repeatedly applied
        # to different signals with the same effect.
        return signal.apply_ir(
            ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase
        )


class VolumeChange(BaseTransform):
    """Changes the volume of the input signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        Change in volume in decibels, by default ("uniform", -12.0, 0.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("uniform", -12.0, 0.0),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)
        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.volume_change(db)


class VolumeNorm(BaseTransform):
    """Normalizes the volume of the excerpt to a specified decibel.

    Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.normalize(db)


class GlobalVolumeNorm(BaseTransform):
    """Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
    transform also normalizes the volume of a signal, but it uses
    the volume of the entire audio file the loaded excerpt comes from,
    rather than the volume of just the excerpt. The volume of the
    entire audio file is expected in ``signal.metadata["loudness"]``.
    If loading audio from a CSV generated by :py:func:`audiotools.data.preprocess.create_csv`
    with ``loudness = True``, like the following:

    .. csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    then the ``AudioLoader`` will automatically load the loudness column into
    the metadata of the signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
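
    Examples
    --------
    A sketch, setting the file-level loudness by hand (normally it is
    filled in by ``AudioLoader``; ``signal`` is an existing
    ``AudioSignal``):

    >>> signal.metadata["loudness"] = -16.5
    >>> transform = GlobalVolumeNorm(db=("const", -24))
    >>> kwargs = transform.instantiate(0, signal)
    >>> output = transform(signal.clone(), **kwargs)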
""" | |
def __init__( | |
self, | |
db: tuple = ("const", -24), | |
name: str = None, | |
prob: float = 1.0, | |
): | |
super().__init__(name=name, prob=prob) | |
self.db = db | |
def _instantiate(self, state: RandomState, signal: AudioSignal): | |
if "loudness" not in signal.metadata: | |
db_change = 0.0 | |
elif float(signal.metadata["loudness"]) == float("-inf"): | |
db_change = 0.0 | |
else: | |
db = util.sample_from_dist(self.db, state) | |
db_change = db - float(signal.metadata["loudness"]) | |
return {"db": db_change} | |
def _transform(self, signal, db): | |
return signal.volume_change(db) | |


class Silence(BaseTransform):
    """Zeros out the signal with some probability.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 0.1
    """

    def __init__(self, name: str = None, prob: float = 0.1):
        super().__init__(name=name, prob=prob)

    def _transform(self, signal):
        _loudness = signal._loudness
        signal = AudioSignal(
            torch.zeros_like(signal.audio_data),
            sample_rate=signal.sample_rate,
            stft_params=signal.stft_params,
        )
        # Keep the original loudness, so that the amount of noise added
        # downstream is as if the signal wasn't silenced.
        # TODO: improve this hack
        signal._loudness = _loudness

        return signal


class LowPass(BaseTransform):
    """Applies a LowPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [4000, 8000, 16000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [4000, 8000, 16000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)

        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.low_pass(cutoff, zeros=self.zeros)


class HighPass(BaseTransform):
    """Applies a HighPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [50, 100, 250, 500, 1000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)

        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.high_pass(cutoff, zeros=self.zeros)


class RescaleAudio(BaseTransform):
    """Rescales the audio so it is in between ``-val`` and ``val``
    only if the original audio exceeds those bounds. Useful if
    transforms have caused the audio to clip.

    Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.

    Parameters
    ----------
    val : float, optional
        Max absolute value of signal, by default 1.0
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, val: float = 1.0, name: str = None, prob: float = 1):
        super().__init__(name=name, prob=prob)

        self.val = val

    def _transform(self, signal):
        return signal.ensure_max_of_audio(self.val)


class ShiftPhase(SpectralTransform):
    """Shifts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    shift : tuple, optional
        How much to shift phase by, by default ("uniform", -np.pi, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        shift: tuple = ("uniform", -np.pi, np.pi),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.shift = shift

    def _instantiate(self, state: RandomState):
        return {"shift": util.sample_from_dist(self.shift, state)}

    def _transform(self, signal, shift):
        return signal.shift_phase(shift)


class InvertPhase(ShiftPhase):
    """Inverts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, name: str = None, prob: float = 1):
        super().__init__(shift=("const", np.pi), name=name, prob=prob)


class CorruptPhase(SpectralTransform):
    """Corrupts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`.

    Parameters
    ----------
    scale : tuple, optional
        How much to corrupt phase by, by default ("uniform", 0, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
    ):
        super().__init__(name=name, prob=prob)
        self.scale = scale

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        scale = util.sample_from_dist(self.scale, state)
        corruption = state.normal(scale=scale, size=signal.phase.shape[1:])
        return {"corruption": corruption.astype("float32")}

    def _transform(self, signal, corruption):
        return signal.shift_phase(shift=corruption)


class FrequencyMask(SpectralTransform):
    """Masks a band of frequencies at a center frequency
    from the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
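
    Examples
    --------
    A worked sketch: with ``f_center = 0.5`` and ``f_width = 0.1`` on a
    44.1 kHz signal, the band from 0.45 to 0.55 of Nyquist (22050 Hz),
    i.e. 9922.5 Hz to 12127.5 Hz, is masked:

    >>> transform = FrequencyMask(f_center=("const", 0.5), f_width=("const", 0.1))
    >>> kwargs = transform.instantiate(0, signal)
    >>> output = transform(signal.clone(), **kwargs)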
""" | |
def __init__( | |
self, | |
f_center: tuple = ("uniform", 0.0, 1.0), | |
f_width: tuple = ("const", 0.1), | |
name: str = None, | |
prob: float = 1, | |
): | |
super().__init__(name=name, prob=prob) | |
self.f_center = f_center | |
self.f_width = f_width | |
def _instantiate(self, state: RandomState, signal: AudioSignal): | |
f_center = util.sample_from_dist(self.f_center, state) | |
f_width = util.sample_from_dist(self.f_width, state) | |
fmin = max(f_center - (f_width / 2), 0.0) | |
fmax = min(f_center + (f_width / 2), 1.0) | |
fmin_hz = (signal.sample_rate / 2) * fmin | |
fmax_hz = (signal.sample_rate / 2) * fmax | |
return {"fmin_hz": fmin_hz, "fmax_hz": fmax_hz} | |
def _transform(self, signal, fmin_hz: float, fmax_hz: float): | |
return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz) | |


class TimeMask(SpectralTransform):
    """Masks out contiguous time-steps from signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.t_center = t_center
        self.t_width = t_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        t_center = util.sample_from_dist(self.t_center, state)
        t_width = util.sample_from_dist(self.t_width, state)

        tmin = max(t_center - (t_width / 2), 0.0)
        tmax = min(t_center + (t_width / 2), 1.0)

        tmin_s = signal.signal_duration * tmin
        tmax_s = signal.signal_duration * tmax

        return {"tmin_s": tmin_s, "tmax_s": tmax_s}

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)


class MaskLowMagnitudes(SpectralTransform):
    """Masks low magnitude regions out of signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.

    Parameters
    ----------
    db_cutoff : tuple, optional
        Decibel value below which magnitudes will be masked away,
        by default ("uniform", -10, 10)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db_cutoff: tuple = ("uniform", -10, 10),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.db_cutoff = db_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        return {"db_cutoff": util.sample_from_dist(self.db_cutoff, state)}

    def _transform(self, signal, db_cutoff: float):
        return signal.mask_low_magnitudes(db_cutoff)


class Smoothing(BaseTransform):
    """Convolves the signal with a smoothing window.

    Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.

    Parameters
    ----------
    window_type : tuple, optional
        Type of window to use, by default ("const", "average")
    window_length : tuple, optional
        Length of smoothing window, by
        default ("choice", [8, 16, 32, 64, 128, 256, 512])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        window_type: tuple = ("const", "average"),
        window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.window_type = window_type
        self.window_length = window_length

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        window_type = util.sample_from_dist(self.window_type, state)
        window_length = util.sample_from_dist(self.window_length, state)
        window = signal.get_window(
            window_type=window_type, window_length=window_length, device="cpu"
        )
        return {"window": AudioSignal(window, signal.sample_rate)}

    def _transform(self, signal, window):
        # Rescale the convolved output so its peak level matches the
        # input's, guarding against division by zero for silent signals.
        sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
        sscale[sscale == 0.0] = 1.0

        out = signal.convolve(window)

        oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
        oscale[oscale == 0.0] = 1.0

        out = out * (sscale / oscale)
        return out


class TimeNoise(TimeMask):
    """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob)

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)
        mag, phase = signal.magnitude, signal.phase

        # Replace the zeroed-out bins (where magnitude and phase are
        # both zero) with random noise.
        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase

        return signal


class FrequencyNoise(FrequencyMask):
    """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of band to replace with noise, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob)

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
        mag, phase = signal.magnitude, signal.phase

        # Replace the masked bins (where magnitude and phase are both
        # zero) with random noise.
        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase

        return signal


class SpectralDenoising(Equalizer):
    """Applies denoising algorithm detailed in
    :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
    using a randomly generated noise signal for denoising.

    Parameters
    ----------
    eq_amount : tuple, optional
        Amount of eq to apply to noise signal, by default ("const", 1.0)
    denoise_amount : tuple, optional
        Amount to denoise by, by default ("uniform", 0.8, 1.0)
    nz_volume : float, optional
        Volume of noise to denoise with, by default -40
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    n_freq : int, optional
        Number of frequency bins to smooth by, by default 3
    n_time : int, optional
        Number of time bins to smooth by, by default 5
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        denoise_amount: tuple = ("uniform", 0.8, 1.0),
        nz_volume: float = -40,
        n_bands: int = 6,
        n_freq: int = 3,
        n_time: int = 5,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob)

        self.nz_volume = nz_volume
        self.denoise_amount = denoise_amount
        self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)

    def _transform(self, signal, nz, eq, denoise_amount):
        nz = nz.normalize(self.nz_volume).equalizer(eq)
        self.spectral_gate = self.spectral_gate.to(signal.device)
        signal = self.spectral_gate(signal, nz, denoise_amount)
        return signal

    def _instantiate(self, state: RandomState):
        kwargs = super()._instantiate(state)
        kwargs["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
        kwargs["nz"] = AudioSignal(state.randn(22050), 44100)
        return kwargs