Spaces:
Runtime error
Runtime error
File size: 4,951 Bytes
75c6e9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
from typing import Dict
import librosa
import numpy as np
from bytesep.utils import db_to_magnitude, get_pitch_shift_factor, magnitude_to_db
class Augmentor:
    r"""Apply a configurable chain of random augmentations to a waveform.

    The augmentations run in a fixed order (pitch shift, magnitude scale,
    channel swap, sign flip). A step runs only when its name is a key of the
    ``augmentations`` dict. All randomness comes from one seeded
    ``np.random.RandomState``, so a given seed and call sequence is
    reproducible.
    """

    # Order in which the enabled augmentations are applied by __call__.
    _PIPELINE = ('pitch_shift', 'magnitude_scale', 'swap_channel', 'flip_axis')

    def __init__(self, augmentations: Dict, random_seed: int = 1234):
        r"""Augmentor for data augmentation of a waveform.

        Args:
            augmentations: Dict, e.g, {
                'mixaudio': {'vocals': 2, 'accompaniment': 2},
                'pitch_shift': {'vocals': 4, 'accompaniment': 4},
                'magnitude_scale': {
                    'vocals': {'lower_db': -20, 'higher_db': 20},
                },
                ...,
            }
            random_seed: int
        """
        self.augmentations = augmentations
        self.random_state = np.random.RandomState(random_seed)

    def __call__(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Augment a waveform.

        Only the augmentations whose name is a key of ``self.augmentations``
        are applied; other keys (e.g. 'mixaudio') are ignored here.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        for name in self._PIPELINE:
            if name in self.augmentations:
                waveform = getattr(self, name)(waveform, source_type)

        return waveform

    def pitch_shift(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Shift the pitch of a waveform. We use resampling for fast pitch
        shifting, so the speed will also be changed. The length of the returned
        waveform will be changed.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        # Maximum pitch shift in semitones.
        max_pitch_shift = self.augmentations['pitch_shift'][source_type]

        if max_pitch_shift == 0:  # No pitch shift augmentations.
            return waveform

        # Random pitch shift drawn from [-max_pitch_shift, max_pitch_shift).
        rand_pitch = self.random_state.uniform(
            low=-max_pitch_shift, high=max_pitch_shift
        )

        # We use librosa.resample instead of librosa.effects.pitch_shift
        # because it is 10x times faster.
        # NOTE(review): get_pitch_shift_factor presumably maps semitones to a
        # frequency ratio -- confirm against bytesep.utils.
        pitch_shift_factor = get_pitch_shift_factor(rand_pitch)
        dummy_sample_rate = 10000  # Dummy constant; only the sr ratio matters.

        is_mono = waveform.shape[0] == 1

        if is_mono:
            # Index the single channel rather than np.squeeze, so that a
            # (1, 1) waveform becomes 1-D instead of collapsing to 0-d.
            waveform = waveform[0]

        new_waveform = librosa.resample(
            y=waveform,
            orig_sr=dummy_sample_rate,
            target_sr=dummy_sample_rate / pitch_shift_factor,
            res_type='linear',
            axis=-1,
        )

        if is_mono:
            # Restore the leading channel axis.
            new_waveform = new_waveform[None, :]

        return new_waveform

    def magnitude_scale(
        self, waveform: np.ndarray, source_type: str
    ) -> np.ndarray:
        r"""Scale the magnitude of a waveform.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        db_range = self.augmentations['magnitude_scale'][source_type]
        lower_db = db_range['lower_db']
        higher_db = db_range['higher_db']

        if lower_db == 0 and higher_db == 0:  # No magnitude scale augmentation.
            return waveform

        # The magnitude (in dB) of the sample with the maximum value.
        waveform_db = magnitude_to_db(np.max(np.abs(waveform)))

        # The upper bound is capped at 0 (dB) before sampling the new peak.
        new_waveform_db = self.random_state.uniform(
            waveform_db + lower_db, min(waveform_db + higher_db, 0)
        )

        relative_db = new_waveform_db - waveform_db

        # Convert the gain to a plain Python float: a NumPy float64 scalar
        # would upcast a float32 waveform to float64 under NumPy 2 promotion
        # rules, whereas a Python float leaves the array dtype unchanged.
        relative_scale = float(db_to_magnitude(relative_db))

        return waveform * relative_scale

    def swap_channel(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly swap channels.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused; kept so that every augmentation shares
                the same signature.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]

        if channels_num == 1:
            # Nothing to permute; no random number is consumed.
            return waveform

        random_axes = self.random_state.permutation(channels_num)
        return waveform[random_axes, :]

    def flip_axis(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly flip the waveform along x-axis, i.e., multiply each
        channel independently by either -1 or 1.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused; kept so that every augmentation shares
                the same signature.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]
        random_values = self.random_state.choice([-1, 1], size=channels_num)

        if np.issubdtype(waveform.dtype, np.floating):
            # The sampled signs are an integer array; multiplying a float32
            # waveform by it would promote the result to float64. Cast the
            # signs so the floating dtype of the waveform is preserved.
            random_values = random_values.astype(waveform.dtype)

        return waveform * random_values[:, None]
|