import librosa
import numpy as np
import soundfile
import torch


def random_amplify(mix, targets, shapes, min, max):
    """
    Data augmentation by randomly amplifying sources before adding them to form a new mixture

    The residual (mixture minus every entry of ``targets`` except ``"mix"``) and each of those
    entries are scaled by independent gains drawn from ``np.random.uniform(min, max)``, then summed
    into a new mixture that is clipped to [-1.0, 1.0].

    The input ``mix`` array is left untouched. The ``targets`` dict IS updated in place (its
    non-"mix" entries are replaced by their amplified, cropped versions) and is also returned.

    :param mix: Original mixture
    :param targets: Source targets; dict mapping source name to audio. The "mix" key, if present, is skipped
    :param shapes: Shape dict from model (must provide "output_start_frame" and "output_end_frame")
    :param min: Minimum possible amplification (name shadows the builtin; kept for caller compatibility)
    :param max: Maximum possible amplification (name shadows the builtin; kept for caller compatibility)
    :return: New data point as tuple (mix, targets)
    """
    # Work on a private copy: binding ``residual = mix`` and then using ``-=`` would subtract
    # the sources from the caller's array in place and corrupt the original mixture.
    residual = np.array(mix, copy=True)
    for key in targets:
        if key != "mix":
            residual -= targets[key]  # subtract all instruments (output is zero if all instruments add to mix)

    # Also apply gain data augmentation to the residual; this creates the new mixture array
    mix = residual * np.random.uniform(min, max)
    for key in targets:
        if key != "mix":
            targets[key] = targets[key] * np.random.uniform(min, max)
            mix += targets[key]  # add instrument with gain data augmentation to mix

    mix = np.clip(mix, -1.0, 1.0)
    return crop_targets(mix, targets, shapes)


def crop_targets(mix, targets, shapes):
    """
    Crops target audio to the output shape required by the model given in "shapes"

    Every entry of ``targets`` except "mix" is sliced along its second axis to
    ``shapes["output_start_frame"]:shapes["output_end_frame"]``. The dict is updated in place.

    :param mix: Mixture audio, passed through unchanged (it is not cropped)
    :param targets: Source targets; dict mapping source name to audio with at least two dimensions
    :param shapes: Shape dict from model (must provide "output_start_frame" and "output_end_frame")
    :return: Tuple (mix, targets)
    """
    start = shapes["output_start_frame"]
    end = shapes["output_end_frame"]
    for key in targets:
        if key != "mix":
            targets[key] = targets[key][:, start:end]
    return mix, targets


def load(path, sr=22050, mono=True, mode="numpy", offset=0.0, duration=None):
    """
    Loads an audio file with librosa (using 'kaiser_fast' resampling)

    :param path: Path of the audio file
    :param sr: Sampling rate passed to librosa.load
    :param mono: Passed to librosa.load as its ``mono`` argument
    :param mode: "pytorch" to return a torch tensor; any other value returns the numpy array
    :param offset: Passed to librosa.load as its ``offset`` argument
    :param duration: Passed to librosa.load as its ``duration`` argument
    :return: Tuple (audio, sampling rate as returned by librosa.load); 1-D audio gets a leading axis added
    """
    y, curr_sr = librosa.load(path, sr=sr, mono=mono, res_type='kaiser_fast', offset=offset, duration=duration)

    if y.ndim == 1:
        # Expand channel dimension
        y = y[np.newaxis, :]

    if mode == "pytorch":
        y = torch.tensor(y)

    return y, curr_sr


def write_wav(path, audio, sr):
    """
    Writes audio to a file with soundfile, using the "PCM_16" subtype

    :param path: Output path
    :param audio: Audio array; it is transposed (``audio.T``) before being handed to soundfile.write
    :param sr: Sampling rate
    """
    soundfile.write(path, audio.T, sr, "PCM_16")


def resample(audio, orig_sr, new_sr, mode="numpy"):
    """
    Resamples audio from orig_sr to new_sr with librosa (using 'kaiser_fast' resampling)

    :param audio: Audio as numpy array or torch tensor (tensors are detached and moved to CPU first)
    :param orig_sr: Current sampling rate
    :param new_sr: Desired sampling rate
    :param mode: "pytorch" to return a torch tensor; any other value returns a numpy array
    :return: Resampled audio. If both rates are equal, the input is returned as is, without any
             type conversion (``mode`` is not applied in that case)
    """
    if orig_sr == new_sr:
        return audio

    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()

    out = librosa.resample(audio, orig_sr=orig_sr, target_sr=new_sr, res_type='kaiser_fast')

    if mode == "pytorch":
        out = torch.tensor(out)

    return out