hieupt's picture
Update data/utils.py
e7063e9 verified
import librosa
import numpy as np
import soundfile
import torch
def random_amplify(mix, targets, shapes, min, max):
    '''
    Data augmentation by randomly amplifying sources before adding them to form a new mixture.

    The residual (mixture minus all sources) and every source each receive an
    independent gain drawn uniformly from [min, max]; the re-scaled parts are summed
    into a new mixture, which is clipped to [-1.0, 1.0]. The result is then passed
    through crop_targets.

    Neither input is modified: the residual is computed on a copy of ``mix`` and the
    re-scaled sources are stored in a new dict.

    :param mix: Original mixture (assumed to be a NumPy array with the same shape as each source)
    :param targets: Dict of source targets; an entry keyed "mix" is not amplified and is passed through
    :param shapes: Shape dict from model, forwarded to crop_targets
    :param min: Minimum possible amplification
    :param max: Maximum possible amplification
    :return: New data point as tuple (mix, targets)
    '''
    # Copy first: the in-place subtraction below must not corrupt the caller's mixture.
    residual = np.array(mix, copy=True)
    for key in targets:
        if key != "mix":
            residual -= targets[key]  # residual is zero if all sources add up to the mix

    # The residual gets its own random gain, drawn before the per-source gains.
    new_mix = residual * np.random.uniform(min, max)

    # Shallow copy so the caller's dict keeps its original (unscaled) sources.
    scaled = dict(targets)
    for key in targets:
        if key != "mix":
            scaled[key] = targets[key] * np.random.uniform(min, max)
            new_mix += scaled[key]  # add the re-scaled source to the new mixture

    new_mix = np.clip(new_mix, -1.0, 1.0)
    return crop_targets(new_mix, scaled, shapes)
def crop_targets(mix, targets, shapes):
    '''
    Trim every source in "targets" along its second axis to the frame window
    shapes["output_start_frame"]:shapes["output_end_frame"].

    The "targets" dict is updated in place; an entry keyed "mix" is left as it is.

    :param mix: Mixture, returned unchanged
    :param targets: Dict of source targets to crop
    :param shapes: Dict providing "output_start_frame" and "output_end_frame"
    :return: Tuple (mix, targets)
    '''
    source_names = [name for name in targets if name != "mix"]
    for name in source_names:
        # The window is looked up per source, so "shapes" is only read when there is something to crop
        window = slice(shapes["output_start_frame"], shapes["output_end_frame"])
        targets[name] = targets[name][:, window]
    return mix, targets
def load(path, sr=22050, mono=True, mode="numpy", offset=0.0, duration=None):
    '''
    Read an audio file with librosa.load (res_type 'kaiser_fast') and make sure the
    result has a leading channel dimension.

    :param path: Audio file to read
    :param sr: Sampling rate passed to librosa.load
    :param mono: Passed to librosa.load as its "mono" argument
    :param mode: "pytorch" to get a torch tensor, anything else keeps the NumPy array
    :param offset: Passed to librosa.load as its "offset" argument
    :param duration: Passed to librosa.load as its "duration" argument
    :return: Tuple (audio, sampling rate reported by librosa.load)
    '''
    audio, loaded_sr = librosa.load(
        path,
        sr=sr,
        mono=mono,
        res_type='kaiser_fast',
        offset=offset,
        duration=duration,
    )

    # One-dimensional audio gets an explicit channel axis in front
    if audio.ndim == 1:
        audio = np.expand_dims(audio, axis=0)

    if mode == "pytorch":
        audio = torch.tensor(audio)

    return audio, loaded_sr
def write_wav(path, audio, sr):
    '''
    Write audio to "path" with soundfile using the "PCM_16" subtype.

    :param path: Output file
    :param audio: Audio data; it is transposed before being handed to soundfile
    :param sr: Sampling rate of the audio
    '''
    frames = audio.T
    soundfile.write(path, frames, sr, subtype="PCM_16")
def resample(audio, orig_sr, new_sr, mode="numpy"):
    '''
    Resample audio from orig_sr to new_sr with librosa (res_type 'kaiser_fast').

    The output type follows "mode" on every path, including when no resampling
    is needed because both rates are equal. Input that already has the requested
    type and rate is returned as the same object, without copying.

    :param audio: Audio as NumPy array or torch tensor
    :param orig_sr: Sampling rate of the input audio
    :param new_sr: Desired sampling rate
    :param mode: "pytorch" to get a torch tensor, anything else yields a NumPy array
    :return: Audio at new_sr, in the type selected by "mode"
    '''
    if orig_sr != new_sr:
        # librosa works on NumPy arrays, so tensors are moved to host memory first
        if isinstance(audio, torch.Tensor):
            audio = audio.detach().cpu().numpy()
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=new_sr, res_type='kaiser_fast')

    is_tensor = isinstance(audio, torch.Tensor)
    if mode == "pytorch":
        return audio if is_tensor else torch.tensor(audio)
    if is_tensor:
        # Only reachable when the rates matched and a tensor was passed in
        return audio.detach().cpu().numpy()
    return audio