import os
import math

import numpy as np
import ffmpeg
import paddle
import paddle.nn as nn
from scipy.signal.windows import hann
from librosa.core import stft, istft


class UNet(nn.Layer):
    """Spleeter-style U-Net that predicts a soft mask for a 2-channel magnitude spectrogram."""

    def __init__(self, use_elu=False):
        super(UNet, self).__init__()
        self.use_elu = use_elu
        # Asymmetric 'same' padding applied before each stride-2, kernel-5 convolution.
        self.pad = nn.Pad2D(padding=[1, 2, 1, 2])

        ### Encoder ###
        # First Layer
        self.conv1 = nn.Conv2D(2, 16, kernel_size=5, stride=2)
        self.encoder1 = self.encoder_block(16)
        # Second Layer
        self.conv2 = nn.Conv2D(16, 32, kernel_size=5, stride=2)
        self.encoder2 = self.encoder_block(32)
        # Third Layer
        self.conv3 = nn.Conv2D(32, 64, kernel_size=5, stride=2)
        self.encoder3 = self.encoder_block(64)
        # Fourth Layer
        self.conv4 = nn.Conv2D(64, 128, kernel_size=5, stride=2)
        self.encoder4 = self.encoder_block(128)
        # Fifth Layer
        self.conv5 = nn.Conv2D(128, 256, kernel_size=5, stride=2)
        self.encoder5 = self.encoder_block(256)
        # Sixth Layer
        self.conv6 = nn.Conv2D(256, 512, kernel_size=5, stride=2)
        self.encoder6 = self.encoder_block(512)

        ### Decoder ###
        # First Layer
        self.decoder1 = self.decoder_block(512, 256, dropout=True)
        # Second Layer
        self.decoder2 = self.decoder_block(512, 128, dropout=True)
        # Third Layer
        self.decoder3 = self.decoder_block(256, 64, dropout=True)
        # Fourth Layer
        self.decoder4 = self.decoder_block(128, 32)
        # Fifth Layer
        self.decoder5 = self.decoder_block(64, 16)
        # Sixth Layer
        self.decoder6 = self.decoder_block(32, 1)
        # Last Layer
        self.mask = nn.Conv2D(1, 2, kernel_size=4, dilation=2, padding=3)
        self.sig = nn.Sigmoid()

    def encoder_block(self, out_channel):
        if not self.use_elu:
            return nn.Sequential(
                nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01),
                nn.LeakyReLU(0.2))
        else:
            return nn.Sequential(
                nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01),
                nn.ELU())

    def decoder_block(self, in_channel, out_channel, dropout=False):
        layers = [
            nn.Conv2DTranspose(in_channel, out_channel, kernel_size=5, stride=2)
        ]
        if not self.use_elu:
            layers.append(nn.ReLU())
        else:
            layers.append(nn.ELU())
        layers.append(nn.BatchNorm2D(out_channel, epsilon=1e-3, momentum=0.01))
        if dropout:
            layers.append(nn.Dropout(0.5))
        return nn.Sequential(*layers)

    def forward(self, x):
        ### Encoder ###
        skip1 = self.pad(x)
        skip1 = self.conv1(skip1)
        down1 = self.encoder1(skip1)

        skip2 = self.pad(down1)
        skip2 = self.conv2(skip2)
        down2 = self.encoder2(skip2)

        skip3 = self.pad(down2)
        skip3 = self.conv3(skip3)
        down3 = self.encoder3(skip3)

        skip4 = self.pad(down3)
        skip4 = self.conv4(skip4)
        down4 = self.encoder4(skip4)

        skip5 = self.pad(down4)
        skip5 = self.conv5(skip5)
        down5 = self.encoder5(skip5)

        skip6 = self.pad(down5)
        skip6 = self.conv6(skip6)
        down6 = self.encoder6(skip6)

        ### Decoder ###
        up1 = self.decoder1(skip6)
        up1 = up1[:, :, 1: -2, 1: -2]
        merge1 = paddle.concat((skip5, up1), 1)

        up2 = self.decoder2(merge1)
        up2 = up2[:, :, 1: -2, 1: -2]
        merge2 = paddle.concat((skip4, up2), 1)

        up3 = self.decoder3(merge2)
        up3 = up3[:, :, 1: -2, 1: -2]
        merge3 = paddle.concat((skip3, up3), 1)

        up4 = self.decoder4(merge3)
        up4 = up4[:, :, 1: -2, 1: -2]
        merge4 = paddle.concat((skip2, up4), 1)

        up5 = self.decoder5(merge4)
        up5 = up5[:, :, 1: -2, 1: -2]
        merge5 = paddle.concat((skip1, up5), 1)

        up6 = self.decoder6(merge5)
        up6 = up6[:, :, 1: -2, 1: -2]

        m = self.mask(up6)
        m = self.sig(m)
        return m * x
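
# Illustrative shape check only (the helper name `_unet_shape_sketch` and the
# T=512, F=1024 sizes are assumptions for this sketch, not constraints of the
# model): each stride-2 encoder conv exactly halves the padded (T, F) grid, each
# transposed conv doubles it back after the [1:-2, 1:-2] crop, and the dilated
# mask conv preserves the size, so the predicted soft mask matches the input
# magnitude spectrogram shape.
def _unet_shape_sketch():
    net = UNet(use_elu=False)
    net.eval()
    x = paddle.randn([1, 2, 512, 1024])  # (batch, channels, T, F)
    y = net(x)
    assert y.shape == x.shape  # the mask is applied element-wise to the input
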
class Separator(object):
    def __init__(self, params):
        self.num_instruments = params['num_instruments']
        self.output_dir = params['output_dir']
        self.model_list = nn.LayerList()
        for name in self.num_instruments:
            print('Loading model for instrument {}'.format(name))
            net = UNet(use_elu=params['use_elu'])
            net.eval()
            state_dict = paddle.load(os.path.join(
                params['checkpoint_path'],
                '%dstems_%s.pdparams' % (len(self.num_instruments), name)))
            net.set_dict(state_dict)
            self.model_list.append(net)
        self.T = params['T']
        self.F = params['F']
        self.frame_length = params['frame_length']
        self.frame_step = params['frame_step']
        self.samplerate = params['sample_rate']

    def _load_audio(
            self,
            path,
            offset=None,
            duration=None,
            sample_rate=None,
            dtype=np.float32):
        """
        Loads the audio file denoted by the given path and returns its data
        as a waveform.

        :param path: Path of the audio file to load data from.
        :param offset: (Optional) Start offset to load from in seconds.
        :param duration: (Optional) Duration to load in seconds.
        :param sample_rate: (Optional) Sample rate to load audio with.
        :param dtype: (Optional) Numpy data type to use, default to float32.
        :returns: Loaded data as a (waveform, sample_rate) tuple.
        """
        if not isinstance(path, str):
            path = path.decode()
        probe = ffmpeg.probe(path)
        metadata = next(
            stream for stream in probe['streams']
            if stream['codec_type'] == 'audio')
        n_channels = metadata['channels']
        if sample_rate is None:
            sample_rate = metadata['sample_rate']
        output_kwargs = {'format': 'f32le', 'ar': sample_rate}
        process = (
            ffmpeg
            .input(path)
            .output('pipe:', **output_kwargs)
            .run_async(pipe_stdout=True, pipe_stderr=True))
        buffer, _ = process.communicate()
        # f32le gives interleaved little-endian float32 samples.
        waveform = np.frombuffer(buffer, dtype='<f4').reshape(-1, n_channels)
        if not waveform.dtype == np.dtype(dtype):
            waveform = waveform.astype(dtype)
        return waveform, sample_rate

    def _stft(self, data, inverse=False, length=None):
        """
        Computes the STFT of a (samples, channels) waveform, or the inverse
        STFT of a (frames, bins, channels) spectrogram when inverse is True.
        """
        assert not (inverse and length is None)
        data = np.asfortranarray(data)
        N = self.frame_length
        H = self.frame_step
        win = hann(N, sym=False)
        fstft = istft if inverse else stft
        win_len_arg = (
            {'win_length': None, 'length': length} if inverse else {'n_fft': N})
        n_channels = data.shape[-1]
        out = []
        for c in range(n_channels):
            d = data[:, :, c].T if inverse else data[:, c]
            s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
            s = np.expand_dims(s.T, 2 - inverse)
            out.append(s)
        if len(out) == 1:
            return out[0]
        return np.concatenate(out, axis=2 - inverse)

    def _pad_and_partition(self, tensor, T):
        """
        Pads the last (time) axis of a 1 x 2 x F x L tensor to a multiple of T
        and splits it into a B x 2 x F x T batch of chunks.
        """
        old_size = tensor.shape[3]
        new_size = math.ceil(old_size / T) * T
        tensor = paddle.nn.functional.pad(
            tensor, [0, new_size - old_size, 0, 0])
        split = new_size // T
        return paddle.concat(paddle.split(tensor, split, axis=3), axis=0)

    def _save_to_file(self, path, data, sample_rate, codec, bitrate):
        """
        Writes a (samples, channels) float32 waveform to `path` through ffmpeg.
        """
        directory = os.path.dirname(path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        input_kwargs = {'ar': sample_rate, 'ac': data.shape[1]}
        output_kwargs = {'ar': sample_rate, 'audio_bitrate': bitrate}
        if codec is not None and codec != 'wav':
            output_kwargs['codec'] = codec
        process = (
            ffmpeg
            .input('pipe:', format='f32le', **input_kwargs)
            .output(path, **output_kwargs)
            .overwrite_output()
            .run_async(pipe_stdin=True, quiet=True))
        process.stdin.write(data.astype('<f4').tobytes())
        process.stdin.close()
        process.wait()

    def separate(self, input_wav):
        wav_name = os.path.splitext(os.path.basename(input_wav))[0]
        output_dir = self.output_dir
        source_audio, samplerate = self._load_audio(
            input_wav, sample_rate=self.samplerate)
        # Keep at most two channels.
        if source_audio.shape[-1] > 2:
            source_audio = source_audio[:, :2]

        stft = self._stft(source_audio)  # L * F * 2
        stft = stft[:, :self.F, :]
        stft_mag = abs(stft)  # L * F * 2
        stft_mag = paddle.to_tensor(stft_mag)
        stft_mag = stft_mag.unsqueeze(0).transpose([0, 3, 2, 1])  # 1 * 2 * F * L
        L = stft.shape[0]

        stft_mag = self._pad_and_partition(stft_mag, self.T)  # [(L + T) / T] * 2 * F * T
        stft_mag = stft_mag.transpose((0, 1, 3, 2))
        # stft_mag : B * 2 * T * F
        B = stft_mag.shape[0]

        masks = []
        for model, name in zip(self.model_list, self.num_instruments):
            mask = model(stft_mag)
            masks.append(mask)
            paddle.save(model.state_dict(), '2stems_%s.pdparams' % name)

        mask_sum = sum([m ** 2 for m in masks])
        mask_sum += 1e-10

        for i in range(len(self.num_instruments)):
            mask = masks[i]
            mask = (mask ** 2 + 1e-10 / 2) / mask_sum
            mask = mask.transpose((0, 1, 3, 2))  # B x 2 x F x T
            mask = paddle.concat(
                paddle.split(mask, mask.shape[0], axis=0), axis=3)
            mask = mask.squeeze(0)[:, :, :L]  # 2 x F x L
            mask = mask.transpose([2, 1, 0])
            # Back to numpy for masking and the inverse STFT.
            mask = mask.detach().numpy()

            stft_masked = stft * mask
            stft_masked = np.pad(
                stft_masked, ((0, 0), (0, 1025), (0, 0)), 'constant')
            wav_masked = self._stft(
                stft_masked, inverse=True, length=source_audio.shape[0])
            save_path = os.path.join(
                output_dir, (wav_name + '-' + self.num_instruments[i] + '.wav'))
            self._save_to_file(save_path, wav_masked, samplerate, 'wav', '128k')
        print('Audio {} separated'.format(wav_name))
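

# Minimal usage sketch, assuming a checkpoint directory that contains
# 2stems_vocals.pdparams and 2stems_accompaniment.pdparams. The paths and the
# parameter values below (T, F, frame_length, frame_step, sample_rate) are the
# usual Spleeter 2-stems settings and are assumptions for this example, not
# values defined elsewhere in this file.
if __name__ == '__main__':
    params = {
        'num_instruments': ['vocals', 'accompaniment'],
        'output_dir': './output',
        'checkpoint_path': './checkpoints',
        'use_elu': False,
        'T': 512,               # frames per model input chunk
        'F': 1024,              # frequency bins kept from the STFT
        'frame_length': 4096,   # STFT window size
        'frame_step': 1024,     # STFT hop size
        'sample_rate': 44100,
    }
    separator = Separator(params)
    # Writes <name>-vocals.wav and <name>-accompaniment.wav into output_dir.
    separator.separate('example.wav')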