import museval
from tqdm import tqdm
import numpy as np
import torch
import gradio as gr
import data.utils
import model.utils as model_utils
import utils
import soundfile as sf
import argparse
import os
from model.waveunet import Waveunet

# Model hyperparameters (must match the checkpoint being loaded)
features = 32
feature_growth = "double"
output_size = 2
sr = 44100
levels = 6
channels = 2
instruments = ["bass", "drums", "other", "vocals"]
cuda = False


def compute_model_output(model, inputs):
    '''
    Computes the outputs of the model for the given inputs. Does NOT propagate gradients (see compute_loss for training).
    The procedure depends on whether the model uses one network per source or a single shared network.
    :param model: Model to predict with
    :param inputs: Input mixture excerpt
    :return: Model outputs as a dictionary with one entry per source
    '''
    all_outputs = {}
    if model.separate:
        for inst in model.instruments:
            output = model(inputs, inst)
            all_outputs[inst] = output[inst].detach().clone()
    else:
        all_outputs = model(inputs)
    return all_outputs
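
# Illustrative usage sketch (kept as a comment so it does not run at import time).
# It assumes `model` is the Waveunet instance constructed further below in this
# file and feeds it a silent stereo excerpt of the exact length the network expects:
#
#   dummy = torch.zeros(1, channels, model.shapes["input_frames"])
#   with torch.no_grad():
#       estimates = compute_model_output(model, dummy)
#   # estimates -> {"bass": ..., "drums": ..., "other": ..., "vocals": ...}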


def predict(audio, model):
    '''
    Predict sources for a given audio input signal with a given model. The audio is split into chunks,
    predictions are made per chunk, and the chunk outputs are concatenated.
    :param audio: Audio input, either a Pytorch tensor or a numpy array of shape (channels, samples)
    :param model: Pytorch model
    :return: Source predictions, dictionary with source names as keys
    '''
    if isinstance(audio, torch.Tensor):
        is_cuda = audio.is_cuda  # is_cuda is a property, not a method
        audio = audio.detach().cpu().numpy()
        return_mode = "pytorch"
    else:
        return_mode = "numpy"
    expected_outputs = audio.shape[1]

    # Pad input if its length is not divisible by the frame shift
    output_shift = model.shapes["output_frames"]
    pad_back = audio.shape[1] % output_shift
    pad_back = 0 if pad_back == 0 else output_shift - pad_back
    if pad_back > 0:
        audio = np.pad(audio, [(0, 0), (0, pad_back)], mode="constant", constant_values=0.0)

    target_outputs = audio.shape[1]
    outputs = {key: np.zeros(audio.shape, np.float32) for key in model.instruments}

    # Pad mixture across time at the beginning and end so that the network can predict at the borders of the signal
    pad_front_context = model.shapes["output_start_frame"]
    pad_back_context = model.shapes["input_frames"] - model.shapes["output_end_frame"]
    audio = np.pad(audio, [(0, 0), (pad_front_context, pad_back_context)], mode="constant", constant_values=0.0)

    # Iterate over mixture excerpts and fetch the network prediction for each
    with torch.no_grad():
        for target_start_pos in range(0, target_outputs, model.shapes["output_frames"]):
            # Prepare mixture excerpt by selecting a time interval.
            # Since the audio was front-padded, the input [target_start_pos:target_start_pos + input_frames]
            # actually predicts the [target_start_pos:target_start_pos + output_frames] target range.
            curr_input = audio[:, target_start_pos:target_start_pos + model.shapes["input_frames"]]
            # Convert to Pytorch tensor for model prediction
            curr_input = torch.from_numpy(curr_input).unsqueeze(0)
            # Predict
            for key, curr_targets in compute_model_output(model, curr_input).items():
                outputs[key][:, target_start_pos:target_start_pos + model.shapes["output_frames"]] = curr_targets.squeeze(0).cpu().numpy()

    # Crop to the expected length (since we padded to handle the frame shift)
    outputs = {key: outputs[key][:, :expected_outputs] for key in outputs.keys()}
    if return_mode == "pytorch":
        # Convert each source estimate back to a tensor (and to CUDA if the input was on GPU)
        outputs = {key: torch.from_numpy(outputs[key]) for key in outputs.keys()}
        if is_cuda:
            outputs = {key: outputs[key].cuda() for key in outputs.keys()}
    return outputs
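
# Chunking arithmetic sketch (comment only; the concrete numbers are assumptions for
# illustration, the real values come from model.shapes):
#
#   n = audio.shape[1]                             # e.g. 10 s at 44.1 kHz -> 441000 samples
#   output_shift = model.shapes["output_frames"]   # samples produced per forward pass
#   pad_back = (-n) % output_shift                 # same result as the branch inside predict()
#
# After the additional front/back context padding, every window
# audio[:, t : t + input_frames] yields estimates exactly for [t : t + output_frames].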


def predict_song(audio_path):
    '''
    Predicts sources for the audio file at the given path, using the globally loaded model.
    Takes care of resampling the input audio to the model's sampling rate and resampling the predictions back to the input sampling rate.
    :param audio_path: Path to the mixture audio file
    :return: Path to the written .wav file containing the predicted vocals
    '''
    sr = 44100  # model sampling rate
    model.eval()

    # Load mixture in its original sampling rate
    mix_audio, mix_sr = data.utils.load(audio_path, sr=None, mono=False)
    mix_channels = mix_audio.shape[0]
    mix_len = mix_audio.shape[1]

    # Adapt mixture channels to the required input channels
    if channels == 1:
        mix_audio = np.mean(mix_audio, axis=0, keepdims=True)
    else:
        if mix_channels == 1:  # Duplicate channels if the input is mono but the model is stereo
            mix_audio = np.tile(mix_audio, [channels, 1])
        else:
            assert mix_channels == channels

    # Resample to the model sampling rate
    mix_audio = data.utils.resample(mix_audio, mix_sr, sr)

    sources = predict(mix_audio, model)

    # Resample back to the mixture sampling rate in case the model runs at a different rate
    sources = {key: data.utils.resample(sources[key], sr, mix_sr) for key in sources.keys()}

    # In case we had to pad the mixture at the end, or we have a few samples too many due to inconsistent down- and upsampling, fix the length of each source prediction now
    for key in sources.keys():
        diff = sources[key].shape[1] - mix_len
        if diff > 0:
            print("WARNING: Cropping " + str(diff) + " samples")
            sources[key] = sources[key][:, :-diff]
        elif diff < 0:
            print("WARNING: Padding output by " + str(-diff) + " samples")
            sources[key] = np.pad(sources[key], [(0, 0), (0, -diff)], mode="constant", constant_values=0.0)

        # Adapt channels
        if mix_channels > channels:
            assert channels == 1
            # Duplicate mono predictions
            sources[key] = np.tile(sources[key], [mix_channels, 1])
        elif mix_channels < channels:
            assert mix_channels == 1
            # Reduce model output to mono
            sources[key] = np.mean(sources[key], axis=0, keepdims=True)

        sources[key] = np.asfortranarray(sources[key])  # So librosa does not complain if we want to save it

    # Write only the vocal estimate, at the original mixture sampling rate
    data.utils.write_wav("test.wav", sources['vocals'], mix_sr)
    return "test.wav"


# Load the model
num_features = [features * i for i in range(1, levels + 1)] if feature_growth == "add" else \
               [features * 2 ** i for i in range(0, levels)]
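# For the values above (features=32, levels=6, feature_growth="double") this yields
# num_features = [32, 64, 128, 256, 512, 1024], i.e. the channel count doubles at
# every level of the Wave-U-Net encoder.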
target_outputs = int(output_size * sr)
model = Waveunet(channels, num_features, channels, instruments, kernel_size=5,
                 target_output_size=target_outputs, depth=1, strides=4,
                 conv_type="gn", res="fixed", separate=1)
load_model = 'checkpoints/waveunet/model'
state = model_utils.load_model(model, None, load_model, cuda=cuda)

# Title, description and article strings for the demo
title = "Denoise Audio"
description = "Using Wave-U-Net to denoise audio."
article = "Based on [Wave-U-Net-Pytorch](https://github.com/f90/Wave-U-Net-Pytorch) by Daniel Stoller."

# Create the Gradio demo
demo = gr.Interface(fn=predict_song,                     # mapping function from input to output
                    inputs=gr.Audio(type="filepath"),    # the uploaded/recorded mixture, passed as a file path
                    outputs=gr.File(file_types=[".wav"]),  # predict_song returns a single .wav file path
                    title=title,
                    description=description,
                    article=article)

# Launch the demo!
demo.launch()
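# To get a publicly shareable link when running locally, Gradio also supports
# demo.launch(share=True); on Hugging Face Spaces the app is already served
# publicly, so the default launch() is enough.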