import museval
from tqdm import tqdm
import numpy as np
import torch
import gradio as gr
import data.utils
import model.utils as model_utils
import utils
import soundfile as sf
import argparse
import os
from model.waveunet import Waveunet

# Model hyperparameters (must match the checkpoint being loaded)
features = 32
feature_growth = "double"
output_size = 2
sr = 44100
levels = 6
channels = 2
instruments = ["bass", "drums", "other", "vocals"]
cuda = False


def compute_model_output(model, inputs):
    '''
    Computes the outputs of the model for the given inputs. Does NOT propagate gradients (see compute_loss for training).
    The procedure depends on whether the model uses one network per source or a single shared network.
    :param model: Model to predict with
    :param inputs: Input mixture excerpt
    :return: Model outputs as a dictionary with one entry per source
    '''
    all_outputs = {}
    if model.separate:
        for inst in model.instruments:
            output = model(inputs, inst)
            all_outputs[inst] = output[inst].detach().clone()
    else:
        all_outputs = model(inputs)
    return all_outputs
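
# Illustrative usage sketch (kept as a comment so it does not run at import time).
# It assumes `model` is the Waveunet instance constructed further below in this
# file and feeds it a silent stereo excerpt of the exact length the network expects:
#
#   dummy = torch.zeros(1, channels, model.shapes["input_frames"])
#   with torch.no_grad():
#       estimates = compute_model_output(model, dummy)
#   # estimates -> {"bass": ..., "drums": ..., "other": ..., "vocals": ...}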


def predict(audio, model):
    '''
    Predict sources for a given audio input signal with a given model. The audio is split into chunks,
    predictions are made per chunk, and the chunk outputs are concatenated.
    :param audio: Audio input, either a Pytorch tensor or a numpy array of shape (channels, samples)
    :param model: Pytorch model
    :return: Source predictions, dictionary with source names as keys
    '''
    if isinstance(audio, torch.Tensor):
        is_cuda = audio.is_cuda  # is_cuda is a property, not a method
        audio = audio.detach().cpu().numpy()
        return_mode = "pytorch"
    else:
        return_mode = "numpy"
    expected_outputs = audio.shape[1]

    # Pad input if its length is not divisible by the frame shift
    output_shift = model.shapes["output_frames"]
    pad_back = audio.shape[1] % output_shift
    pad_back = 0 if pad_back == 0 else output_shift - pad_back
    if pad_back > 0:
        audio = np.pad(audio, [(0, 0), (0, pad_back)], mode="constant", constant_values=0.0)

    target_outputs = audio.shape[1]
    outputs = {key: np.zeros(audio.shape, np.float32) for key in model.instruments}

    # Pad mixture across time at the beginning and end so that the network can predict at the borders of the signal
    pad_front_context = model.shapes["output_start_frame"]
    pad_back_context = model.shapes["input_frames"] - model.shapes["output_end_frame"]
    audio = np.pad(audio, [(0, 0), (pad_front_context, pad_back_context)], mode="constant", constant_values=0.0)

    # Iterate over mixture excerpts and fetch the network prediction for each
    with torch.no_grad():
        for target_start_pos in range(0, target_outputs, model.shapes["output_frames"]):
            # Prepare mixture excerpt by selecting a time interval.
            # Since the audio was front-padded, the input [target_start_pos:target_start_pos + input_frames]
            # actually predicts the [target_start_pos:target_start_pos + output_frames] target range.
            curr_input = audio[:, target_start_pos:target_start_pos + model.shapes["input_frames"]]
            # Convert to Pytorch tensor for model prediction
            curr_input = torch.from_numpy(curr_input).unsqueeze(0)
            # Predict
            for key, curr_targets in compute_model_output(model, curr_input).items():
                outputs[key][:, target_start_pos:target_start_pos + model.shapes["output_frames"]] = curr_targets.squeeze(0).cpu().numpy()

    # Crop to the expected length (since we padded to handle the frame shift)
    outputs = {key: outputs[key][:, :expected_outputs] for key in outputs.keys()}
    if return_mode == "pytorch":
        # Convert each source estimate back to a tensor (and to CUDA if the input was on GPU)
        outputs = {key: torch.from_numpy(outputs[key]) for key in outputs.keys()}
        if is_cuda:
            outputs = {key: outputs[key].cuda() for key in outputs.keys()}
    return outputs
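
# Chunking arithmetic sketch (comment only; the concrete numbers are assumptions for
# illustration, the real values come from model.shapes):
#
#   n = audio.shape[1]                             # e.g. 10 s at 44.1 kHz -> 441000 samples
#   output_shift = model.shapes["output_frames"]   # samples produced per forward pass
#   pad_back = (-n) % output_shift                 # same result as the branch inside predict()
#
# After the additional front/back context padding, every window
# audio[:, t : t + input_frames] yields estimates exactly for [t : t + output_frames].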


def predict_song(audio_path):
    '''
    Predicts sources for the audio file at the given path, using the globally loaded model.
    Takes care of resampling the input audio to the model's sampling rate and resampling the predictions back to the input sampling rate.
    :param audio_path: Path to the mixture audio file
    :return: Path to the written .wav file containing the predicted vocals
    '''
    sr = 44100  # model sampling rate
    model.eval()

    # Load mixture in its original sampling rate
    mix_audio, mix_sr = data.utils.load(audio_path, sr=None, mono=False)
    mix_channels = mix_audio.shape[0]
    mix_len = mix_audio.shape[1]

    # Adapt mixture channels to the required input channels
    if channels == 1:
        mix_audio = np.mean(mix_audio, axis=0, keepdims=True)
    else:
        if mix_channels == 1:  # Duplicate channels if the input is mono but the model is stereo
            mix_audio = np.tile(mix_audio, [channels, 1])
        else:
            assert mix_channels == channels

    # Resample to the model sampling rate
    mix_audio = data.utils.resample(mix_audio, mix_sr, sr)

    sources = predict(mix_audio, model)

    # Resample back to the mixture sampling rate in case the model runs at a different rate
    sources = {key: data.utils.resample(sources[key], sr, mix_sr) for key in sources.keys()}

    # In case we had to pad the mixture at the end, or we have a few samples too many due to inconsistent down- and upsampling, fix the length of each source prediction now
    for key in sources.keys():
        diff = sources[key].shape[1] - mix_len
        if diff > 0:
            print("WARNING: Cropping " + str(diff) + " samples")
            sources[key] = sources[key][:, :-diff]
        elif diff < 0:
            print("WARNING: Padding output by " + str(-diff) + " samples")
            sources[key] = np.pad(sources[key], [(0, 0), (0, -diff)], mode="constant", constant_values=0.0)

        # Adapt channels
        if mix_channels > channels:
            assert channels == 1
            # Duplicate mono predictions
            sources[key] = np.tile(sources[key], [mix_channels, 1])
        elif mix_channels < channels:
            assert mix_channels == 1
            # Reduce model output to mono
            sources[key] = np.mean(sources[key], axis=0, keepdims=True)

        sources[key] = np.asfortranarray(sources[key])  # So librosa does not complain if we want to save it

    # Write only the vocal estimate, at the original mixture sampling rate
    data.utils.write_wav("test.wav", sources['vocals'], mix_sr)
    return "test.wav"


# Load the model
num_features = [features * i for i in range(1, levels + 1)] if feature_growth == "add" else \
               [features * 2 ** i for i in range(0, levels)]
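# For the values above (features=32, levels=6, feature_growth="double") this yields
# num_features = [32, 64, 128, 256, 512, 1024], i.e. the channel count doubles at
# every level of the Wave-U-Net encoder.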
target_outputs = int(output_size * sr)
model = Waveunet(channels, num_features, channels, instruments, kernel_size=5,
                 target_output_size=target_outputs, depth=1, strides=4,
                 conv_type="gn", res="fixed", separate=1)
load_model = 'checkpoints/waveunet/model'
state = model_utils.load_model(model, None, load_model, cuda=cuda)

# Title, description and article strings for the demo
title = "Denoise Audio"
description = "Using Wave-U-Net to denoise audio."
article = "Based on [Wave-U-Net-Pytorch](https://github.com/f90/Wave-U-Net-Pytorch) by Daniel Stoller."

# Create the Gradio demo
demo = gr.Interface(fn=predict_song,                     # mapping function from input to output
                    inputs=gr.Audio(type="filepath"),    # the uploaded/recorded mixture, passed as a file path
                    outputs=gr.File(file_types=[".wav"]),  # predict_song returns a single .wav file path
                    title=title,
                    description=description,
                    article=article)

# Launch the demo!
demo.launch()
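# To get a publicly shareable link when running locally, Gradio also supports
# demo.launch(share=True); on Hugging Face Spaces the app is already served
# publicly, so the default launch() is enough.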