Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
import os

import gradio as gr
import museval
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm

import data.utils
import model.utils as model_utils
import utils
from model.waveunet import Waveunet
|
14 |
+
|
15 |
+
# Wave-U-Net settings used when the model is constructed further down in this file.
features = 32  # base feature count the per-level feature list is derived from
feature_growth = "double"  # "add": features * level, anything else: features * 2**level
output_size = 2  # multiplied by sr to obtain the target output size
sr = 44100  # sampling rate used together with output_size
levels = 6  # number of entries in the per-level feature list
channels = 2  # channel count the mixture is adapted to before prediction
instruments = ["bass", "drums", "other", "vocals"]  # source names handed to the model
# This used to be the string "false", which is truthy in Python and would have
# switched on any `if cuda:` check; a real boolean expresses "no GPU" correctly.
cuda = False
|
23 |
+
|
24 |
+
def compute_model_output(model, inputs):
    '''
    Computes the outputs of the model for the given inputs. Does NOT allow propagating gradients:
    the forward passes run without gradient tracking and every returned tensor is detached.
    If the model keeps the sources separate (model.separate), one forward pass is made per
    instrument; otherwise a single forward pass has to deliver all sources at once.
    :param model: Callable model with the attributes "separate" and "instruments"
    :param inputs: Input that is handed to the model unchanged
    :return: Dictionary with source names as keys and detached output tensors as values
    '''
    with torch.no_grad():
        if model.separate:
            all_outputs = {}
            for inst in model.instruments:
                # The model is called once per instrument and answers with a dictionary
                # from which only that instrument's entry is kept (as an independent copy)
                all_outputs[inst] = model(inputs, inst)[inst].detach().clone()
            return all_outputs

        # Joint case: detach here as well so both branches honour the no-gradient contract
        return {inst: output.detach() for inst, output in model(inputs).items()}
|
42 |
+
|
43 |
+
def predict(audio, model):
    '''
    Predict sources for a given audio input signal, with a given model. Audio is split into chunks
    to make predictions on each chunk before they are concatenated.
    :param audio: Two-dimensional audio input, either Pytorch tensor or numpy array; chunking,
                  padding and cropping all happen along the second axis
    :param model: Pytorch model providing "instruments" and the "shapes" entries "input_frames",
                  "output_frames", "output_start_frame" and "output_end_frame"
    :return: Source predictions, dictionary with source names as keys. The values are numpy arrays
             for numpy input and Pytorch tensors (on the device of the input) for tensor input
    '''
    return_torch = isinstance(audio, torch.Tensor)
    if return_torch:
        # Remember the device so the predictions can be handed back where the input came from.
        # (is_cuda is a property, calling it like a method raised a TypeError before.)
        device = audio.device
        audio = audio.detach().cpu().numpy()

    expected_outputs = audio.shape[1]
    input_frames = model.shapes["input_frames"]
    output_frames = model.shapes["output_frames"]

    # Pad input if it is not divisible in length by the frame shift number
    pad_back = (-expected_outputs) % output_frames
    if pad_back > 0:
        audio = np.pad(audio, [(0, 0), (0, pad_back)], mode="constant", constant_values=0.0)

    target_outputs = audio.shape[1]
    outputs = {key: np.zeros(audio.shape, np.float32) for key in model.instruments}

    # Pad mixture across time at beginning and end so that the network can make predictions
    # at the beginning and end of the signal
    pad_front_context = model.shapes["output_start_frame"]
    pad_back_context = input_frames - model.shapes["output_end_frame"]
    audio = np.pad(audio, [(0, 0), (pad_front_context, pad_back_context)],
                   mode="constant", constant_values=0.0)

    # Iterate over the mixture chunk by chunk and fetch the network prediction
    with torch.no_grad():
        for target_start_pos in range(0, target_outputs, output_frames):
            target_end_pos = target_start_pos + output_frames

            # Since audio was front-padded, the input [pos:pos+input_frames] actually predicts
            # the target range [pos:pos+output_frames]
            curr_input = audio[:, target_start_pos:target_start_pos + input_frames]

            # Convert to Pytorch tensor with a leading batch axis for model prediction
            curr_input = torch.from_numpy(curr_input).unsqueeze(0)

            # Predict and store each source at its position in the output buffer
            for key, curr_targets in compute_model_output(model, curr_input).items():
                outputs[key][:, target_start_pos:target_end_pos] = curr_targets.squeeze(0).cpu().numpy()

    # Crop to expected length (since we padded to handle the frame shift)
    outputs = {key: value[:, :expected_outputs] for key, value in outputs.items()}

    if return_torch:
        # Convert every source on its own; torch.from_numpy does not accept a dictionary
        outputs = {key: torch.from_numpy(value).to(device) for key, value in outputs.items()}
    return outputs
|
95 |
+
|
96 |
+
def predict_song(audio_path):
    '''
    Predicts sources for an audio file for which the file path is given, using the module-level
    model, and writes the vocals estimate to "test.wav".
    Takes care of resampling the input audio to the models sampling rate and resampling
    predictions back to input sampling rate.
    :param audio_path: Path to mixture audio file
    :return: Path of the written vocals file ("test.wav")
    '''
    sr = 44100  # Sampling rate the mixture is resampled to before prediction
    model.eval()

    # Load mixture in original sampling rate
    mix_audio, mix_sr = data.utils.load(audio_path, sr=None, mono=False)
    mix_channels = mix_audio.shape[0]
    mix_len = mix_audio.shape[1]

    # Adapt mixture channels to required input channels
    if channels == 1:
        mix_audio = np.mean(mix_audio, axis=0, keepdims=True)
    elif mix_channels == 1:
        # Duplicate channels if input is mono but model is stereo
        mix_audio = np.tile(mix_audio, [channels, 1])
    else:
        assert mix_channels == channels, "Mixture channel count does not match the model"

    # Resample to model sampling rate
    mix_audio = data.utils.resample(mix_audio, mix_sr, sr)

    sources = predict(mix_audio, model)

    # Resample back to mixture sampling rate in case we had model on different sampling rate
    sources = {key: data.utils.resample(source, sr, mix_sr) for key, source in sources.items()}

    for key in sources:
        # In case we had to pad the mixture at the end, or we have a few samples too many due to
        # inconsistent down- and upsampling, fix the length of the source prediction now
        diff = sources[key].shape[1] - mix_len
        if diff > 0:
            print("WARNING: Cropping " + str(diff) + " samples")
            sources[key] = sources[key][:, :-diff]
        elif diff < 0:
            # diff is negative here, report the number of samples that are actually added
            print("WARNING: Padding output by " + str(-diff) + " samples")
            sources[key] = np.pad(sources[key], [(0, 0), (0, -diff)], "constant", 0.0)

        # Adapt channels
        if mix_channels > channels:
            assert channels == 1
            # Duplicate mono predictions
            sources[key] = np.tile(sources[key], [mix_channels, 1])
        elif mix_channels < channels:
            assert mix_channels == 1
            # Reduce model output to mono
            sources[key] = np.mean(sources[key], axis=0, keepdims=True)

        # So librosa does not complain if we want to save it
        sources[key] = np.asfortranarray(sources[key])

    # The predictions were resampled back to the mixture rate above, so the file has to be
    # written with mix_sr; writing it with the model rate changes speed and pitch whenever
    # the input was not recorded at 44100 Hz
    data.utils.write_wav("test.wav", sources['vocals'], mix_sr)
    return "test.wav"
|
158 |
+
|
159 |
+
# Build the Wave-U-Net from the settings above and load the checkpoint into it
if feature_growth == "add":
    # Linear growth: features, 2*features, ..., levels*features
    num_features = [features * i for i in range(1, levels + 1)]
else:
    # Doubling growth: features, 2*features, 4*features, ...
    num_features = [features * 2 ** i for i in range(0, levels)]

target_outputs = int(output_size * sr)

model = Waveunet(
    channels,
    num_features,
    channels,
    instruments,
    kernel_size=5,
    target_output_size=target_outputs,
    depth=1,
    strides=4,
    conv_type="gn",
    res="fixed",
    separate=1,
)

# Checkpoint location; no optimizer is passed (None) and cuda=0 is forwarded to the loader
load_model = 'checkpoints/waveunet/model'
state = model_utils.load_model(model, None, load_model, cuda=0)
|
168 |
+
|
169 |
+
# Texts shown by the Gradio interface
title = "Denoise Audio"
description = "Using Wave-u-net to Denoise Audio"
article = "Created at github [Wave-U-Net-Pytorch](https://github.com/f90/Wave-U-Net-Pytorch)."

# Input: an audio upload that reaches predict_song as a file path
_audio_input = gr.Audio(type="filepath")
# Output: file component for the single wav path that predict_song returns
_file_output = gr.File(file_count="multiple", file_types=[".wav"])

# Create the Gradio demo around predict_song
demo = gr.Interface(
    fn=predict_song,
    inputs=_audio_input,
    outputs=_file_output,
    title=title,
    description=description,
    article=article,
)

# Launch the demo!
demo.launch()
|