hieupt committed on
Commit a898647 · verified · 1 Parent(s): 9d4b1c6

Upload app.py

Files changed (1)
  1. app.py +183 -0
app.py ADDED
import museval
from tqdm import tqdm

import numpy as np
import torch
import gradio as gr  # needed for the gr.Interface / gr.Audio / gr.File calls below

import data.utils
import model.utils as model_utils
import utils
import soundfile as sf
import argparse
import os
from model.waveunet import Waveunet

features = 32
feature_growth = "double"
output_size = 2
sr = 44100
levels = 6
channels = 2
instruments = ["bass", "drums", "other", "vocals"]
cuda = False

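# Note: these constants define the Wave-U-Net configuration (sampling rate, number of levels,
# channel count and source names) and presumably need to match the checkpoint that is loaded
# further below.
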
def compute_model_output(model, inputs):
    '''
    Computes outputs of model with given inputs. Does NOT allow propagating gradients! See compute_loss for training.
    Procedure depends on whether we have one model for each source or not.
    :param model: Model to predict with
    :param inputs: Input mixture excerpt (batch, channels, frames)
    :return: Model outputs as a dictionary with one entry per source
    '''
    all_outputs = {}

    if model.separate:
        # One sub-network per source: query the model once per instrument
        for inst in model.instruments:
            output = model(inputs, inst)
            all_outputs[inst] = output[inst].detach().clone()
    else:
        # Joint model predicting all sources at once
        all_outputs = model(inputs)

    return all_outputs

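# For a model with separate per-source sub-networks, compute_model_output returns a dictionary
# keyed by instrument name, e.g. {"bass": ..., "drums": ..., "other": ..., "vocals": ...},
# where each value is a tensor covering the model's output window.
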
def predict(audio, model):
    '''
    Predict sources for a given audio input signal, with a given model. Audio is split into chunks to make predictions on each chunk before they are concatenated.
    :param audio: Audio input tensor, either Pytorch tensor or numpy array
    :param model: Pytorch model
    :return: Source predictions, dictionary with source names as keys
    '''
    if isinstance(audio, torch.Tensor):
        is_cuda = audio.is_cuda
        audio = audio.detach().cpu().numpy()
        return_mode = "pytorch"
    else:
        return_mode = "numpy"

    expected_outputs = audio.shape[1]

    # Pad input if it is not divisible in length by the frame shift number
    output_shift = model.shapes["output_frames"]
    pad_back = audio.shape[1] % output_shift
    pad_back = 0 if pad_back == 0 else output_shift - pad_back
    if pad_back > 0:
        audio = np.pad(audio, [(0, 0), (0, pad_back)], mode="constant", constant_values=0.0)

    target_outputs = audio.shape[1]
    outputs = {key: np.zeros(audio.shape, np.float32) for key in model.instruments}

    # Pad mixture across time at beginning and end so that the network can make predictions at the beginning and end of the signal
    pad_front_context = model.shapes["output_start_frame"]
    pad_back_context = model.shapes["input_frames"] - model.shapes["output_end_frame"]
    audio = np.pad(audio, [(0, 0), (pad_front_context, pad_back_context)], mode="constant", constant_values=0.0)

    # Iterate over mixture chunks, fetch network prediction for each
    with torch.no_grad():
        for target_start_pos in range(0, target_outputs, model.shapes["output_frames"]):
            # Prepare mixture excerpt by selecting time interval.
            # Since audio was front-padded, the input [target_start_pos:target_start_pos + input_frames]
            # actually predicts the [target_start_pos:target_start_pos + output_frames] target range.
            curr_input = audio[:, target_start_pos:target_start_pos + model.shapes["input_frames"]]

            # Convert to Pytorch tensor for model prediction
            curr_input = torch.from_numpy(curr_input).unsqueeze(0)

            # Predict
            for key, curr_targets in compute_model_output(model, curr_input).items():
                outputs[key][:, target_start_pos:target_start_pos + model.shapes["output_frames"]] = curr_targets.squeeze(0).cpu().numpy()

    # Crop to expected length (since we padded to handle the frame shift)
    outputs = {key: outputs[key][:, :expected_outputs] for key in outputs.keys()}

    if return_mode == "pytorch":
        # Convert each source estimate back to a Pytorch tensor (outputs is a dictionary, not a single array)
        outputs = {key: torch.from_numpy(outputs[key]) for key in outputs.keys()}
        if is_cuda:
            outputs = {key: outputs[key].cuda() for key in outputs.keys()}
    return outputs

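# Illustrative usage sketch (commented out; assumes the `model` constructed at the bottom of
# this file has already been loaded):
#   dummy_mix = np.zeros((channels, 10 * sr), dtype=np.float32)  # 10 s of stereo silence
#   estimates = predict(dummy_mix, model)
#   # estimates["vocals"].shape == (channels, 10 * sr)
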
def predict_song(audio_path):
    '''
    Predicts sources for an audio file whose file path is given, using the globally loaded model.
    Takes care of resampling the input audio to the model's sampling rate and resampling predictions back to the input sampling rate.
    :param audio_path: Path to mixture audio file
    :return: Path to a wav file containing the estimated vocals
    '''
    sr = 44100
    model.eval()

    # Load mixture in original sampling rate
    mix_audio, mix_sr = data.utils.load(audio_path, sr=None, mono=False)
    mix_channels = mix_audio.shape[0]
    mix_len = mix_audio.shape[1]

    # Adapt mixture channels to required input channels
    if channels == 1:
        mix_audio = np.mean(mix_audio, axis=0, keepdims=True)
    else:
        if mix_channels == 1:  # Duplicate channels if input is mono but model is stereo
            mix_audio = np.tile(mix_audio, [channels, 1])
        else:
            assert mix_channels == channels

    # Resample to model sampling rate
    mix_audio = data.utils.resample(mix_audio, mix_sr, sr)

    sources = predict(mix_audio, model)

    # Resample back to mixture sampling rate in case the model ran at a different sampling rate
    sources = {key: data.utils.resample(sources[key], sr, mix_sr) for key in sources.keys()}

    # In case we had to pad the mixture at the end, or we have a few samples too many due to inconsistent down- and upsampling, remove those samples from the source predictions now
    for key in sources.keys():
        diff = sources[key].shape[1] - mix_len
        if diff > 0:
            print("WARNING: Cropping " + str(diff) + " samples")
            sources[key] = sources[key][:, :-diff]
        elif diff < 0:
            print("WARNING: Padding output by " + str(-diff) + " samples")
            sources[key] = np.pad(sources[key], [(0, 0), (0, -diff)], mode="constant", constant_values=0.0)

        # Adapt channels
        if mix_channels > channels:
            assert channels == 1
            # Duplicate mono predictions
            sources[key] = np.tile(sources[key], [mix_channels, 1])
        elif mix_channels < channels:
            assert mix_channels == 1
            # Reduce model output to mono
            sources[key] = np.mean(sources[key], axis=0, keepdims=True)

        sources[key] = np.asfortranarray(sources[key])  # So librosa does not complain if we want to save it

    # Write only the vocal estimate and hand its path back to the Gradio interface
    data.utils.write_wav("test.wav", sources['vocals'], sr)
    return "test.wav"

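# Note: inside predict_song, `sources` maps every instrument name ("bass", "drums", "other",
# "vocals") to a (channels, num_samples) numpy array, but only the vocal estimate is written
# out and returned.
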
# load model
num_features = [features * i for i in range(1, levels + 1)] if feature_growth == "add" else \
               [features * 2 ** i for i in range(0, levels)]
target_outputs = int(output_size * sr)
model = Waveunet(channels, num_features, channels, instruments, kernel_size=5,
                 target_output_size=target_outputs, depth=1, strides=4,
                 conv_type="gn", res="fixed", separate=1)
load_model = 'checkpoints/waveunet/model'
state = model_utils.load_model(model, None, load_model, cuda=0)

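# Worked example: with features=32, levels=6 and feature_growth="double", num_features is
# [32, 64, 128, 256, 512, 1024]; with feature_growth="add" it would be [32, 64, 96, 128, 160, 192].
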
# Create title, description and article strings
title = "Denoise Audio"
description = "Using Wave-U-Net to denoise audio"
article = "Built on the GitHub repository [Wave-U-Net-Pytorch](https://github.com/f90/Wave-U-Net-Pytorch)."

# Create the Gradio demo
demo = gr.Interface(fn=predict_song,                                           # mapping function from input to output
                    inputs=gr.Audio(type="filepath"),                          # the mixture audio file, passed as a file path
                    outputs=gr.File(file_count="multiple", file_types=[".wav"]),  # predict_song returns the path of the separated-vocals wav file
                    title=title,
                    description=description,
                    article=article)

# Launch the demo!
demo.launch()  # generate a publicly shareable URL?
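# If a publicly shareable URL is wanted when running locally, Gradio supports demo.launch(share=True).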