kevinwang676 committed on
Commit
6ca7e3b
·
1 Parent(s): 7d16da4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -0
app.py CHANGED
@@ -11,6 +11,42 @@ import torch
11
  import pytorch_seed
12
  import time
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  from xml.sax import saxutils
16
  from bark.api import generate_with_settings
@@ -30,6 +66,221 @@ from swap_voice import swap_voice_from_audio
30
  from training.training_prepare import prepare_semantics_from_text, prepare_wavs_from_semantics
31
  from training.train import training_prepare_files, train
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  settings = Settings('config.yaml')
34
 
35
  def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, batchcount, progress=gr.Progress(track_tqdm=True)):
@@ -353,6 +604,36 @@ while run_server:
353
  with gr.Row():
354
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  with gr.Tab("🔮 - Voice Conversion"):
357
  with gr.Row():
358
  swap_audio_filename = gr.Audio(label="Input audio.wav to swap voice", source="upload", type="filepath")
 
11
  import pytorch_seed
12
  import time
13
 
14
+ import math
15
+ import tempfile
16
+ from typing import Optional, Tuple, Union
17
+
18
+
19
+ import matplotlib.pyplot as plt
20
+ from loguru import logger
21
+ from PIL import Image
22
+ from torch import Tensor
23
+ from torchaudio.backend.common import AudioMetaData
24
+
25
+ from df import config
26
+ from df.enhance import enhance, init_df, load_audio, save_audio
27
+ from df.io import resample
28
+
29
# --- Denoiser setup -------------------------------------------------------
# Run the DeepFilterNet model on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, df, _ = init_df("./DeepFilterNet2", config_allow_defaults=True)
model = model.to(device=device)
model = model.eval()

# Module-level figures/axes for the noisy and the enhanced signal.
fig_noisy: plt.Figure
ax_noisy: plt.Axes
fig_noisy, ax_noisy = plt.subplots(figsize=(15.2, 4))
fig_noisy.set_tight_layout(True)

fig_enh: plt.Figure
ax_enh: plt.Axes
fig_enh, ax_enh = plt.subplots(figsize=(15.2, 4))
fig_enh.set_tight_layout(True)

# Background-noise choices offered in the UI, mapped to their sample files
# ("None" disables noise mixing).
NOISES = {
    "None": None,
    "Kitchen": "samples/dkitchen.wav",
    "Living Room": "samples/dliving.wav",
    "River": "samples/nriver.wav",
    "Cafe": "samples/scafe.wav",
}
49
+
50
 
51
  from xml.sax import saxutils
52
  from bark.api import generate_with_settings
 
66
  from training.training_prepare import prepare_semantics_from_text, prepare_wavs_from_semantics
67
  from training.train import training_prepare_files, train
68
 
69
+
70
+ # Denoise
71
+
72
def mix_at_snr(clean, noise, snr, eps=1e-10):
    """Mix a clean signal with noise at a given signal-to-noise ratio.

    Both inputs are averaged over dim 0 (keeping that dim), so they are
    treated as channels-first ``[C, T]`` and reduced to ``[1, T]``. Noise
    shorter than the clean signal is tiled; noise that is longer is cropped
    at a random offset so both have the same length.

    Args:
        clean: Array-like/Tensor ``[C, T]`` with the clean signal.
        noise: Array-like/Tensor ``[C, T]`` with the noise signal.
        snr: Target signal-to-noise ratio in dB.
        eps: Small constant guarding the energy ratio against division by
            zero and ``sqrt(0)``.

    Returns:
        Tuple ``(clean, noise, mixture)`` of ``[1, T]`` tensors: the mono
        clean signal, the cropped and rescaled noise, and their sum. If the
        mixture would exceed an absolute value of 1, all three are divided by
        the mixture's peak.

    Raises:
        AssertionError: If the mixture contains non-finite values.
    """
    clean = torch.as_tensor(clean).mean(0, keepdim=True)
    noise = torch.as_tensor(noise).mean(0, keepdim=True)
    n_clean = clean.shape[1]
    if noise.shape[1] < n_clean:
        # Tile the noise until it is at least as long as the clean signal.
        noise = noise.repeat((1, int(math.ceil(n_clean / noise.shape[1]))))
    max_start = int(noise.shape[1] - n_clean)
    start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
    logger.debug(f"start: {start}, {clean.shape}")
    noise = noise[:, start : start + n_clean]
    E_speech = torch.mean(clean.pow(2)) + eps
    E_noise = torch.mean(noise.pow(2))
    # Dividing the noise by K leaves it with energy E_speech / 10^(snr / 10).
    K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
    noise = noise / K
    mixture = clean + noise
    # Fixed: this message was missing the f-prefix and logged the raw braces.
    logger.debug(f"mixture: {mixture.shape}")
    assert torch.isfinite(mixture).all()
    max_m = mixture.abs().max()
    if max_m > 1:
        logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
        clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
    return clean, noise, mixture
103
+
104
+
105
def load_audio_gradio(
    audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
) -> Optional[Tuple[Tensor, AudioMetaData]]:
    """Load audio handed over by Gradio, either as a path or as raw samples.

    Args:
        audio_or_file: ``None``, a file path (the string "none" in any case
            counts as no input), or a ``(sample_rate, samples)`` pair.
        sr: Sample rate the returned audio is converted to.

    Returns:
        ``None`` when there is no input, otherwise ``(audio, meta)``. For raw
        samples, ``meta`` is a placeholder whose ``sample_rate`` is set to the
        rate of the input.
    """
    if audio_or_file is None:
        return None

    if isinstance(audio_or_file, str):
        if audio_or_file.lower() == "none":
            return None
        # First try default format
        audio, meta = load_audio(audio_or_file, sr)
        return audio, meta

    meta = AudioMetaData(-1, -1, -1, -1, "")
    assert isinstance(audio_or_file, (tuple, list))
    meta.sample_rate, samples = audio_or_file
    # Gradio documents the shape as [samples, 2], but that does not always
    # hold; flatten trailing dims and transpose to channels-first.
    samples = samples.reshape(samples.shape[0], -1).T
    # Scale integer PCM to float32 by the integer type's full-scale value.
    if samples.dtype == np.int16:
        full_scale = 1 << 15
    elif samples.dtype == np.int32:
        full_scale = 1 << 31
    else:
        full_scale = None
    if full_scale is not None:
        samples = (samples / full_scale).astype(np.float32)
    audio = resample(torch.from_numpy(samples), meta.sample_rate, sr)
    return audio, meta
127
+
128
+
129
def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: str):
    """Denoise an audio clip and return paths to the noisy and enhanced files.

    Args:
        speech_upl: Path of the audio file to denoise. When ``None`` (and no
            microphone input is given) a bundled sample is used instead.
        noise_type: Key into ``NOISES`` selecting an optional background noise
            to mix in before denoising.
        snr: Signal-to-noise ratio used for the noise mix; converted with
            ``int``.
        mic_input: Path of a microphone recording; when truthy it takes
            precedence over ``speech_upl``.

    Returns:
        Tuple ``(noisy_wav, enhanced_wav)`` of temporary ``.wav`` file paths.
        The files are created with ``delete=False`` and are not removed here.

    Raises:
        KeyError: If ``noise_type`` is not a key of ``NOISES``.
    """
    if mic_input:
        speech_upl = mic_input
    sr = config("sr", 48000, int, section="df")
    logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
    snr = int(snr)
    noise_fn = NOISES[noise_type]
    max_s = 10  # limit to 10 seconds
    if speech_upl is not None:
        sample, meta = load_audio(speech_upl, sr)
        max_len = max_s * sr
        if sample.shape[-1] > max_len:
            # Keep a random 10 s excerpt of longer inputs.
            start = torch.randint(0, sample.shape[-1] - max_len, ()).item()
            sample = sample[..., start : start + max_len]
    else:
        sample, meta = load_audio("samples/p232_013_clean.wav", sr)
        sample = sample[..., : max_s * sr]
    if sample.dim() > 1 and sample.shape[0] > 1:
        assert (
            sample.shape[1] > sample.shape[0]
        ), f"Expecting channels first, but got {sample.shape}"
        sample = sample.mean(dim=0, keepdim=True)
    logger.info(f"Loaded sample with shape {sample.shape}")
    if noise_fn is not None:
        noise, _ = load_audio(noise_fn, sr)  # type: ignore
        logger.info(f"Loaded noise with shape {noise.shape}")
        _, _, sample = mix_at_snr(sample, noise, snr)
    logger.info("Start denoising audio")
    enhanced = enhance(model, df, sample)
    logger.info("Denoising finished")
    # Linear fade-in over the first 0.15 s. The ramp is capped at the clip
    # length; previously clips shorter than the ramp produced a negative size
    # for torch.ones and crashed.
    fade_len = min(int(sr * 0.15), enhanced.shape[1])
    lim = torch.linspace(0.0, 1.0, fade_len).unsqueeze(0)
    lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - fade_len)), dim=1)
    enhanced = enhanced * lim
    if meta.sample_rate != sr:
        # Hand the audio back at the sample rate of the input file.
        enhanced = resample(enhanced, sr, meta.sample_rate)
        sample = resample(sample, sr, meta.sample_rate)
        sr = meta.sample_rate
    # Only the unique file name is needed; close the handle right away so it
    # is not leaked and the path can be reopened for writing.
    with tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False) as tmp:
        noisy_wav = tmp.name
    save_audio(noisy_wav, sample, sr)
    with tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False) as tmp:
        enhanced_wav = tmp.name
    save_audio(enhanced_wav, enhanced, sr)
    logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
    ax_noisy.clear()
    ax_enh.clear()
    return noisy_wav, enhanced_wav
177
+
178
+
179
def specshow(
    spec,
    ax=None,
    title=None,
    xlabel=None,
    ylabel=None,
    sr=48000,
    n_fft=None,
    hop=None,
    t=None,
    f=None,
    vmin=-100,
    vmax=0,
    xlim=None,
    ylim=None,
    cmap="inferno",
):
    """Plot a spectrogram of shape [F, T] with ``pcolormesh``.

    Args:
        spec: Spectrogram values, a torch Tensor or array-like of shape [F, T].
        ax: Axes-like object to draw on; ``pyplot`` is used when ``None``.
        title, xlabel, ylabel: Optional texts; only applied when not ``None``.
        sr: Sample rate used to derive the default time/frequency axes.
        n_fft: FFT size; inferred from F when ``None`` (``2 * F`` for even F,
            ``2 * (F - 1)`` for odd F).
        hop: Hop size in samples; defaults to ``n_fft // 4``.
        t: Time axis values; defaults to frame index * hop / sr (seconds).
        f: Frequency axis values; defaults to the bin frequencies in kHz.
        vmin, vmax, cmap: Passed through to ``pcolormesh``.
        xlim, ylim: Optional axis limits; only applied when not ``None``.

    Returns:
        Whatever ``pcolormesh`` returns.
    """
    if isinstance(spec, torch.Tensor):
        # detach() so tensors that require grad can be converted as well.
        spec_np = spec.detach().cpu().numpy()
    else:
        spec_np = np.asarray(spec)
    if ax is not None:
        set_title = ax.set_title
        set_xlabel = ax.set_xlabel
        set_ylabel = ax.set_ylabel
        set_xlim = ax.set_xlim
        set_ylim = ax.set_ylim
    else:
        ax = plt
        set_title = plt.title
        set_xlabel = plt.xlabel
        set_ylabel = plt.ylabel
        set_xlim = plt.xlim
        set_ylim = plt.ylim
    n_bins = spec_np.shape[0]
    if n_fft is None:
        n_fft = n_bins * 2 if n_bins % 2 == 0 else (n_bins - 1) * 2
    hop = hop or n_fft // 4
    if t is None:
        t = np.arange(0, spec_np.shape[-1]) * hop / sr
    if f is None:
        # Bin k sits at k * (sr / 2) / (n_fft / 2) Hz; report kHz. True division
        # is required here: the former `* sr // 2` floor-divided `k * sr` and
        # dropped 0.5 Hz whenever that product was odd.
        f = np.arange(0, n_bins) * (sr / 2) / (n_fft // 2) / 1000
    im = ax.pcolormesh(
        t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
    )
    if title is not None:
        set_title(title)
    if xlabel is not None:
        set_xlabel(xlabel)
    if ylabel is not None:
        set_ylabel(ylabel)
    if xlim is not None:
        set_xlim(xlim)
    if ylim is not None:
        set_ylim(ylim)
    return im
235
+
236
+
237
def spec_im(
    audio: torch.Tensor,
    figsize=(15, 5),
    colorbar=False,
    colorbar_format=None,
    figure=None,
    labels=True,
    **kwargs,
) -> "Image.Image":
    """Render the log-magnitude spectrogram of ``audio`` as an RGB PIL image.

    Args:
        audio: Audio samples; converted with ``torch.as_tensor``.
        figsize: Size of the figure created when ``figure`` is ``None``.
        colorbar: Whether to add a colorbar to the plot.
        colorbar_format: Colorbar tick format. When ``None``, an ``ax`` was
            passed and ``vmin`` or ``vmax`` is set, "%+2.0f dB" is used.
        figure: Existing figure to rasterize; a new one is created, and closed
            again before returning, when ``None``.
        labels: Whether to fill in default axis labels.
        **kwargs: Forwarded to ``specshow``. ``n_fft`` (default 1024) and
            ``hop`` (default 512) also configure the STFT.

    Returns:
        The rendered figure as an RGB image.
    """
    audio = torch.as_tensor(audio)
    if labels:
        kwargs.setdefault("xlabel", "Time [s]")
        # specshow's default frequency axis is in kHz, so label it as such
        # (this used to read "Frequency [Hz]").
        kwargs.setdefault("ylabel", "Frequency [kHz]")
    n_fft = kwargs.setdefault("n_fft", 1024)
    hop = kwargs.setdefault("hop", 512)
    w = torch.hann_window(n_fft, device=audio.device)
    # return_complex=False is deprecated; request the complex STFT directly.
    # Scaling the complex values matches the old scaling of the real view.
    spec = torch.stft(audio, n_fft, hop, window=w, return_complex=True)
    spec = spec / w.pow(2).sum()
    spec = spec.abs().clamp_min(1e-12).log10().mul(10)
    kwargs.setdefault("vmax", max(0.0, spec.max().item()))

    owns_figure = figure is None
    if owns_figure:
        figure = plt.figure(figsize=figsize)
        figure.set_tight_layout(True)
    try:
        if spec.dim() > 2:
            spec = spec.squeeze(0)
        im = specshow(spec, **kwargs)
        if colorbar:
            ckwargs = {}
            if "ax" in kwargs:
                if colorbar_format is None:
                    if kwargs.get("vmin", None) is not None or kwargs.get("vmax", None) is not None:
                        colorbar_format = "%+2.0f dB"
                ckwargs = {"ax": kwargs["ax"]}
            plt.colorbar(im, format=colorbar_format, **ckwargs)
        figure.canvas.draw()
        # NOTE(review): tostring_rgb is deprecated in recent Matplotlib
        # releases; confirm against the pinned version before upgrading.
        return Image.frombytes(
            "RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb()
        )
    finally:
        if owns_figure:
            # pyplot keeps every figure it creates alive until it is closed.
            plt.close(figure)
274
+
275
+
276
def toggle(choice):
    """Return visibility updates for the (microphone, file) input pair.

    Exactly one of the two is made visible: the first when ``choice`` equals
    "mic", the second otherwise. Both values are reset to ``None``.
    """
    use_mic = choice == "mic"
    mic_update = gr.update(visible=use_mic, value=None)
    file_update = gr.update(visible=not use_mic, value=None)
    return mic_update, file_update
281
+
282
+ # Bark
283
+
284
  settings = Settings('config.yaml')
285
 
286
  def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, eos_prob, quick_generation, complete_settings, seed, batchcount, progress=gr.Progress(track_tqdm=True)):
 
604
  with gr.Row():
605
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
606
 
607
+ with gr.Row():
608
+ with gr.Column():
609
+ radio = gr.Radio(
610
+ ["mic", "file"], value="file", label="How would you like to upload your audio?", visible=False
611
+ )
612
+ mic_input = gr.Mic(label="Input", type="filepath", visible=False)
613
+ audio_file = output_audio
614
+ inputs = [
615
+ audio_file,
616
+ gr.Dropdown(
617
+ label="Add background noise",
618
+ choices=list(NOISES.keys()),
619
+ value="None", visible =False,
620
+ ),
621
+ gr.Dropdown(
622
+ label="Noise Level (SNR)",
623
+ choices=["-5", "0", "10", "20"],
624
+ value="0", visible =False,
625
+ ),
626
+ mic_input,
627
+ ]
628
+ btn_denoise = gr.Button("Denoise")
629
+ with gr.Column():
630
+ outputs = [
631
+ gr.Audio(type="filepath", label="Noisy audio"),
632
+ gr.Audio(type="filepath", label="Enhanced audio"),
633
+ ]
634
+ btn_denoise.click(fn=demo_fn, inputs=inputs, outputs=outputs)
635
+ radio.change(toggle, radio, [mic_input, audio_file])
636
+
637
  with gr.Tab("🔮 - Voice Conversion"):
638
  with gr.Row():
639
  swap_audio_filename = gr.Audio(label="Input audio.wav to swap voice", source="upload", type="filepath")