import math

import gradio
import gradio.inputs
import gradio.outputs
import torch
from df import config
from df.enhance import enhance, init_df, load_audio, save_audio

# Load the DeepFilterNet model once at import time and move it to the GPU
# when one is available; every request handled by `mix_and_denoise` reuses it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, df, _ = init_df()
model = model.to(device=device).eval()


def mix_at_snr(clean, noise, snr, eps=1e-10):
    """Mix a clean and a noise signal at a given SNR.

    Both inputs are converted to tensors and averaged over dim 0 (keeping
    that dim), and dim 1 is treated as the time axis. They therefore must be
    2D; presumably ``(channels, samples)`` -- confirm against ``load_audio``.

    If the noise is shorter than the clean signal it is tiled along time until
    it is long enough; a random crop with the clean signal's length is then
    taken from it.

    Args:
        clean: 2D array-like with the clean signal to mix.
        noise: 2D array-like with the noise signal. Must not be empty.
        snr: Signal to noise ratio in dB.
        eps: Small constant guarding against division by zero / sqrt(0).

    Returns:
        clean: Tensor of shape ``(1, T)`` with the down-mixed clean signal.
        noise: Tensor of shape ``(1, T)`` with the cropped noise, rescaled so
            that the clean-to-noise power ratio matches ``snr``.
        mixture: Tensor of shape ``(1, T)``, the sum of ``clean`` and ``noise``.

    Raises:
        ValueError: If ``noise`` has no samples.
        AssertionError: If the resulting mixture contains non-finite values.
    """
    clean = torch.as_tensor(clean).mean(0, keepdim=True)
    noise = torch.as_tensor(noise).mean(0, keepdim=True)
    n_clean = clean.shape[1]
    if noise.shape[1] == 0:
        # Previously surfaced as an opaque ZeroDivisionError in the tiling step.
        raise ValueError("noise signal is empty")
    if noise.shape[1] < n_clean:
        # Tile the noise so that it covers the whole clean signal.
        noise = noise.repeat((1, int(math.ceil(n_clean / noise.shape[1]))))
    max_start = int(noise.shape[1] - n_clean)
    # The upper bound of torch.randint is exclusive. Using `max_start + 1`
    # keeps the call valid when noise and clean have equal length
    # (max_start == 0) and makes the last valid offset reachable.
    start = torch.randint(0, max_start + 1, ()).item()
    noise = noise[:, start : start + n_clean]
    E_speech = torch.mean(clean.pow(2)) + eps
    E_noise = torch.mean(noise.pow(2))
    # Dividing the noise by K yields E_speech / E_noise_scaled == 10^(snr/10).
    K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
    noise = noise / K
    mixture = clean + noise
    assert torch.isfinite(mixture).all()
    return clean, noise, mixture


def as_gradio_audio(x):
    """Convert an audio tensor to a ``(sample_rate, int16 ndarray)`` tuple.

    Assumes ``x`` is floating point audio normalised to ``[-1, 1]`` -- TODO
    confirm with callers. Values outside that range are clamped so that the
    int16 cast cannot wrap around.

    Args:
        x: Audio tensor.

    Returns:
        Tuple of the configured sample rate and the samples as an int16
        numpy array.
    """
    sr = config("sr", 48000, int, section="df")
    # Scale *up* to the int16 range. The previous code divided by 0x7FFF,
    # which truncates every normalised sample to zero in the cast.
    scaled = torch.clamp(x, -1.0, 1.0) * 0x7FFF
    return sr, scaled.to(torch.int16).cpu().numpy()


def mix_and_denoise(speech, speech_alt, noise, snr):
    """Mix speech with noise at the given SNR and enhance the mixture.

    Args:
        speech: Path of the recorded speech sample, or None.
        speech_alt: Path of an uploaded speech sample; only used when
            ``speech`` is None.
        noise: Path of the noise sample, or None to use the bundled default.
        snr: Signal to noise ratio in dB used for mixing.

    Returns:
        Paths of the written clean, noisy and enhanced wav files.
    """
    if noise is None:
        noise = "samples/dkitchen.wav"
    if speech is None:
        # Prefer the uploaded sample and fall back to the bundled one.
        # Previously `speech = speech_alt` ran unconditionally, overwriting
        # the default with None when nothing was uploaded.
        if speech_alt is None:
            speech = "samples/p232_013_clean.wav"
        else:
            speech = speech_alt
    print(speech, noise, snr)
    sr = config("sr", 48000, int, section="df")
    # Only the first element returned by load_audio is used here.
    speech, _ = load_audio(speech, sr)
    noise, _ = load_audio(noise, sr)
    speech, noise, noisy = mix_at_snr(speech, noise, snr)
    enhanced = enhance(model, df, noisy)
    # NOTE(review): fixed output file names are shared between requests;
    # concurrent users may overwrite each other's results.
    save_audio("clean.wav", speech, sr)
    save_audio("noisy.wav", noisy, sr)
    save_audio("enhanced.wav", enhanced, sr)
    return "clean.wav", "noisy.wav", "enhanced.wav"
# Input components, in the positional order expected by `mix_and_denoise`:
# recorded speech, uploaded speech, uploaded noise, SNR.
inputs = [
    gradio.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Record your own voice"
    ),
    gradio.inputs.Audio(
        source="upload", type="filepath", optional=True, label="Alternative: Upload speech sample"
    ),
    gradio.inputs.Audio(source="upload", type="filepath", optional=True, label="Upload noise sample"),
    gradio.inputs.Slider(minimum=-20, maximum=40, step=5, default=10),
]
# One value per input component. The microphone slot is left empty (None) so
# that the speech file goes to the upload slot, the noise file to the noise
# slot and the number to the SNR slider. The previous three-element rows
# shifted the noise file and the SNR value into the wrong components.
# NOTE(review): assumes the installed gradio version accepts None in example
# rows -- confirm.
examples = [
    [None, "samples/p232_013_clean.wav", "samples/dkitchen.wav", 10],
    [None, "samples/p232_019_clean.wav", "samples/dliving.wav", 10],
]
# Output components, matching the three file paths returned by the handler.
outputs = [
    gradio.outputs.Audio(label="Clean"),
    gradio.outputs.Audio(label="Noisy"),
    gradio.outputs.Audio(label="Enhanced"),
]
description = (
    "This demo denoises audio files using DeepFilterNet. Try it with your own voice!"
)
iface = gradio.Interface(
    fn=mix_and_denoise,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    description=description,
)
# Start the web server for the demo.
iface.launch()