File size: 2,758 Bytes
0f79c5b
 
d446ca4
 
 
 
0f79c5b
 
4674b6c
d446ca4
0f79c5b
 
4674b6c
d446ca4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f79c5b
 
 
 
 
d446ca4
 
 
 
 
0f79c5b
d446ca4
 
a52c38e
0f79c5b
7d50d5d
a52c38e
 
d446ca4
 
0f79c5b
7d50d5d
0f79c5b
 
d446ca4
0f79c5b
 
 
 
 
d446ca4
 
 
 
 
 
 
 
 
 
 
 
0f79c5b
baa5010
d446ca4
 
 
 
 
 
a52c38e
 
 
d446ca4
a52c38e
 
 
 
 
d446ca4
4674b6c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import math

import gradio
import gradio.inputs
import gradio.outputs
import torch
from df import config
from df.enhance import enhance, init_df, load_audio, save_audio

# Use the GPU when available; inference also works on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# init_df() loads the pretrained DeepFilterNet model and its DF state;
# the third return value is unused here.
model, df, _ = init_df()
# Inference only: move to the selected device and switch to eval mode.
model = model.to(device=device).eval()


def mix_at_snr(clean, noise, snr, eps=1e-10):
    """Combine a clean and a noise signal at a target SNR.

    Both inputs are interpreted as (channels, samples) and averaged down to
    mono. The noise is tiled until it covers the clean signal's length and
    then rescaled so that the resulting mixture has the requested SNR.

    Args:
        clean: Tensor (or array-like) with the clean signal, shape (C, T).
        noise: Tensor (or array-like) with the noise signal, shape (C', T').
        snr: Target signal-to-noise ratio in dB.
        eps: Small constant guarding against division by / sqrt of zero.

    Returns:
        Tuple ``(clean, noise, mixture)`` of mono (1, T) tensors, where the
        noise has been rescaled and ``mixture = clean + noise``.
    """
    clean = torch.as_tensor(clean).mean(0, keepdim=True)
    noise = torch.as_tensor(noise).mean(0, keepdim=True)
    n_samples = clean.shape[1]
    # Tile the noise to at least the clean length, then crop to match.
    if noise.shape[1] < n_samples:
        reps = int(math.ceil(n_samples / noise.shape[1]))
        noise = noise.repeat((1, reps))
    noise = noise[:, :n_samples]
    # Gain that brings the noise energy to E_clean * 10^(-snr/10).
    energy_clean = clean.pow(2).mean() + eps
    energy_noise = noise.pow(2).mean()
    gain = ((energy_noise / energy_clean) * 10 ** (snr / 10) + eps).sqrt()
    noise = noise / gain
    mixture = clean + noise
    assert torch.isfinite(mixture).all()
    return clean, noise, mixture


def as_gradio_audio(x):
    """Convert a waveform tensor to the ``(sample_rate, int16 ndarray)`` pair gradio expects.

    NOTE(review): ``x / 0x7FFF`` scales *down* before the int16 cast; if ``x``
    is a float waveform in [-1, 1] this truncates everything to zero and the
    intended operation is likely ``x * 0x7FFF`` — confirm the expected input
    scale. This helper is not called anywhere in this file.
    """
    sr = config("sr", 48000, int, section="df")
    return sr, (x / 0x7FFF).to(torch.int16).cpu().numpy()

def mix_and_denoise(speech, noise, snr):
    """Mix speech and noise at ``snr`` dB, denoise the mixture, and save all three.

    Args:
        speech: Path to a speech audio file (gradio passes a filepath).
        noise: Path to a noise audio file.
        snr: Target signal-to-noise ratio in dB.

    Returns:
        Tuple of paths ``("clean.wav", "noisy.wav", "enhanced.wav")`` written
        to the working directory for gradio to serve.
    """
    sr = config("sr", 48000, int, section="df")
    # load_audio resamples to the model sample rate when necessary.
    speech, _ = load_audio(speech, sr)
    noise, _ = load_audio(noise, sr)
    speech, noise, noisy = mix_at_snr(speech, noise, snr)
    enhanced = enhance(model, df, noisy)
    # NOTE: fixed filenames are overwritten on every request; concurrent
    # requests would race on these files.
    save_audio("clean.wav", speech, sr)
    save_audio("noisy.wav", noisy, sr)
    save_audio("enhanced.wav", enhanced, sr)
    return "clean.wav", "noisy.wav", "enhanced.wav"


# UI wiring: two microphone inputs (speech + noise, both optional) and an
# SNR slider in dB.
# NOTE(review): gradio.inputs / gradio.outputs and the `optional=`/`default=`
# keywords are the legacy gradio 2.x API, removed in gradio 3.x — confirm the
# pinned gradio version before upgrading.
inputs = [
    gradio.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Speech"
    ),
    gradio.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Noise"
    ),
    gradio.inputs.Slider(minimum=-10, maximum=40, step=5, default=10),
]
# Pre-recorded sample pairs shown below the interface (paths relative to CWD).
examples = [
    ["samples/p232_013_clean.wav", "samples/noise_freesound_2530.wav", 10],
    ["samples/p232_019_clean.wav", "samples/dliving.wav", 10],
]
outputs = [
    gradio.outputs.Audio(label="Clean"),
    gradio.outputs.Audio(label="Noisy"),
    gradio.outputs.Audio(label="Enhanced"),
]
description = (
    "This demo denoises audio files using DeepFilterNet. Try it with your own voice!"
)
iface = gradio.Interface(
    fn=mix_and_denoise,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    description=description,
)
# Blocks until the server is shut down.
iface.launch()