File size: 2,523 Bytes
0f79c5b
 
d446ca4
 
 
 
0f79c5b
 
4674b6c
d446ca4
0f79c5b
 
4674b6c
d446ca4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f79c5b
 
 
 
 
d446ca4
 
 
 
 
0f79c5b
d446ca4
 
0f79c5b
 
 
d446ca4
 
0f79c5b
 
 
 
d446ca4
0f79c5b
 
 
 
 
d446ca4
 
 
 
 
 
 
 
 
 
 
 
0f79c5b
d446ca4
 
 
 
 
 
 
 
 
4674b6c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import math

import gradio
import gradio.inputs
import gradio.outputs
import torch
from df import config
from df.enhance import enhance, init_df, load_audio, save_audio

# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# init_df() loads the pretrained DeepFilterNet model and its DF state;
# the third return value is unused here.
model, df, _ = init_df()
# Move the model to the chosen device and switch to eval mode for inference.
model = model.to(device=device).eval()


def mix_at_snr(clean, noise, snr, eps=1e-10):
    """Combine a speech and a noise signal at a target signal-to-noise ratio.

    Both inputs are first averaged over their channel dimension (dim 0), so
    multi-channel audio is reduced to mono. The noise is tiled and truncated
    to the speech length, then rescaled so that the energy ratio of speech
    to noise matches ``snr`` (in dB).

    Args:
        clean: Tensor of shape (channels, samples) with the clean signal.
        noise: Tensor of shape (channels, samples) with the noise signal.
        snr: Target signal-to-noise ratio in dB.
        eps: Small constant guarding against division by zero.

    Returns:
        clean: (1, samples) mono clean signal (unchanged apart from the
            channel average).
        noise: (1, samples) mono noise rescaled to the target SNR.
        mix: (1, samples) sum of the two.
    """
    clean = torch.as_tensor(clean).mean(0, keepdim=True)
    noise = torch.as_tensor(noise).mean(0, keepdim=True)
    n_samples = clean.shape[1]
    # Tile the noise until it covers the speech, then cut it to length.
    if noise.shape[1] < n_samples:
        n_repeats = int(math.ceil(n_samples / noise.shape[1]))
        noise = noise.repeat((1, n_repeats))
    noise = noise[:, :n_samples]
    # Gain K chosen so that E_speech / (E_noise / K^2) == 10^(snr / 10).
    speech_energy = torch.mean(clean.pow(2)) + eps
    noise_energy = torch.mean(noise.pow(2))
    gain = torch.sqrt((noise_energy / speech_energy) * 10 ** (snr / 10) + eps)
    noise = noise / gain
    mixture = clean + noise
    assert torch.isfinite(mixture).all()
    return clean, noise, mixture

def as_gradio_audio(x):
    """Convert a waveform tensor to gradio's ``(sample_rate, int16 array)`` format.

    Args:
        x: Tensor of audio samples, assumed float-valued in [-1, 1] as
           produced by ``load_audio``/``enhance`` -- TODO confirm.

    Returns:
        Tuple of (sample rate, int16 numpy array) as expected by gradio
        audio components when given raw arrays instead of file paths.
    """
    sr = config.get("sr", "df", int)
    # Bug fix: the previous code divided by 0x7fff, which maps [-1, 1]
    # floats to ~0 and truncates everything to zero after the int16 cast.
    # Scaling *up* by the int16 maximum yields the intended PCM range.
    return sr, (x * 0x7fff).to(torch.int16).cpu().numpy()

def mix_and_denoise(speech, noise, snr):
    """Mix speech and noise at ``snr`` dB, denoise the mixture, and save all signals.

    Args:
        speech: Path to the speech recording.
        noise: Path to the noise recording.
        snr: Target signal-to-noise ratio in dB.

    Returns:
        Tuple of wav-file paths: (clean, noisy, enhanced), written to the
        current working directory.
    """
    print(speech, noise, snr)
    sr = config.get("sr", "df", int)
    clean_sig, _ = load_audio(speech, sr)
    noise_sig, _ = load_audio(noise, sr)
    clean_sig, noise_sig, noisy_sig = mix_at_snr(clean_sig, noise_sig, snr)
    denoised = enhance(model, df, noisy_sig)
    out_paths = ("clean.wav", "noisy.wav", "enhanced.wav")
    # Persist all three signals so gradio can serve them as file paths.
    for path, sig in zip(out_paths, (clean_sig, noisy_sig, denoised)):
        save_audio(path, sig, sr)
    return out_paths


# Input widgets: two microphone recordings (speech and noise) handed to the
# handler as file paths, plus an SNR slider in dB matching mix_and_denoise.
inputs = [
    gradio.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Speech"
    ),
    gradio.inputs.Audio(
        source="microphone", type="filepath", optional=True, label="Noise"
    ),
    gradio.inputs.Slider(minimum=-10, maximum=40, step=5, default=10),
]
# Pre-canned example shown in the UI; paths are relative to the app's cwd.
examples = [
    ["samples/p232_013_clean.wav", "samples/noise_freesound_2530.wav", 10],
]
# One audio player per wav path returned by mix_and_denoise.
outputs = [
    gradio.outputs.Audio(label="Clean"),
    gradio.outputs.Audio(label="Noisy"),
    gradio.outputs.Audio(label="Enhanced"),
]
# NOTE(review): gradio.inputs / gradio.outputs is the legacy (pre-3.x)
# gradio API -- keep the installed gradio version pinned accordingly.
iface = gradio.Interface(
    fn=mix_and_denoise, inputs=inputs, outputs=outputs, examples=examples
)
iface.launch()