Hendrik Schroeter committed
Finally work around Chrome bug where audio is Opus-encoded but gets a .wav extension
app.py
CHANGED
@@ -1,5 +1,6 @@
 import math
 import tempfile
+from typing import Tuple

 import gradio
 import gradio.inputs
@@ -13,6 +14,7 @@ from loguru import logger
 from df import config
 from df.enhance import enhance, init_df, load_audio, save_audio
 from df.utils import resample
+from torchaudio.backend.common import AudioMetaData

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model, df, _ = init_df()
@@ -39,52 +41,46 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
     noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
     max_start = int(noise.shape[1] - clean.shape[1])
     start = torch.randint(0, max_start, ()).item()
-
+    logger.debug(f"start: {start}, {clean.shape}")
     noise = noise[:, start : start + clean.shape[1]]
     E_speech = torch.mean(clean.pow(2)) + eps
     E_noise = torch.mean(noise.pow(2))
     K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
     noise = noise / K
     mixture = clean + noise
-
+    logger.debug("mixture: {mixture.shape}")
     assert torch.isfinite(mixture).all()
     max_m = mixture.abs().max()
     if max_m > 1:
-
+        logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
         clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
     return clean, noise, mixture


-def mix_and_denoise(speech_rec, speech_upl,
+def mix_and_denoise(speech_rec: Tuple[int, np.ndarray], speech_upl: str, noise_fn: str, snr: int):
     sr = config("sr", 48000, int, section="df")
     logger.info(
-        f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {
+        f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {noise_fn}, snr: {snr}"
     )
-    if
-
-
-    if
+    if noise_fn is None:
+        noise_fn = "samples/dkitchen.wav"
+    meta = AudioMetaData(-1, -1, -1, -1, "")
+    if speech_upl is not None and "none" not in speech_upl:
         speech_file = "samples/p232_013_clean.wav"
         if speech_upl is not None and "none" not in speech_upl:
             speech_file = speech_upl
+        speech, meta = load_audio(speech_file, sr)
     else:
-
-
-
-
-
-
-
-
-            speech, meta = load_audio(speech_file, sr, **sp_kwargs)
-        except RuntimeError:
-            if meta is not None:
-                print(meta)
-            break
-    if meta is None:
-        raise ValueError("Could not load recorded speech")
+        meta.sample_rate, speech_rec_a = speech_rec
+        # Gradio documentation says, the shape is [samples, 2], but apparently sometimes its not.
+        speech_rec_a = speech_rec_a.reshape(speech_rec_a.shape[0], -1).T
+        if speech_rec_a.dtype == np.int16:
+            speech_rec_a = (speech_rec_a / (1 << 15)).astype(np.float32)
+        elif speech_rec_a.dtype == np.int32:
+            speech_rec_a = (speech_rec_a / (1 << 31)).astype(np.float32)
+        speech = resample(torch.from_numpy(speech_rec_a), meta.sample_rate, sr)
     logger.info(f"Loaded speech with shape {speech.shape}")
-    noise, _ = load_audio(
+    noise, _ = load_audio(noise_fn, sr)  # type: ignore
     if meta.sample_rate != sr:
         # Low pass filter by resampling
         noise = resample(resample(noise, sr, meta.sample_rate), meta.sample_rate, sr)
@@ -215,7 +211,7 @@ def spec_figure(
 inputs = [
     gradio.inputs.Audio(
         source="microphone",
-        type="
+        type="numpy",
         optional=True,
         label="Record your own voice",
     ),
@@ -262,4 +258,4 @@ iface = gradio.Interface(
     allow_flagging="never",
     article=markdown.markdown(open("usage.md").read()),
 )
-iface.launch(cache_examples=False)
+iface.launch(cache_examples=False, debug=True)