Hendrik Schroeter commited on
Commit
4cf88e6
1 Parent(s): 454fcfd

Finally workaround chrome bug where audio is opus encoded but gets .wav extension

Browse files
Files changed (1) hide show
  1. app.py +23 -27
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import math
2
  import tempfile
 
3
 
4
  import gradio
5
  import gradio.inputs
@@ -13,6 +14,7 @@ from loguru import logger
13
  from df import config
14
  from df.enhance import enhance, init_df, load_audio, save_audio
15
  from df.utils import resample
 
16
 
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
  model, df, _ = init_df()
@@ -39,52 +41,46 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
39
  noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
40
  max_start = int(noise.shape[1] - clean.shape[1])
41
  start = torch.randint(0, max_start, ()).item()
42
- print("start:", start, clean.shape)
43
  noise = noise[:, start : start + clean.shape[1]]
44
  E_speech = torch.mean(clean.pow(2)) + eps
45
  E_noise = torch.mean(noise.pow(2))
46
  K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
47
  noise = noise / K
48
  mixture = clean + noise
49
- print("mixture:", mixture.shape)
50
  assert torch.isfinite(mixture).all()
51
  max_m = mixture.abs().max()
52
  if max_m > 1:
53
- print(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
54
  clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
55
  return clean, noise, mixture
56
 
57
 
58
- def mix_and_denoise(speech_rec, speech_upl, noise, snr):
59
  sr = config("sr", 48000, int, section="df")
60
  logger.info(
61
- f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {noise}, snr: {snr}"
62
  )
63
- if noise is None:
64
- noise = "samples/dkitchen.wav"
65
- sp_kwargs = {}
66
- if speech_rec is None or "none" in speech_rec:
67
  speech_file = "samples/p232_013_clean.wav"
68
  if speech_upl is not None and "none" not in speech_upl:
69
  speech_file = speech_upl
 
70
  else:
71
- speech_file = speech_rec
72
- sp_kwargs = {"frame_offset": 4800}
73
- meta = None
74
- # Apparently chrome uses mp3 or opus?
75
- for f in ("wav", "mp3", "flac", "vorbis", "opus"):
76
- sp_kwargs["format"] = f
77
- logger.info(f"Trying to load speech: {speech_file}, with codec {f}")
78
- try:
79
- speech, meta = load_audio(speech_file, sr, **sp_kwargs)
80
- except RuntimeError:
81
- if meta is not None:
82
- print(meta)
83
- break
84
- if meta is None:
85
- raise ValueError("Could not load recorded speech")
86
  logger.info(f"Loaded speech with shape {speech.shape}")
87
- noise, _ = load_audio(noise, sr)
88
  if meta.sample_rate != sr:
89
  # Low pass filter by resampling
90
  noise = resample(resample(noise, sr, meta.sample_rate), meta.sample_rate, sr)
@@ -215,7 +211,7 @@ def spec_figure(
215
  inputs = [
216
  gradio.inputs.Audio(
217
  source="microphone",
218
- type="filepath",
219
  optional=True,
220
  label="Record your own voice",
221
  ),
@@ -262,4 +258,4 @@ iface = gradio.Interface(
262
  allow_flagging="never",
263
  article=markdown.markdown(open("usage.md").read()),
264
  )
265
- iface.launch(cache_examples=False)
 
1
  import math
2
  import tempfile
3
+ from typing import Tuple
4
 
5
  import gradio
6
  import gradio.inputs
 
14
  from df import config
15
  from df.enhance import enhance, init_df, load_audio, save_audio
16
  from df.utils import resample
17
+ from torchaudio.backend.common import AudioMetaData
18
 
19
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  model, df, _ = init_df()
 
41
  noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
42
  max_start = int(noise.shape[1] - clean.shape[1])
43
  start = torch.randint(0, max_start, ()).item()
44
+ logger.debug(f"start: {start}, {clean.shape}")
45
  noise = noise[:, start : start + clean.shape[1]]
46
  E_speech = torch.mean(clean.pow(2)) + eps
47
  E_noise = torch.mean(noise.pow(2))
48
  K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
49
  noise = noise / K
50
  mixture = clean + noise
51
+ logger.debug("mixture: {mixture.shape}")
52
  assert torch.isfinite(mixture).all()
53
  max_m = mixture.abs().max()
54
  if max_m > 1:
55
+ logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
56
  clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
57
  return clean, noise, mixture
58
 
59
 
60
+ def mix_and_denoise(speech_rec: Tuple[int, np.ndarray], speech_upl: str, noise_fn: str, snr: int):
61
  sr = config("sr", 48000, int, section="df")
62
  logger.info(
63
+ f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {noise_fn}, snr: {snr}"
64
  )
65
+ if noise_fn is None:
66
+ noise_fn = "samples/dkitchen.wav"
67
+ meta = AudioMetaData(-1, -1, -1, -1, "")
68
+ if speech_upl is not None and "none" not in speech_upl:
69
  speech_file = "samples/p232_013_clean.wav"
70
  if speech_upl is not None and "none" not in speech_upl:
71
  speech_file = speech_upl
72
+ speech, meta = load_audio(speech_file, sr)
73
  else:
74
+ meta.sample_rate, speech_rec_a = speech_rec
75
+ # Gradio documentation says the shape is [samples, 2], but apparently sometimes it's not.
76
+ speech_rec_a = speech_rec_a.reshape(speech_rec_a.shape[0], -1).T
77
+ if speech_rec_a.dtype == np.int16:
78
+ speech_rec_a = (speech_rec_a / (1 << 15)).astype(np.float32)
79
+ elif speech_rec_a.dtype == np.int32:
80
+ speech_rec_a = (speech_rec_a / (1 << 31)).astype(np.float32)
81
+ speech = resample(torch.from_numpy(speech_rec_a), meta.sample_rate, sr)
 
 
 
 
 
 
 
82
  logger.info(f"Loaded speech with shape {speech.shape}")
83
+ noise, _ = load_audio(noise_fn, sr) # type: ignore
84
  if meta.sample_rate != sr:
85
  # Low pass filter by resampling
86
  noise = resample(resample(noise, sr, meta.sample_rate), meta.sample_rate, sr)
 
211
  inputs = [
212
  gradio.inputs.Audio(
213
  source="microphone",
214
+ type="numpy",
215
  optional=True,
216
  label="Record your own voice",
217
  ),
 
258
  allow_flagging="never",
259
  article=markdown.markdown(open("usage.md").read()),
260
  )
261
+ iface.launch(cache_examples=False, debug=True)