Hendrik Schroeter committed on
Commit
c8dbfb3
1 Parent(s): 4cf88e6

back to filepath input; huggingspace has no ffmpeg :(

Browse files
Files changed (1) hide show
  1. app.py +43 -12
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import math
2
  import tempfile
3
- from typing import Tuple
4
 
5
  import gradio
6
  import gradio.inputs
@@ -9,12 +9,14 @@ import markdown
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import torch
 
12
  from loguru import logger
 
 
13
 
14
  from df import config
15
  from df.enhance import enhance, init_df, load_audio, save_audio
16
  from df.utils import resample
17
- from torchaudio.backend.common import AudioMetaData
18
 
19
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  model, df, _ = init_df()
@@ -57,7 +59,40 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
57
  return clean, noise, mixture
58
 
59
 
60
- def mix_and_denoise(speech_rec: Tuple[int, np.ndarray], speech_upl: str, noise_fn: str, snr: int):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  sr = config("sr", 48000, int, section="df")
62
  logger.info(
63
  f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {noise_fn}, snr: {snr}"
@@ -71,14 +106,10 @@ def mix_and_denoise(speech_rec: Tuple[int, np.ndarray], speech_upl: str, noise_f
71
  speech_file = speech_upl
72
  speech, meta = load_audio(speech_file, sr)
73
  else:
74
- meta.sample_rate, speech_rec_a = speech_rec
75
- # Gradio documentation says, the shape is [samples, 2], but apparently sometimes its not.
76
- speech_rec_a = speech_rec_a.reshape(speech_rec_a.shape[0], -1).T
77
- if speech_rec_a.dtype == np.int16:
78
- speech_rec_a = (speech_rec_a / (1 << 15)).astype(np.float32)
79
- elif speech_rec_a.dtype == np.int32:
80
- speech_rec_a = (speech_rec_a / (1 << 31)).astype(np.float32)
81
- speech = resample(torch.from_numpy(speech_rec_a), meta.sample_rate, sr)
82
  logger.info(f"Loaded speech with shape {speech.shape}")
83
  noise, _ = load_audio(noise_fn, sr) # type: ignore
84
  if meta.sample_rate != sr:
@@ -211,7 +242,7 @@ def spec_figure(
211
  inputs = [
212
  gradio.inputs.Audio(
213
  source="microphone",
214
- type="numpy",
215
  optional=True,
216
  label="Record your own voice",
217
  ),
 
1
  import math
2
  import tempfile
3
+ from typing import Optional, Tuple, Union
4
 
5
  import gradio
6
  import gradio.inputs
 
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import torch
12
+ from icecream import ic
13
  from loguru import logger
14
+ from torch import Tensor
15
+ from torchaudio.backend.common import AudioMetaData
16
 
17
  from df import config
18
  from df.enhance import enhance, init_df, load_audio, save_audio
19
  from df.utils import resample
 
20
 
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
  model, df, _ = init_df()
 
59
  return clean, noise, mixture
60
 
61
 
62
def load_audio_gradio(
    audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
) -> Optional[Tuple[Tensor, AudioMetaData]]:
    """Load audio handed over by a Gradio audio input and bring it to ``sr``.

    Args:
        audio_or_file: ``None``, a path to an audio file, or a
            ``(sample_rate, samples)`` pair.
        sr: Target sampling rate, forwarded to ``load_audio`` / ``resample``.

    Returns:
        ``None`` when no audio was supplied (``None`` or the string "none" in
        any casing). Otherwise ``(audio, meta)``: for a file path both values
        come from ``load_audio``; for array input ``meta.sample_rate`` holds
        the input sampling rate and the remaining fields are placeholders.

    Raises:
        TypeError: If ``audio_or_file`` is neither ``None``, a ``str`` nor a
            tuple/list.
    """
    if audio_or_file is None:
        return None

    if isinstance(audio_or_file, str):
        if audio_or_file.lower() == "none":
            return None
        # First try the file as-is.
        try:
            audio, meta = load_audio(audio_or_file, sr)
        except RuntimeError:
            # Probably recorded in Chrome, which results in a webm/opus encoded
            # '.wav' file. Rename it to '.opus' and retry; a second failure
            # propagates to the caller.
            import os
            import shutil

            opus_path = os.path.splitext(audio_or_file)[0] + ".opus"
            audio_or_file = shutil.move(audio_or_file, opus_path)
            logger.info(f"Retrying audio load from renamed file: {audio_or_file}")
            audio, meta = load_audio(audio_or_file, sr)
        return audio, meta

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(audio_or_file, (tuple, list)):
        raise TypeError(
            "audio_or_file must be None, a file path or a (sample_rate, samples) "
            f"pair, got {type(audio_or_file).__name__}"
        )

    # Only the sample rate is known here; the other fields stay placeholders.
    meta = AudioMetaData(-1, -1, -1, -1, "")
    meta.sample_rate, audio_np = audio_or_file
    # Gradio documentation says the shape is [samples, 2], but apparently
    # sometimes it's not. Force a 2-D array, then transpose.
    audio_np = audio_np.reshape(audio_np.shape[0], -1).T
    # Scale integer PCM by its full-scale value; other dtypes pass through.
    if audio_np.dtype == np.int16:
        audio_np = (audio_np / (1 << 15)).astype(np.float32)
    elif audio_np.dtype == np.int32:
        audio_np = (audio_np / (1 << 31)).astype(np.float32)
    audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
    return audio, meta
91
+
92
+
93
+ def mix_and_denoise(
94
+ speech_rec: Union[str, Tuple[int, np.ndarray]], speech_upl: str, noise_fn: str, snr: int
95
+ ):
96
  sr = config("sr", 48000, int, section="df")
97
  logger.info(
98
  f"Got parameters speech_rec: {speech_rec}, speech_upl: {speech_upl}, noise: {noise_fn}, snr: {snr}"
 
106
  speech_file = speech_upl
107
  speech, meta = load_audio(speech_file, sr)
108
  else:
109
+ ic(speech_rec, sr)
110
+ tmp = load_audio_gradio(speech_rec, sr)
111
+ assert tmp is not None
112
+ speech, meta = tmp
 
 
 
 
113
  logger.info(f"Loaded speech with shape {speech.shape}")
114
  noise, _ = load_audio(noise_fn, sr) # type: ignore
115
  if meta.sample_rate != sr:
 
242
  inputs = [
243
  gradio.inputs.Audio(
244
  source="microphone",
245
+ type="filepath",
246
  optional=True,
247
  label="Record your own voice",
248
  ),