roychao19477 committed on
Commit
1fdb610
·
1 Parent(s): 2bbe7e3
Files changed (1) hide show
  1. app.py +34 -23
app.py CHANGED
@@ -20,6 +20,7 @@ import gradio as gr
20
  import torch
21
  import yaml
22
  import librosa
 
23
  from huggingface_hub import hf_hub_download
24
  from models.stfts import mag_phase_stft, mag_phase_istft
25
  from models.generator import SEMamba
@@ -56,7 +57,6 @@ model.eval()
56
 
57
 
58
  @spaces.GPU
59
- # --- Inference ---
60
  def enhance(audio):
61
  if audio is None: return None, None
62
  orig_sr, wav_np = audio
@@ -73,31 +73,42 @@ def enhance(audio):
73
  if orig_sr != sr:
74
  out = librosa.resample(out, sr, orig_sr)
75
 
76
- # draw spectrum
77
  D = librosa.stft(out, n_fft=1024, hop_length=512)
78
  S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
79
- fig, ax = plt.subplots()
80
  librosa.display.specshow(S_db, sr=orig_sr, hop_length=512, x_axis='time', y_axis='hz', ax=ax)
81
  ax.set_title("Enhanced Spectrogram")
82
- plt.colorbar(format="%+2.0f dB")
83
  return (orig_sr, out), fig
84
 
85
- # --- Interface ---
86
- se_demo = gr.Interface(
87
- fn=enhance,
88
- inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="Input Audio"),
89
- outputs=[
90
- gr.Audio(label="Enhanced Audio", type="numpy"),
91
- gr.Plot(label="Spectrogram")
92
- ],
93
- title="<a href='https://github.com/RoyChao19477/SEMamba' target='_blank'>SEMamba</a>: Speech Enhancement",
94
- description="SEMamba is a state-space model for real-world noisy speech enhancement. Upload or record a noisy sample to hear the result and view the spectrogram.",
95
- article="<p style='text-align: center'><a href='https://arxiv.org/abs/2405.15144' target='_blank'>SEMamba: Mamba for Long-Context Speech Enhancement (SLT 2024)</a></p>",
96
- examples=[
97
- ["examples/noisy_sample_16k.wav"]
98
- ],
99
- cache_examples=True
100
- )
101
-
102
- # --- Launch ---
103
- se_demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
20
  import torch
21
  import yaml
22
  import librosa
23
+ import librosa.display
24
  from huggingface_hub import hf_hub_download
25
  from models.stfts import mag_phase_stft, mag_phase_istft
26
  from models.generator import SEMamba
 
57
 
58
 
59
  @spaces.GPU
 
60
  def enhance(audio):
61
  if audio is None: return None, None
62
  orig_sr, wav_np = audio
 
73
  if orig_sr != sr:
74
  out = librosa.resample(out, sr, orig_sr)
75
 
76
+ # spectrogram
77
  D = librosa.stft(out, n_fft=1024, hop_length=512)
78
  S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
79
+ fig, ax = plt.subplots(figsize=(6, 3))
80
  librosa.display.specshow(S_db, sr=orig_sr, hop_length=512, x_axis='time', y_axis='hz', ax=ax)
81
  ax.set_title("Enhanced Spectrogram")
82
+ plt.colorbar(format="%+2.0f dB", ax=ax)
83
  return (orig_sr, out), fig
84
 
85
+ # --- Layout with Blocks ---
86
+ with gr.Blocks(css=".gr-box {border: none !important}") as demo:
87
+ gr.Markdown("<h1 style='text-align: center;'>🎧 <a href='https://github.com/RoyChao19477/SEMamba' target='_blank'>SEMamba</a>: Speech Enhancement</h1>")
88
+ gr.Markdown("Enhance real-world noisy speech using Mamba. Upload or record an audio clip and view the spectrogram.")
89
+
90
+ with gr.Row():
91
+ with gr.Column():
92
+ audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or Record", elem_id="input-audio")
93
+ run_btn = gr.Button("Enhance Now 🚀", variant="primary")
94
+
95
+ with gr.Column():
96
+ enhanced_audio = gr.Audio(label="Enhanced Output", type="numpy")
97
+ spec_plot = gr.Plot(label="Spectrogram")
98
+
99
+ run_btn.click(enhance, inputs=audio_input, outputs=[enhanced_audio, spec_plot])
100
+
101
+ gr.Examples(
102
+ examples=[
103
+ ["examples/noisy_sample_16k.wav"],
104
+ ],
105
+ inputs=audio_input,
106
+ outputs=[enhanced_audio, spec_plot],
107
+ fn=enhance,
108
+ cache_examples=True,
109
+ label="📂 Try These Examples"
110
+ )
111
+
112
+ gr.Markdown("<p style='text-align: center'><a href='https://arxiv.org/abs/2405.15144' target='_blank'>📄 SEMamba: Mamba for Long-Context Speech Enhancement (SLT 2024)</a></p>")
113
+
114
+ demo.launch()