Spaces: rc19477/Speech_Enhancement_Mamba

Running on Zero

App Files Community

roychao19477 committed on May 30

Commit

ea5c419

1 Parent(s): 18c8531

Upload

Browse files

Files changed (1) hide show

app.py +25 -27

app.py CHANGED Viewed

@@ -56,47 +56,45 @@ model.eval()
 @spaces.GPU
-def enhance(audio, do_pcs):
     orig_sr, wav_np = audio
-    # 1) resample to 16 kHz if needed
     if orig_sr != sr:
         wav_np = librosa.resample(wav_np, orig_sr, sr)
     wav = torch.from_numpy(wav_np).float().to(device)
-    # normalize
-    norm = torch.sqrt(len(wav) / torch.sum(wav**2))
-    wav  = (wav * norm).unsqueeze(0)
-    # STFT → model → ISTFT
-    amp, pha, _ = mag_phase_stft(wav, n_fft, hop_size, win_size, compress_ff)
     amp_g, pha_g = model(amp, pha)
-    out = mag_phase_istft(amp_g, pha_g, n_fft, hop_size, win_size, compress_ff)
     out = (out / norm).squeeze().cpu().numpy()
-    # optional PCS filter
-    if do_pcs:
-        out = cal_pcs(out)
-    # 2) resample back to original rate
     if orig_sr != sr:
         out = librosa.resample(out, sr, orig_sr)
-    return orig_sr, out
-with gr.Blocks() as demo:
-    gr.Markdown("## SEMamba Speech Enhancement demo")
-    with gr.Row():
-        upload = gr.Audio(label="Upload WAV", type="numpy")
-        record = gr.Audio(label="Record via mic", type="numpy")
-    pcs    = gr.Checkbox(label="Apply PCS post-processing", value=False)
-    btn    = gr.Button("Enhance")
-    out    = gr.Audio(label="Enhanced WAV", type="numpy")
-    @spaces.GPU
-    def runner(up, rec, do_pcs):
-        return enhance(up if up else rec, do_pcs)
-    btn.click(runner, [upload, record, pcs], out)
 demo.launch()

 @spaces.GPU
+def enhance_and_plot(audio):
+    if audio is None: return None, None
     orig_sr, wav_np = audio
     if orig_sr != sr:
         wav_np = librosa.resample(wav_np, orig_sr, sr)
     wav = torch.from_numpy(wav_np).float().to(device)
+    norm = torch.sqrt(len(wav)/torch.sum(wav**2))
+    wav = (wav * norm).unsqueeze(0)
+    amp, pha, _ = mag_phase_stft(wav, **stft_cfg, compress_factor=model_cfg["compress_factor"])
     amp_g, pha_g = model(amp, pha)
+    out = mag_phase_istft(amp_g, pha_g, **stft_cfg, compress_factor=model_cfg["compress_factor"])
     out = (out / norm).squeeze().cpu().numpy()
     if orig_sr != sr:
         out = librosa.resample(out, sr, orig_sr)
+    D = librosa.stft(out, n_fft=1024, hop_length=512)
+    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
+    fig, ax = plt.subplots()
+    img = librosa.display.specshow(S_db, sr=orig_sr, hop_length=512, x_axis='time', y_axis='hz', ax=ax)
+    plt.colorbar(img, ax=ax, format="%+2.0f dB")
+    ax.set_title("Enhanced Output Spectrum")
+    return (orig_sr, out), fig
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎧 SEMamba Speech Enhancement")
+    gr.Markdown("Upload or record a noisy audio sample to enhance it and view the spectrogram.")
+    with gr.Row():
+        with gr.Column():
+            input_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Noisy Input")
+            btn = gr.Button("Enhance")
+        with gr.Column():
+            output_audio = gr.Audio(label="Enhanced Output", type="numpy")
+            spectrum     = gr.Plot(label="Spectrogram")
+    btn.click(enhance_and_plot, inputs=input_audio, outputs=[output_audio, spectrum])
 demo.launch()