roychao19477 committed on
Commit
ff0b6ec
·
1 Parent(s): e231b3a
Files changed (1) hide show
  1. app.py +42 -37
app.py CHANGED
@@ -59,45 +59,50 @@ model.eval()
59
 
60
  @spaces.GPU
61
  def enhance(filepath):
62
- # load & resample to model SR
63
  wav_np, orig_sr = librosa.load(filepath, sr=None)
64
- if orig_sr != SR:
65
- wav_np = librosa.resample(wav_np, orig_sr, SR)
66
- # to tensor + normalize
67
  wav = torch.from_numpy(wav_np).float().to(device)
68
- norm = torch.sqrt(len(wav) / torch.sum(wav**2))
69
- wav = (wav * norm).unsqueeze(0)
70
- # STFT → model → ISTFT
71
- amp, pha, _ = mag_phase_stft(wav, **stft_cfg, compress_factor=model_cfg["compress_factor"])
72
- amp_g, pha_g = model(amp, pha)
73
- out = mag_phase_istft(amp_g, pha_g, **stft_cfg, compress_factor=model_cfg["compress_factor"])
74
- out = (out / norm).squeeze().cpu().numpy()
75
- # resample back
76
- if orig_sr != SR:
77
- out = librosa.resample(out, SR, orig_sr)
78
- # write to temp file
79
- out_path = "enhanced.wav"
80
- sf.write(out_path, out, orig_sr)
81
- # compute spectrogram plot
82
  D = librosa.stft(out, n_fft=1024, hop_length=512)
83
- S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
84
- fig, ax = plt.subplots(figsize=(6,3))
85
- librosa.display.specshow(S_db, sr=orig_sr, hop_length=512,
86
- x_axis="time", y_axis="hz", ax=ax)
87
- ax.set_title("Enhanced Spectrogram")
88
- plt.colorbar(format="%+2.0f dB", ax=ax)
89
- return out_path, fig
90
-
91
- demo = gr.Interface(
92
- fn=enhance,
93
- inputs=gr.Audio(sources=["upload","microphone"], type="filepath", label="Input Audio"),
94
- outputs=[
95
- gr.Audio(label="Enhanced Audio", type="filepath"),
96
- gr.Plot(label="Spectrogram")
97
- ],
98
- title="<a href='https://github.com/RoyChao19477/SEMamba' target='_blank'>SEMamba</a>: Speech Enhancement",
99
- description="Upload or record noisy speech; SEMamba enhances it and shows the spectrogram.",
100
- article="<p style='text-align:center'><a href='https://arxiv.org/abs/2405.15144' target='_blank'>SEMamba: Mamba for Long-Context Speech Enhancement (SLT 2024)</a></p>"
101
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  demo.launch()
 
59
 
60
  @spaces.GPU
61
  def enhance(filepath):
 
62
  wav_np, orig_sr = librosa.load(filepath, sr=None)
63
+ if orig_sr!=SR: wav_np=librosa.resample(wav_np,orig_sr,SR)
 
 
64
  wav = torch.from_numpy(wav_np).float().to(device)
65
+ norm= torch.sqrt(len(wav)/torch.sum(wav**2)); wav=(wav*norm).unsqueeze(0)
66
+ amp,pha,_ = mag_phase_stft(wav,**stft_cfg,compress_factor=model_cfg["compress_factor"])
67
+ amp_g,pha_g = model(amp,pha)
68
+ out = mag_phase_istft(amp_g,pha_g,**stft_cfg,compress_factor=model_cfg["compress_factor"])
69
+ out = (out/norm).squeeze().cpu().numpy()
70
+ if orig_sr!=SR: out=librosa.resample(out,SR,orig_sr)
71
+ sf.write("enhanced.wav", out, orig_sr)
72
+
 
 
 
 
 
 
73
  D = librosa.stft(out, n_fft=1024, hop_length=512)
74
+ S = librosa.amplitude_to_db(np.abs(D), ref=np.max)
75
+ fig,ax=plt.subplots(figsize=(6,3))
76
+ librosa.display.specshow(S,sr=orig_sr,hop_length=512,x_axis="time",y_axis="hz",ax=ax)
77
+ ax.set_title("Enhanced Spectrogram"); plt.colorbar(format="%+2.0f dB",ax=ax)
78
+ return "enhanced.wav", fig
79
+
80
+ # — Custom CSS —
81
+ CSS = """
82
+ #title {text-align:center; margin-bottom:0.2em;}
83
+ #subtitle {text-align:center; color:#555; margin-bottom:1.5em;}
84
+ .duplicate-button {display:block; margin:0 auto 1.5em;}
85
+ #audio-in {border:2px dashed #aaa; border-radius:8px; padding:1em;}
86
+ #run-btn {width:100%; margin-top:0.5em;}
87
+ #out-audio, #spec-plot {margin-top:1em;}
88
+ """
89
+
90
+ # — Blocks layout —
91
+ with gr.Blocks(css=CSS, theme="soft") as demo:
92
+ gr.HTML("<h1 id='title'>🎧 SEMamba: Speech Enhancement</h1>")
93
+ gr.HTML("<p id='subtitle'>Upload or record your noisy clip, then click Enhance to boost clarity and view its spectrogram.</p>")
94
+ gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
95
+
96
+ with gr.Row():
97
+ with gr.Column(scale=1):
98
+ audio_in = gr.Audio(sources=["upload","microphone"], type="filepath",
99
+ label="Your Noisy Audio", elem_id="audio-in")
100
+ run_btn = gr.Button("Enhance Now 🚀", variant="primary", elem_id="run-btn")
101
+
102
+ with gr.Column(scale=1):
103
+ audio_out = gr.Audio(type="filepath", label="Enhanced Audio", elem_id="out-audio")
104
+ spec_plot = gr.Plot(label="Spectrogram", elem_id="spec-plot")
105
+
106
+ run_btn.click(enhance, inputs=audio_in, outputs=[audio_out, spec_plot])
107
 
108
  demo.launch()