Spaces:

sp-uhh
/

test

Running

File size: 2,269 Bytes

740a68f
 
 
8a51868
a297d3b
d0dbc2d
8a4b264
 
 
 
 
 
 
 
 
 
 
 
740a68f
8a4b264
8a51868
740a68f
 
e6e1f50
8a4b264
a297d3b
 
 
 
8a4b264
a297d3b
8a4b264
1acbf3a
8a4b264
a297d3b
 
8a4b264
 
 
a297d3b
8a4b264
a297d3b
 
 
 
 
8a4b264
740a68f
 
8a4b264
740a68f
 
 
 
 
 
 
 
 
 
38a61d7

import torch
import torchaudio
from sgmse.model import ScoreModel
import gradio as gr
from sgmse.util.other import pad_spec

# Inference settings, modelled on the argparse configuration in enhancement.py.
args = {
    # Directory entries kept for parity with enhancement.py; the code visible
    # in this file does not read them.
    "test_dir": "./test_data",  # example directory, adjust as needed
    "enhanced_dir": "./enhanced_data",  # example directory, adjust as needed
    # Checkpoint location handed to ScoreModel.load_from_checkpoint below.
    "ckpt": "https://huggingface.co/sp-uhh/speech-enhancement-sgmse/resolve/main/train_vb_29nqe0uh_epoch%3D115.ckpt",
    # Sampler settings forwarded to model.get_pc_sampler in enhance_speech.
    "corrector": "ald",
    "corrector_steps": 1,
    "snr": 0.5,
    "N": 30,
    # Device the input tensors are moved to; the model is placed on it below.
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Load the pre-trained model directly onto the target device. Passing
# map_location and calling .to() makes the placement explicit instead of
# depending on the device the checkpoint tensors were saved from, so the model
# and the inputs moved to args["device"] always live on the same device.
model = ScoreModel.load_from_checkpoint(args["ckpt"], map_location=args["device"])
model.to(args["device"])

# The model is only used for inference here, so switch it to evaluation mode.
model.eval()

def enhance_speech(audio_file):
    """Enhance a recording with the loaded score model and return the result path.

    The waveform is peak-normalized, converted to the model's spectrogram
    representation, passed through the predictor-corrector sampler, converted
    back to a waveform of the original length, rescaled to the original peak
    level and written to a WAV file.

    Args:
        audio_file: Path of the audio file to enhance.

    Returns:
        Path of the WAV file holding the enhanced audio.

    Raises:
        ValueError: If the input file contains no audio samples.
    """
    # NOTE(review): the audio is processed at its native sample rate and with
    # its native channel count; confirm both match what the checkpoint expects.
    y, sr = torchaudio.load(audio_file)
    if y.numel() == 0:
        # Fail with a clear message instead of an opaque reduction error below.
        raise ValueError("Input audio contains no samples.")
    T_orig = y.size(1)

    # Peak-normalize. The floor keeps a completely silent input from causing a
    # 0/0 division, which would feed NaNs into the sampler.
    norm_factor = y.abs().max().clamp_min(torch.finfo(y.dtype).eps)
    y = y / norm_factor

    # Prepare DNN input: STFT -> model-specific forward transform -> add a
    # leading batch axis -> zero-pad the spectrogram.
    Y = torch.unsqueeze(model._forward_transform(model._stft(y.to(args["device"]))), 0)
    Y = pad_spec(Y, mode="zero_pad")

    # Reverse sampling with the configured corrector and step settings.
    sampler = model.get_pc_sampler(
        'reverse_diffusion', args["corrector"], Y.to(args["device"]),
        N=args["N"], corrector_steps=args["corrector_steps"], snr=args["snr"]
    )
    sample, _ = sampler()

    # Back to the time domain, trimmed to the original number of samples.
    x_hat = model.to_audio(sample.squeeze(), T_orig)

    # Undo the peak normalization.
    x_hat = x_hat * norm_factor.to(x_hat.device)

    # torchaudio.save needs a 2-D (channels, time) tensor. squeeze() above also
    # removes a singleton channel axis, so restore it when the result is 1-D.
    x_hat = x_hat.detach().cpu()
    if x_hat.dim() == 1:
        x_hat = x_hat.unsqueeze(0)

    # NOTE(review): the fixed file name is shared by all requests; concurrent
    # calls overwrite each other's output.
    output_file = 'enhanced_output.wav'
    torchaudio.save(output_file, x_hat, sr)

    return output_file

# Static text displayed on the demo page.
title = "Speech Enhancement using SGMSE"
description = "This Gradio demo uses the SGMSE model for speech enhancement. Upload your audio file to enhance it."
article = "<p style='text-align: center'><a href='https://huggingface.co/SP-UHH/speech-enhancement-sgmse' target='_blank'>Model Card</a></p>"

# Audio components exchange data as file paths: enhance_speech receives a
# path and returns the path of the file it wrote.
inputs = gr.Audio(type="filepath", label="Input Audio")
outputs = gr.Audio(type="filepath", label="Output Audio")

# Build the interface and start serving. share=True is deliberately left out
# because it is not supported on Hugging Face Spaces.
gr.Interface(
    fn=enhance_speech,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
).launch()