import librosa
import torch
from demucs.apply import apply_model
from demucs.pretrained import get_model
from scipy.io.wavfile import write

# Sample rate the audio is loaded at before being passed to the model.
MODEL_SAMPLE_RATE = 44100
# Sample rate of the WAV file written by ``denoise``.
OUTPUT_SAMPLE_RATE = 48000

# Loaded once at import time so repeated ``denoise`` calls reuse the model.
demucs_model = get_model('cfa93e08')


def denoise(filename: str, device: str, out_filename: str = 'denoise.wav') -> str:
    """Separate one source from an audio file and write it as a mono WAV.

    The file is loaded at 44.1 kHz, normalised, run through ``demucs_model``,
    and the last separated source is peak-limited, mixed down to mono,
    resampled to 48 kHz and written to ``out_filename``.

    Args:
        filename: Path of the audio file to process.
        device: Device string forwarded to ``apply_model`` (e.g. ``'cuda'``
            or ``'cpu'``).
        out_filename: Path of the WAV file to write.

    Returns:
        ``out_filename``, unchanged.
    """
    wav_ref, _ = librosa.load(filename, mono=False, sr=MODEL_SAMPLE_RATE)
    wav = torch.tensor(wav_ref)

    # A mono file loads as a 1-D array; duplicate it into two channels.
    if wav.ndim == 1:
        wav = torch.stack([wav, wav])

    # Normalise with the statistics of the mono mix. The SAME mean/std must
    # be used to undo the normalisation below, otherwise the separated
    # sources come back at the wrong level. The epsilon keeps a silent
    # (zero-variance) input from producing NaNs.
    ref = wav.mean(0)
    ref_mean = ref.mean()
    ref_std = ref.std() + 1e-8
    wav = (wav - ref_mean) / ref_std

    sources = apply_model(
        demucs_model,
        wav[None],
        device=device,
        shifts=1,
        split=True,
        overlap=0.1,
        progress=True,
        num_workers=0,
    )[0]
    sources = sources * ref_std + ref_mean

    # NOTE(review): assumes the model's last source is the vocal stem —
    # confirm against ``demucs_model.sources``.
    vocal_wav = sources[-1]

    # Scale down only when the peak would otherwise exceed full scale.
    peak = vocal_wav.abs().max()
    vocal_wav = vocal_wav / torch.clamp(1.01 * peak, min=1)

    # detach/cpu are no-ops for a CPU tensor without grad, but keep the
    # conversion safe if the result ever lives elsewhere.
    vocal_np = vocal_wav.detach().cpu().numpy()
    vocal_np = librosa.to_mono(vocal_np)
    vocal_np = librosa.resample(
        vocal_np, orig_sr=MODEL_SAMPLE_RATE, target_sr=OUTPUT_SAMPLE_RATE
    )

    write(out_filename, OUTPUT_SAMPLE_RATE, vocal_np)
    return out_filename


if __name__ == '__main__':
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    denoise(filename='../oxx.wav', device=device)