# NOTE(review): this file arrived with every newline collapsed onto a single
# physical line; the code below is the identical token stream restored to
# conventional formatting, with documentation added. No logic was changed.

import json
import torch
from tqdm import tqdm
import torchaudio
import librosa
import os
import math
import numpy as np

from get_melvaehifigan48k import build_pretrained_models
import tools.torch_tools as torch_tools


class Tango:
    """Holds a pretrained mel-VAE and STFT pair (48 kHz) for audio generation.

    NOTE(review): this class is truncated in the visible chunk — the body of
    ``sound2sound_generate_longterm`` is cut off mid-expression below.
    """

    def __init__(self, device="cuda:0"):
        # Fixed model sample rate; build_pretrained_models() is presumably
        # configured for 48 kHz to match — TODO confirm against that module.
        self.sample_rate = 48000
        self.device = device
        self.vae, self.stft = build_pretrained_models()
        # Inference-only usage: put both models in eval mode on the target device.
        self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)
        # print(sum(p.numel() for p in self.vae.parameters()));exit()

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        # Drop a singleton channel dimension if present (4-D -> 3-D input).
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)
        # NOTE(review): self.vocoder is never assigned in the __init__ shown
        # here (only self.vae and self.stft are set) — presumably attached
        # elsewhere; verify before calling this method.
        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    def sound2sound_generate_longterm(self, fname, batch_size=1, duration=10.24, steps=200, disable_progress=False):
        """ Generate audio without condition. """
        # Latent frame count: appears to assume 100 frames/sec of audio with an
        # 8x downsampling factor — TODO confirm against the VAE configuration.
        num_frames = math.ceil(duration * 100. / 8)
        with torch.no_grad():
            orig_samples, fs = torchaudio.load(fname)
            # NOTE(review): the source is truncated mid-expression here; the
            # remainder of this method is not visible in this chunk.
            if(orig_samples.shape[-1]