import torch

from examples.textless_nlp.gslm.unit2speech.tacotron2.model import Tacotron2
from examples.textless_nlp.gslm.unit2speech.tacotron2.waveglow_denoiser import (
    Denoiser,
)


def load_quantized_audio_from_file(file_path):
    """Read lines of the form "<utterance id>|<space-separated unit ids>" into
    parallel lists of base file names and quantized-unit sequences."""
    base_fname_batch, quantized_units_batch = [], []
    with open(file_path) as f:
        for line in f:
            base_fname, quantized_units_str = line.rstrip().split("|")
            quantized_units = [int(q) for q in quantized_units_str.split(" ")]
            base_fname_batch.append(base_fname)
            quantized_units_batch.append(quantized_units)
    return base_fname_batch, quantized_units_batch


def synthesize_audio(model, waveglow, denoiser, inp, lab=None, strength=0.0):
    """Run Tacotron2 inference on a single utterance and vocode it with WaveGlow.

    `inp` must be a batch of exactly one unit sequence; `lab` is an optional
    integer conditioning label, and `strength` controls the denoiser.
    """
    assert inp.size(0) == 1
    inp = inp.cuda()
    if lab is not None:
        lab = torch.LongTensor(1).cuda().fill_(lab)

    with torch.no_grad():
        # `ali` (the attention alignment) is computed but not returned.
        _, mel, _, ali, has_eos = model.inference(inp, lab, ret_has_eos=True)
        aud = waveglow.infer(mel, sigma=0.666)
        aud_dn = denoiser(aud, strength=strength).squeeze(1)
    return mel, aud, aud_dn, has_eos


def load_tacotron(tacotron_model_path, max_decoder_steps):
    """Load a Tacotron2 checkpoint onto the GPU in half precision."""
    ckpt_dict = torch.load(tacotron_model_path)
    hparams = ckpt_dict["hparams"]
    # Override the checkpoint's decoder-step limit so long inputs are not truncated.
    hparams.max_decoder_steps = max_decoder_steps
    sr = hparams.sampling_rate
    model = Tacotron2(hparams)
    model.load_state_dict(ckpt_dict["model_dict"])
    model = model.cuda().eval().half()
    return model, sr, hparams


def load_waveglow(waveglow_path):
    """Load a WaveGlow vocoder checkpoint and build a matching denoiser."""
    waveglow = torch.load(waveglow_path)["model"]
    waveglow = waveglow.cuda().eval().half()
    # Keep the invertible 1x1 convolutions in fp32; they are numerically
    # fragile in half precision.
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    return waveglow, denoiser
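

# Usage sketch (illustrative, not part of the original module): one way to wire
# the helpers above into an end-to-end unit-to-speech pass. The checkpoint
# paths, the quantized-unit file name, and the soundfile dependency are all
# hypothetical assumptions.
if __name__ == "__main__":
    import soundfile as sf  # assumed available for writing wav files

    model, sr, _hparams = load_tacotron(
        "checkpoints/tts_checkpoint.pt",  # hypothetical path
        max_decoder_steps=2000,
    )
    waveglow, denoiser = load_waveglow("checkpoints/waveglow.pt")  # hypothetical path

    names, units_batch = load_quantized_audio_from_file("quantized_units.txt")  # hypothetical file
    for name, units in zip(names, units_batch):
        # synthesize_audio expects a batch of exactly one utterance.
        inp = torch.LongTensor(units).unsqueeze(0)
        _mel, _aud, aud_dn, _has_eos = synthesize_audio(
            model, waveglow, denoiser, inp, strength=0.1
        )
        sf.write(f"{name}.wav", aud_dn[0].cpu().float().numpy(), sr)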