import torch
import librosa
import soundfile as sf

from transformers import WavLMModel

from .freevc.utils import load_checkpoint, get_hparams_from_file
from .freevc.models import SynthesizerTrn
# Assumption: the speaker encoder sits alongside the other FreeVC modules,
# mirroring the upstream FreeVC repo layout (speaker_encoder/voice_encoder.py).
from .freevc.speaker_encoder.voice_encoder import SpeakerEncoder

def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'):
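    # Build the FreeVC synthesizer from its JSON config and restore its weights.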
    hps = get_hparams_from_file(f"{path}/freevc.json")
    freevc = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model).to(device)
    freevc.eval()
    load_checkpoint(f"{path}/freevc.pth", freevc, None)
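
    # WavLM-Large supplies the content features that FreeVC converts.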
    cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
    cmodel.eval()

    # Speaker encoder for target-speaker embeddings (used via embed_utterance
    # below); the constructor takes a checkpoint path, as in upstream FreeVC.
    smodel = SpeakerEncoder(speaker_path)

    return freevc, cmodel, smodel, hps


@torch.no_grad()
def convert(freevc, content, speaker):
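    # Synthesize a waveform from content features conditioned on the speaker
    # embedding; the base FreeVC checkpoint generates audio at 16 kHz.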
    audio = freevc.infer(content, g=speaker)
    audio = audio[0][0].data.cpu().float().numpy()
    return audio, 16000


if __name__ == '__main__':
    freevc, cmodel, smodel, hps = get_freevc_models()
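
    # tgt provides the voice to convert into; src provides the speech content.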
    tgt = 'p226_002.wav'
    src = 'p225_001.wav'
    device = 'cuda'
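
    # Trim silence from the target, then compute its speaker embedding.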
    wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
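
    # Extract WavLM content features; FreeVC expects (batch, channels, frames).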
    wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
    with torch.no_grad():
        content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
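
    # Synthesize in the target voice and write the result to disk.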
    output, sr = convert(freevc, content, g_tgt)
    sf.write('output.wav', output, sr)