import librosa
import sounddevice as sd
import soundfile as sf
import torch
import torch.nn.functional as F
from transformers import T5EncoderModel, T5Tokenizer, WavLMModel

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
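# Load the pretrained pieces: the FreeVC synthesizer, a speaker encoder,
# WavLM (content features), FLAN-T5 (prompt encoder), and the DreamVG plugin.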
print("Loading FreeVC...")
|
|
hps = utils.get_hparams_from_file("configs/freevc.json")
|
|
freevc = SynthesizerTrn(
|
|
hps.data.filter_length // 2 + 1,
|
|
hps.train.segment_size // hps.data.hop_length,
|
|
**hps.model).to(device)
|
|
freevc.eval()
|
|
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
|
|
|
|
print("Loading Speaker Encoder...")
|
|
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
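# Note: smodel is never used below; the target speaker embedding comes from
# the text prompt via DreamVG rather than from a reference recording.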
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()

# FLAN-T5 encodes the text prompt; the DreamVG plugin maps that encoding
# to a FreeVC-compatible speaker embedding.
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
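# Describe the target voice in free text. guidance_scale > 1 trades sample
# diversity for closer adherence to the prompt (classifier-free guidance).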
prompt = "girl's voice, very young and cute"
prompt_guidance_scale = 3.0

text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text = text_batch.input_ids.to(device)
text_mask = text_batch.attention_mask.to(device)
with torch.no_grad():
    text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)
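# Source speech -> content features: load at the 16 kHz rate WavLM expects.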
audio, sr = librosa.load('segment_1.mp3', sr=16000)
audio = torch.from_numpy(audio).unsqueeze(0).to(device).float()
# Pad 40 samples per side: WavLM's conv front-end uses a 400-sample window
# with a 320-sample hop, so this yields one frame per hop of input.
audio = F.pad(audio, (40, 40))

with torch.no_grad():
    c = cmodel(audio).last_hidden_state.transpose(1, 2)  # (batch, hidden, frames)

# Re-synthesize the content with the prompt-derived voice.
with torch.no_grad():
    audio = freevc.infer(c, g=target_embedding)
audio = audio[0][0].data.cpu().float().numpy()

sf.write('freevc_out.wav', audio, 16000)
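
# Optional: audition the result on the default output device. A minimal
# sketch using the already-imported sounddevice package; skip this on
# headless machines.
sd.play(audio, 16000)
sd.wait()  # block until playback finishes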