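"""Text-guided voice conversion demo: DreamVG samples a FreeVC speaker
embedding from a natural-language prompt, and FreeVC re-synthesizes the
source utterance in that voice."""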
import torch
import torch.nn.functional as F
import librosa
import soundfile as sf
from transformers import T5Tokenizer, T5EncoderModel, WavLMModel

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG
# Load configurations and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
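# The speaker encoder is only needed when deriving the target embedding from
# reference audio; it is not used in the text-prompt path below.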
print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()
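# Flan-T5 encodes the text prompt for the DreamVG diffusion plugin.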
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
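# Describe the target voice in natural language; higher guidance scales make
# the generated embedding follow the prompt more closely.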
prompt = "girl's voice, very young and cute"
prompt_guidance_scale = 3.0
text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
with torch.no_grad():
    text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
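# Sample a FreeVC-compatible speaker embedding from the prompt with DDIM.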
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)
# Load the source utterance at 16 kHz, convert it to a tensor, and pad the
# waveform edges before WavLM feature extraction
audio, sr = librosa.load('segment_1.mp3', sr=16000)
audio = torch.from_numpy(audio).unsqueeze(0).to(device).float()
audio = F.pad(audio, (40, 40))
# Extract content features using WavLM
with torch.no_grad():
    c = cmodel(audio).last_hidden_state.transpose(1, 2).to(device)
# Convert the content features to the target voice and save the result
# (FreeVC operates at 16 kHz)
with torch.no_grad():
    audio = freevc.infer(c, g=target_embedding)
audio = audio[0][0].data.cpu().float().numpy()
sf.write('freevc_out.wav', audio, 16000)