import torch
import torch.nn.functional as F
import librosa
import soundfile as sf
import utils
from transformers import WavLMModel, T5Tokenizer, T5EncoderModel
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG

# Load configurations and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

print("Loading Speaker Encoder...")
# Used when conditioning on a reference recording rather than a text prompt
# (see the optional sketch at the end of this script).
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)

# Text encoder (Flan-T5) turns the style prompt into token embeddings
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

# DreamVG plugin: a diffusion model that maps text embeddings to a speaker embedding
dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)

# Encode the style prompt with T5
prompt = "girl's voice, very young and cute"
prompt_guidance_scale = 3.0
text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True,
                       return_tensors="pt")
text_ids = text_batch.input_ids.to(device)
text_mask = text_batch.attention_mask.to(device)
with torch.no_grad():
    text_emb = text_encoder(input_ids=text_ids, attention_mask=text_mask)[0]

# Sample a target speaker embedding from the text prompt
target_embedding = dreamvg.inference([text_emb, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

# Load the source utterance, convert to a tensor, and pad
audio, sr = librosa.load('segment_1.mp3', sr=16000)
audio = torch.from_numpy(audio).unsqueeze(0).float().to(device)
audio = F.pad(audio, (40, 40))

with torch.no_grad():
    # Extract content features using WavLM
    c = cmodel(audio).last_hidden_state.transpose(1, 2)
    # Synthesize with FreeVC, conditioned on the generated speaker embedding
    converted = freevc.infer(c, g=target_embedding)

audio_out = converted[0][0].data.cpu().float().numpy()
sf.write('freevc_out.wav', audio_out, sr)
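
# ------------------------------------------------------------------
# Optional: condition on a reference recording instead of a text prompt.
# A minimal, untested sketch assuming the speaker encoder loaded above
# exposes `embed_utterance` (as in FreeVC) and that a 16 kHz reference
# file 'reference.wav' (hypothetical path) exists. Uncomment to try.
# ------------------------------------------------------------------
# ref_wav, _ = librosa.load('reference.wav', sr=16000)
# g_ref = smodel.embed_utterance(ref_wav)                  # numpy speaker embedding
# g_ref = torch.from_numpy(g_ref).unsqueeze(0).to(device)  # shape (1, 256)
# with torch.no_grad():
#     ref_out = freevc.infer(c, g=g_ref)                   # reuse the WavLM content `c`
# sf.write('freevc_ref_out.wav', ref_out[0][0].cpu().float().numpy(), sr)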