import librosa
import sounddevice as sd
import soundfile as sf
import torch
import torch.nn.functional as F
from transformers import T5EncoderModel, T5Tokenizer, WavLMModel

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
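# Load the pretrained pieces: the FreeVC synthesizer, a speaker encoder,
# WavLM (content features), FLAN-T5 (prompt encoder), and the DreamVG plugin.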
print("Loading FreeVC...")
|
|
hps = utils.get_hparams_from_file("configs/freevc.json")
|
|
freevc = SynthesizerTrn(
|
|
hps.data.filter_length // 2 + 1,
|
|
hps.train.segment_size // hps.data.hop_length,
|
|
**hps.model).to(device)
|
|
freevc.eval()
|
|
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
|
|
|
|
print("Loading Speaker Encoder...")
|
|
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
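# Note: smodel is never used below; the target speaker embedding comes from
# the text prompt via DreamVG rather than from a reference recording.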
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()

# FLAN-T5 encodes the text prompt; the DreamVG plugin maps that encoding
# to a FreeVC-compatible speaker embedding.
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
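# Describe the target voice in free text. guidance_scale > 1 trades sample
# diversity for closer adherence to the prompt (classifier-free guidance).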
prompt = "girl's voice, very young and cute"
prompt_guidance_scale = 3.0

text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text = text_batch.input_ids.to(device)
text_mask = text_batch.attention_mask.to(device)
with torch.no_grad():
    text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)
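# Source speech -> content features: load at the 16 kHz rate WavLM expects.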
audio, sr = librosa.load('segment_1.mp3', sr=16000)
audio = torch.from_numpy(audio).unsqueeze(0).to(device).float()
# Pad 40 samples per side: WavLM's conv front-end uses a 400-sample window
# with a 320-sample hop, so this yields one frame per hop of input.
audio = F.pad(audio, (40, 40))

with torch.no_grad():
    c = cmodel(audio).last_hidden_state.transpose(1, 2)  # (batch, hidden, frames)

# Re-synthesize the content with the prompt-derived voice.
with torch.no_grad():
    audio = freevc.infer(c, g=target_embedding)
audio = audio[0][0].data.cpu().float().numpy()

sf.write('freevc_out.wav', audio, 16000)
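
# Optional: audition the result on the default output device. A minimal
# sketch using the already-imported sounddevice package; skip this on
# headless machines.
sd.play(audio, 16000)
sd.wait()  # block until playback finishes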