import torch
import torch.nn.functional as F
import numpy as np
import librosa
import sounddevice as sd

from transformers import WavLMModel, T5Tokenizer, T5EncoderModel

from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG
import utils
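
# Real-time voice conversion pipeline: WavLM extracts content features from
# the microphone stream, FreeVC re-synthesizes them in a target voice, and
# DreamVG generates the target speaker embedding from a text prompt.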

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()

lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
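
# DreamVG maps the encoded text prompt to a speaker embedding that FreeVC
# accepts as its conditioning vector g (see target_embedding below).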

# Sizes are in samples at the 48 kHz stream rate: ~0.98 s chunks with a
# 20 ms overlap used to crossfade consecutive chunks.
CHUNK_SIZE = 47040
OVERLAP = 960
BUFFER_SIZE = OVERLAP + CHUNK_SIZE
fade_size = OVERLAP

# Flat window with raised-cosine fades at both ends (not a full Hann window);
# the fade-out of one chunk and the fade-in of the next sum to ~1.
HANN_WINDOW = np.ones(BUFFER_SIZE)
HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1]

input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
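
# Persistent streaming state: the last OVERLAP samples of each buffer carry
# over into the next callback so consecutive chunks can be crossfaded.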

@torch.no_grad()
def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
    """Process audio in chunks with overlap and manage input/output buffers."""
    global input_buffer, output_buffer

    # Shift the previous tail to the front and append the fresh chunk.
    input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
    input_buffer[OVERLAP:] = audio_chunk

    # FreeVC and WavLM operate at 16 kHz; the stream runs at 48 kHz.
    chunk = librosa.resample(input_buffer, orig_sr=48000, target_sr=16000)

    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
    chunk_tensor = F.pad(chunk_tensor, (40, 40))

    # Content features from WavLM, then re-synthesis in the target voice.
    c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2)
    audio = freevc.infer(c, g=tgt_embedding)
    audio = audio[0][0].data.cpu().float().numpy()

    audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)
    # The model's output length can drift by a few samples; pin it to the buffer.
    if len(audio) < BUFFER_SIZE:
        audio = np.pad(audio, (0, BUFFER_SIZE - len(audio)))
    else:
        audio = audio[:BUFFER_SIZE]

    windowed_output = audio * HANN_WINDOW

    # Overlap-add: keep the faded-out tail of the previous chunk and add the
    # faded-in head of the current one.
    output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
    output_buffer[OVERLAP:] = 0
    output_buffer += windowed_output

    # In the returned region the overlapping fades sum to ~1; dividing by the
    # accumulated window values corrects the residual deviation.
    normalization_factors = np.zeros(BUFFER_SIZE)
    normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:]
    normalization_factors += HANN_WINDOW
    normalization_factors = np.clip(normalization_factors, 1e-6, None)

    return output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE]

def prepare_target_embedding(tgt_audio_path):
    """Preprocess target audio and get speaker embedding."""
    wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
    return g_tgt
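
# Alternative to the text prompt below: derive the target embedding from a
# reference recording instead (the path here is only a placeholder), e.g.
# target_embedding = prepare_target_embedding('path/to/reference.wav')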

prompt = "A young girl voice, very cute"
prompt_guidance_scale = 3.0  # >1 applies classifier-free guidance toward the prompt

text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

SAMPLING_RATE = 48000
INPUT_DEVICE = 69
OUTPUT_DEVICE = 58
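# These indices are machine-specific; list the devices on your system with
# `python -m sounddevice`, or in code:
# print(sd.query_devices())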

def audio_callback(indata, outdata, frames, time, status):
    """Callback function for real-time audio processing with input and output buffers."""
    if status:
        print(f"Status: {status}")

    mono = indata[:, 0]
    converted_audio = convert_realtime_with_buffers(mono, target_embedding, freevc, cmodel)
    outdata[:] = converted_audio.reshape(-1, 1)

with sd.Stream(
        samplerate=SAMPLING_RATE,
        blocksize=CHUNK_SIZE,
        channels=1,
        dtype='float32',
        latency='low',
        device=(INPUT_DEVICE, OUTPUT_DEVICE),
        callback=audio_callback):
    try:
        sd.sleep(1000000)
    except KeyboardInterrupt:
        print("Voice conversion stopped.")