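"""Real-time voice conversion: FreeVC with the DreamVC text-prompt plugin.

Content features come from WavLM-Large, the target speaker embedding is generated
from a text prompt (Flan-T5 + DreamVG), and audio is streamed through sounddevice
using chunked overlap-add crossfading at 48 kHz.
"""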
import os
import torch
import torch.nn.functional as F
import librosa
import sounddevice as sd
from transformers import WavLMModel
from scipy.io.wavfile import write
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
import utils
import numpy as np
from transformers import T5Tokenizer, T5EncoderModel
from src.plugin_wrapper import DreamVG
# Load configurations and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
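# Text-prompt encoder (Flan-T5) and the DreamVC plugin that maps a voice description to a speaker embedding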
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
# Constants for chunked overlap-add processing (all values are sample counts at 48 kHz)
CHUNK_SIZE = 47040
OVERLAP = 960
BUFFER_SIZE = OVERLAP + CHUNK_SIZE
fade_size = OVERLAP
HANN_WINDOW = np.ones(BUFFER_SIZE)
HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1]
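# Note: despite its name, HANN_WINDOW is a flat-top crossfade window (ones with raised-cosine
# fade-in/out ramps of length OVERLAP at each end). At 48 kHz the buffer is 48,000 samples (1.00 s),
# of which 47,040 samples (0.98 s) are new audio per block and 960 samples (20 ms) are crossfade overlap.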
# Initialize buffers
input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
@torch.no_grad()
def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
    """Process audio in chunks with overlap and manage input/output buffers."""
    global input_buffer, output_buffer, HANN_WINDOW, BUFFER_SIZE, CHUNK_SIZE

    # Shift the previous chunk's tail to the front, then append the incoming audio chunk
    input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
    input_buffer[OVERLAP:] = audio_chunk

    # Downsample from 48,000 Hz to the 16,000 Hz rate expected by WavLM and FreeVC
    chunk = input_buffer
    chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000)

    # Convert to tensor and pad slightly so the converted output covers the full buffer
    # after upsampling back to 48 kHz
    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
    chunk_tensor = F.pad(chunk_tensor, (40, 40))

    # Extract content features using WavLM
    c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device)

    # Generate converted audio using FreeVC, conditioned on the target speaker embedding
    audio = freevc.infer(c, g=tgt_embedding)
    audio = audio[0][0].data.cpu().float().numpy()

    # Upsample back to 48,000 Hz
    audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)

    # Apply the crossfade window to the output
    windowed_output = audio * HANN_WINDOW

    # Carry the previous buffer's faded-out tail to the front, then overlap-add the new output
    output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
    output_buffer[OVERLAP:] = 0
    output_buffer += windowed_output

    # Optional gain normalization of the crossfaded region (currently disabled)
    normalization_factors = np.zeros(BUFFER_SIZE)
    normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:]
    normalization_factors += HANN_WINDOW
    normalization_factors = np.clip(normalization_factors, 1e-6, None)
    # output_buffer[:CHUNK_SIZE] = output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE]

    return output_buffer[:CHUNK_SIZE]
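# Each call consumes one CHUNK_SIZE block at 48 kHz and returns the same number of samples;
# the trailing OVERLAP samples of every processed buffer are held back and crossfaded into
# the head of the next call's output, hiding the chunk boundaries.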
def prepare_target_embedding(tgt_audio_path):
    """Preprocess target audio and get speaker embedding."""
    wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
    return g_tgt
# Prepare the target speaker embedding
# target_audio = "p225_001.wav" # Target speaker audio
# target_embedding = prepare_target_embedding(target_audio)
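# Text-prompt path: encode a natural-language description of the target voice with Flan-T5,
# then let the DreamVC plugin (DreamVG) turn it into a speaker embedding for FreeVC.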
prompt = "A young girl voice, very cute"
prompt_guidance_scale = 3.0
text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)
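# target_embedding plays the same role as g_tgt from prepare_target_embedding(): it is passed
# to freevc.infer(..., g=target_embedding) for every audio block in the streaming callback below.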
# Stream settings
SAMPLING_RATE = 48000
INPUT_DEVICE = 69
OUTPUT_DEVICE = 58
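# Device indices are machine-specific; run sd.query_devices() to list the audio devices
# on your system and adjust INPUT_DEVICE / OUTPUT_DEVICE accordingly.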
def audio_callback(indata, outdata, frames, time, status):
    """Callback function for real-time audio processing with input and output buffers."""
    global input_buffer, output_buffer
    if status:
        print(f"Status: {status}")

    # Reshape and process input audio
    indata = indata[:, 0]  # Mono input
    converted_audio = convert_realtime_with_buffers(indata, target_embedding, freevc, cmodel)

    # Write the converted audio to the output stream
    outdata[:] = converted_audio.reshape(-1, 1)
# Start the full-duplex audio stream with the real-time conversion callback
with sd.Stream(
        samplerate=SAMPLING_RATE,
        blocksize=CHUNK_SIZE,
        channels=1,
        dtype='float32',
        latency='low',
        device=(INPUT_DEVICE, OUTPUT_DEVICE),
        callback=audio_callback):
    try:
        # Keep the stream alive (sd.sleep takes milliseconds) until interrupted with Ctrl+C
        sd.sleep(1000000)
    except KeyboardInterrupt:
        print("Voice conversion stopped.")