import torch
import torch.nn.functional as F
import numpy as np
import librosa
import sounddevice as sd

from transformers import WavLMModel, T5Tokenizer, T5EncoderModel

from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG
import utils
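
# Real-time voice conversion pipeline: WavLM extracts content features from
# the microphone stream, FreeVC re-synthesizes them in a target voice, and
# DreamVG generates the target speaker embedding from a text prompt.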

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device).eval()

lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)
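
# DreamVG maps the encoded text prompt to a speaker embedding that FreeVC
# accepts as its conditioning vector g (see target_embedding below).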

# Sizes are in samples at the 48 kHz stream rate: ~0.98 s chunks with a
# 20 ms overlap used to crossfade consecutive chunks.
CHUNK_SIZE = 47040
OVERLAP = 960
BUFFER_SIZE = OVERLAP + CHUNK_SIZE
fade_size = OVERLAP

# Flat window with raised-cosine fades at both ends (not a full Hann window);
# the fade-out of one chunk and the fade-in of the next sum to ~1.
HANN_WINDOW = np.ones(BUFFER_SIZE)
HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1]

input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
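
# Persistent streaming state: the last OVERLAP samples of each buffer carry
# over into the next callback so consecutive chunks can be crossfaded.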

@torch.no_grad()
def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
    """Process audio in chunks with overlap and manage input/output buffers."""
    global input_buffer, output_buffer

    # Shift the previous tail to the front and append the fresh chunk.
    input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
    input_buffer[OVERLAP:] = audio_chunk

    # FreeVC and WavLM operate at 16 kHz; the stream runs at 48 kHz.
    chunk = librosa.resample(input_buffer, orig_sr=48000, target_sr=16000)

    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
    chunk_tensor = F.pad(chunk_tensor, (40, 40))

    # Content features from WavLM, then re-synthesis in the target voice.
    c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2)
    audio = freevc.infer(c, g=tgt_embedding)
    audio = audio[0][0].data.cpu().float().numpy()

    audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)
    # The model's output length can drift by a few samples; pin it to the buffer.
    if len(audio) < BUFFER_SIZE:
        audio = np.pad(audio, (0, BUFFER_SIZE - len(audio)))
    else:
        audio = audio[:BUFFER_SIZE]

    windowed_output = audio * HANN_WINDOW

    # Overlap-add: keep the faded-out tail of the previous chunk and add the
    # faded-in head of the current one.
    output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
    output_buffer[OVERLAP:] = 0
    output_buffer += windowed_output

    # In the returned region the overlapping fades sum to ~1; dividing by the
    # accumulated window values corrects the residual deviation.
    normalization_factors = np.zeros(BUFFER_SIZE)
    normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:]
    normalization_factors += HANN_WINDOW
    normalization_factors = np.clip(normalization_factors, 1e-6, None)

    return output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE]

def prepare_target_embedding(tgt_audio_path):
    """Preprocess target audio and get speaker embedding."""
    wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
    return g_tgt
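
# Alternative to the text prompt below: derive the target embedding from a
# reference recording instead (the path here is only a placeholder), e.g.
# target_embedding = prepare_target_embedding('path/to/reference.wav')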

prompt = "A young girl voice, very cute"
prompt_guidance_scale = 3.0  # >1 applies classifier-free guidance toward the prompt

text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

SAMPLING_RATE = 48000
INPUT_DEVICE = 69
OUTPUT_DEVICE = 58
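# These indices are machine-specific; list the devices on your system with
# `python -m sounddevice`, or in code:
# print(sd.query_devices())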

def audio_callback(indata, outdata, frames, time, status):
    """Callback function for real-time audio processing with input and output buffers."""
    if status:
        print(f"Status: {status}")

    mono = indata[:, 0]
    converted_audio = convert_realtime_with_buffers(mono, target_embedding, freevc, cmodel)
    outdata[:] = converted_audio.reshape(-1, 1)

with sd.Stream(
        samplerate=SAMPLING_RATE,
        blocksize=CHUNK_SIZE,
        channels=1,
        dtype='float32',
        latency='low',
        device=(INPUT_DEVICE, OUTPUT_DEVICE),
        callback=audio_callback):
    try:
        sd.sleep(1000000)
    except KeyboardInterrupt:
        print("Voice conversion stopped.")