File size: 5,769 Bytes

0dabde8

import os
import torch
import torch.nn.functional as F
import librosa
import sounddevice as sd
from transformers import WavLMModel
from scipy.io.wavfile import write
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
import utils
import numpy as np
from transformers import T5Tokenizer, T5EncoderModel
from src.plugin_wrapper import DreamVG


# Model and configuration setup.
# Everything below executes at import time and loads weights from disk or
# from the Hugging Face hub, so importing this module is slow.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# FreeVC synthesizer: hyper-parameters come from the JSON config; the two
# positional sizes are derived from the STFT and segment settings in it.
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
freevc = freevc.to(device)
freevc.eval()
# The third argument is passed as None (presumably the optimizer slot, which
# is not needed for inference; confirm against utils.load_checkpoint).
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

# Speaker encoder used by prepare_target_embedding() for reference audio.
print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

# WavLM provides the content features that are fed to FreeVC.
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large")
cmodel = cmodel.to(device)

# Text encoder (tokenizer + T5 encoder) used to embed the voice prompt.
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path)
text_encoder = text_encoder.to(device).eval()

# DreamVG plugin that turns the encoded prompt into a target embedding.
dreamvg = DreamVG(
    config_path='src/configs/plugin_cross.yaml',
    ckpt_path='checkpoints/dreamvc_plugin.pt',
    device=device,
)


# Constants for overlap-add. All sizes are in samples at the stream rate.
CHUNK_SIZE = 47040                    # new samples handled per processing call
OVERLAP = 960                         # samples shared between consecutive buffers
BUFFER_SIZE = OVERLAP + CHUNK_SIZE    # samples converted per call
fade_size = OVERLAP

# Crossfade window: flat (1.0) in the middle with raised-cosine ramps of
# `fade_size` samples at both ends.
# The fade-out is defined as the exact complement of the fade-in
# (fade_in[k] + fade_out[k] == 1). The tail of one buffer is added to the head
# of the next, so complementary ramps give unity gain across the overlap.
# Simply reversing the fade-in is off by one sample and leaves a small gain
# ripple at every block boundary.
HANN_WINDOW = np.ones(BUFFER_SIZE)
HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
HANN_WINDOW[-fade_size:] = 1.0 - HANN_WINDOW[:fade_size]

# Persistent state shared across processing calls.
# input_buffer: previous OVERLAP input samples followed by the newest chunk.
# output_buffer: overlap-add accumulator; its last OVERLAP samples hold the
# faded-out tail waiting to be mixed with the next block.
input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)


@torch.no_grad()
def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
    """Convert one block of input audio and crossfade it with the previous block.

    The last ``OVERLAP`` input samples from the previous call are kept in
    front of ``audio_chunk`` so that every inference sees ``BUFFER_SIZE``
    samples. The converted signal is multiplied by ``HANN_WINDOW`` and
    overlap-added into the module-level ``output_buffer``. The final
    ``OVERLAP`` samples stay in that buffer and are mixed with the head of
    the next call's output.

    This function mutates the module-level ``input_buffer`` and
    ``output_buffer`` and is therefore stateful and not reentrant.

    Args:
        audio_chunk: 1-D array holding exactly ``CHUNK_SIZE`` samples at
            48 kHz (it is copied into ``input_buffer[OVERLAP:]``).
        tgt_embedding: Speaker conditioning handed to ``freevc.infer`` as ``g``.
        freevc: Synthesizer exposing ``infer(c, g=...)``.
        cmodel: Content encoder whose call result has ``last_hidden_state``.

    Returns:
        A new float32 array with ``CHUNK_SIZE`` converted samples at 48 kHz.
    """
    # `output_buffer += ...` below rebinds the name, so it must be declared
    # global. The window and size constants are only read.
    global input_buffer, output_buffer

    # Slide the input window: keep the newest OVERLAP samples of the previous
    # buffer as context, then append the incoming chunk.
    input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
    input_buffer[OVERLAP:] = audio_chunk

    # The models run at 16 kHz while the stream delivers 48 kHz.
    chunk = librosa.resample(input_buffer, orig_sr=48000, target_sr=16000)

    # Convert to a (1, samples) tensor and pad 40 samples on each side.
    # NOTE(review): the padding presumably compensates for the encoder's
    # convolutional receptive field so the frame count matches the hop size;
    # confirm before changing.
    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
    chunk_tensor = F.pad(chunk_tensor, (40, 40))

    # Extract content features and move the feature axis in front of time.
    c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device)

    # Generate converted audio conditioned on the target embedding.
    audio = freevc.infer(c, g=tgt_embedding)
    audio = audio[0][0].data.cpu().float().numpy()

    # Upsample back to the stream rate.
    audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)

    # Model framing and resampling can leave the result a few samples off.
    # Force it to BUFFER_SIZE so the windowing below cannot raise a broadcast
    # error inside the audio callback.
    n_samples = audio.shape[0]
    if n_samples < BUFFER_SIZE:
        audio = np.pad(audio, (0, BUFFER_SIZE - n_samples))
    elif n_samples > BUFFER_SIZE:
        audio = audio[:BUFFER_SIZE]

    # Fade the edges so consecutive blocks can be crossfaded.
    windowed_output = audio * HANN_WINDOW

    # Overlap-add: move the previous faded-out tail to the front, clear the
    # rest, and accumulate the new block on top of it.
    output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
    output_buffer[OVERLAP:] = 0
    output_buffer += windowed_output

    # Return a copy so callers never hold a view into state that the next call
    # overwrites. The trailing OVERLAP samples are deliberately withheld.
    return output_buffer[:CHUNK_SIZE].copy()


def prepare_target_embedding(tgt_audio_path):
    """Build a speaker embedding tensor from a reference recording.

    The file is loaded at 16 kHz, trimmed with a 20 dB threshold, and embedded
    with the module-level speaker encoder ``smodel``.

    Args:
        tgt_audio_path: Path of the target speaker's audio file.

    Returns:
        The embedding as a tensor on ``device`` with a leading dimension of
        size 1 added in front.
    """
    waveform, _sr = librosa.load(tgt_audio_path, sr=16000)
    # Drop leading/trailing segments quieter than 20 dB below the peak.
    trimmed, _interval = librosa.effects.trim(waveform, top_db=20)
    embedding = torch.from_numpy(smodel.embed_utterance(trimmed))
    return embedding.unsqueeze(0).to(device)


# Prepare the target speaker embedding from a text description of the voice.
# To use a reference recording instead, replace the prompt pipeline below with:
#     target_audio = "p225_001.wav"  # Target speaker audio
#     target_embedding = prepare_target_embedding(target_audio)
prompt = "A young girl voice, very cute"
prompt_guidance_scale = 3.0

# Tokenize to a fixed length of 32 tokens (padded or truncated as needed).
text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
# Inference only: disable autograd so the encoder's graph is not built and
# kept alive for as long as `text` exists.
with torch.no_grad():
    text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
# Sample the speaker embedding from the prompt.
# random_seed=None presumably means results differ between runs; confirm
# against DreamVG.inference if reproducibility matters.
target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

# Audio stream configuration
SAMPLING_RATE = 48_000  # Hz, passed as the stream's samplerate
# Device identifiers passed to sd.Stream(device=(input, output)).
# NOTE(review): these look like host-specific device indices; verify them on
# the target machine before running.
INPUT_DEVICE, OUTPUT_DEVICE = 69, 58


def audio_callback(indata, outdata, frames, time, status):
    """Stream callback: convert the incoming block and write it to the output.

    Args:
        indata: 2-D input block; only the first column (channel 0) is used.
        outdata: Output block, filled in place with the converted audio.
        frames: Unused here.
        time: Unused here.
        status: Printed when truthy.
    """
    if status:
        print(f"Status: {status}")

    # Take the first channel as a 1-D mono signal.
    mono_in = indata[:, 0]

    # Run the conversion and store it as a single-column block.
    outdata[:] = convert_realtime_with_buffers(
        mono_in, target_embedding, freevc, cmodel
    ).reshape(-1, 1)


# Open the duplex audio stream. Each block of CHUNK_SIZE frames is converted
# inside `audio_callback`. The stream is started on entering the `with` block
# and closed on leaving it.
with sd.Stream(
        samplerate=SAMPLING_RATE,
        blocksize=CHUNK_SIZE,
        channels=1,
        dtype='float32',
        latency='low',
        device=(INPUT_DEVICE, OUTPUT_DEVICE),
        callback=audio_callback) as stream:
    try:
        # Keep the main thread alive for as long as the stream is running.
        # A single fixed sleep would end the session after a set time, and
        # would keep waiting even if the stream had already stopped (for
        # example after an exception in the callback).
        while stream.active:
            sd.sleep(200)
    except KeyboardInterrupt:
        print("Voice conversion stopped.")