import os
import torch
import torch.nn.functional as F
import librosa
import sounddevice as sd
from transformers import WavLMModel
from scipy.io.wavfile import write
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
import utils
import numpy as np
from transformers import T5Tokenizer, T5EncoderModel
from src.plugin_wrapper import DreamVG

# Load configurations and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

print("Loading Speaker Encoder...")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)

lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)

# Constants for overlap-add
CHUNK_SIZE = 47040   # 0.98 s per block at 48 kHz
OVERLAP = 960        # 20 ms crossfade between consecutive blocks
BUFFER_SIZE = OVERLAP + CHUNK_SIZE
fade_size = OVERLAP

HANN_WINDOW = np.ones(BUFFER_SIZE)
HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1]

# Initialize buffers
input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)


@torch.no_grad()
def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
    """Process audio in chunks with overlap and manage input/output buffers."""
    global input_buffer, output_buffer, HANN_WINDOW, BUFFER_SIZE, CHUNK_SIZE

    # Add incoming audio chunk to input buffer, keeping the last OVERLAP samples
    # of the previous chunk for the crossfade
    input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
    input_buffer[OVERLAP:] = audio_chunk

    # Downsample to 16,000 Hz
    chunk = input_buffer
    chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000)

    # Convert to tensor and pad
    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
    chunk_tensor = F.pad(chunk_tensor, (40, 40))

    # Extract content features using WavLM
    c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device)

    # Generate converted audio using FreeVC
    audio = freevc.infer(c, g=tgt_embedding)
    audio = audio[0][0].data.cpu().float().numpy()

    # Upsample back to 48,000 Hz
    audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)

    # Apply Hann window to the output
    windowed_output = audio * HANN_WINDOW

    # Add the new processed audio to the output buffer with overlap
    output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
    output_buffer[OVERLAP:] = 0
    output_buffer += windowed_output

    normalization_factors = np.zeros(BUFFER_SIZE)
    normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:]
    normalization_factors += HANN_WINDOW
    normalization_factors = np.clip(normalization_factors, 1e-6, None)
    # output_buffer[:CHUNK_SIZE] = output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE]

    return output_buffer[:CHUNK_SIZE]


def prepare_target_embedding(tgt_audio_path):
    """Preprocess target audio and get speaker embedding."""
    wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
    return g_tgt
# Prepare the target speaker embedding from a reference recording ...
# target_audio = "p225_001.wav"  # Target speaker audio
# target_embedding = prepare_target_embedding(target_audio)

# ... or generate it from a text prompt via the DreamVG plugin
prompt = "A young girl voice, very cute"
prompt_guidance_scale = 3.0

text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True, return_tensors="pt")
text, text_mask = text_batch.input_ids.to(device), \
    text_batch.attention_mask.to(device)
text = text_encoder(input_ids=text, attention_mask=text_mask)[0]

target_embedding = dreamvg.inference([text, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

# Stream settings
SAMPLING_RATE = 48000
INPUT_DEVICE = 69    # machine-specific device indices; list yours with sd.query_devices()
OUTPUT_DEVICE = 58


def audio_callback(indata, outdata, frames, time, status):
    """Callback function for real-time audio processing with input and output buffers."""
    global input_buffer, output_buffer
    if status:
        print(f"Status: {status}")

    # Reshape and process input audio
    indata = indata[:, 0]  # Mono input
    converted_audio = convert_realtime_with_buffers(indata, target_embedding, freevc, cmodel)

    # Write the converted audio to the output stream
    outdata[:] = converted_audio.reshape(-1, 1)


# Start the audio stream with the updated callback
with sd.Stream(
        samplerate=SAMPLING_RATE,
        blocksize=CHUNK_SIZE,
        channels=1,
        dtype='float32',
        latency='low',
        device=(INPUT_DEVICE, OUTPUT_DEVICE),
        callback=audio_callback):
    try:
        sd.sleep(1000000)
    except KeyboardInterrupt:
        print("Voice conversion stopped.")
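# Optional: a minimal offline sanity check (a sketch, not part of the original streaming
# path). It pushes a 48 kHz recording through the same conversion function chunk by chunk
# and saves the result with the already-imported scipy `write`, which is handy for
# verifying the checkpoints and the prompt before opening a live stream. The file names
# "sample_48k.wav" and "converted_48k.wav" are only placeholders.
#
# wav_in, _ = librosa.load("sample_48k.wav", sr=SAMPLING_RATE)
# out_chunks = []
# for i in range(len(wav_in) // CHUNK_SIZE):
#     chunk = wav_in[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE].astype(np.float32)
#     # copy() because convert_realtime_with_buffers returns a view of its global buffer
#     out_chunks.append(convert_realtime_with_buffers(chunk, target_embedding, freevc, cmodel).copy())
# write("converted_48k.wav", SAMPLING_RATE, np.concatenate(out_chunks))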