import os
import re
import time

import numpy as np
import gradio as gr
from pydub import AudioSegment
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from huggingface_hub import hf_hub_download

# Initial configuration
os.environ["COQUI_TOS_AGREED"] = "1"
os.system("python -m unidic download")

# Model download
repo_id = "Blakus/Pedro_Lab_XTTS"
local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
os.makedirs(local_dir, exist_ok=True)

files_to_download = ["config.json", "model.pth", "vocab.json"]
for file_name in files_to_download:
    print(f"Downloading {file_name} from {repo_id}")
    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

# Load configuration and model
config_path = os.path.join(local_dir, "config.json")
checkpoint_path = os.path.join(local_dir, "model.pth")
vocab_path = os.path.join(local_dir, "vocab.json")

config = XttsConfig()
config.load_json(config_path)

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=checkpoint_path,
    vocab_path=vocab_path,
    eval=True,
    use_deepspeed=False,
)

print("Modelo cargado en CPU")

# Global variables
supported_languages = config.languages
reference_audios = [
    "serio.wav",
    "neutral.wav",
    "alegre.wav",
]

# Split the text into sentence-sized chunks so each one is synthesized separately
def split_text(text):
    return re.split(r"(?<=[.!?])\s+", text)

# Inference function
def predict(prompt, language, audio_file_pth, reference_audio_choice, use_reference_audio):
    try:
        if use_reference_audio:
            # Prefer an uploaded file; fall back to the selected preset
            speaker_wav = audio_file_pth or reference_audio_choice
        else:
            speaker_wav = "neutral.wav"  # default reference when none is chosen

        sentences = split_text(prompt)

        # Use the model config values when present, otherwise sensible defaults
        temperature = getattr(config, "temperature", 0.75)
        repetition_penalty = getattr(config, "repetition_penalty", 5.0)
        gpt_cond_len = getattr(config, "gpt_cond_len", 30)
        gpt_cond_chunk_len = getattr(config, "gpt_cond_chunk_len", 4)
        max_ref_length = getattr(config, "max_ref_len", 60)

        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=gpt_cond_len,
            gpt_cond_chunk_len=gpt_cond_chunk_len,
            max_ref_length=max_ref_length,
        )

        start_time = time.time()
        combined_audio = AudioSegment.empty()

        for sentence in sentences:
            out = model.inference(
                sentence,
                language,
                gpt_cond_latent,
                speaker_embedding,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
            )
            # XTTS returns float audio in [-1.0, 1.0]; convert it to 16-bit PCM
            # so the sample_width=2 AudioSegment below reads the bytes correctly
            wav = np.asarray(out["wav"]).squeeze()
            wav_int16 = (np.clip(wav, -1.0, 1.0) * 32767).astype(np.int16)
            audio_segment = AudioSegment(
                wav_int16.tobytes(),
                frame_rate=24000,
                sample_width=2,
                channels=1,
            )
            combined_audio += audio_segment
            combined_audio += AudioSegment.silent(duration=500, frame_rate=24000)  # 0.5 s pause between sentences

        inference_time = time.time() - start_time

        output_path = "output.wav"
        combined_audio.export(output_path, format="wav")

        audio_length = len(combined_audio) / 1000  # audio duration in seconds
        real_time_factor = inference_time / audio_length

        metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
        metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"

        return gr.make_waveform(output_path), output_path, metrics_text

    except Exception as e:
        print(f"Error detallado: {e}")
        return None, None, f"Error: {e}"
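# A minimal smoke test of the pieces above, left commented out so launching the
# app is unchanged. The sample text and the "neutral.wav" preset are assumptions
# for illustration only; run these lines from a REPL after the model has loaded.
#
#   print(split_text("Hola. ¿Cómo estás? Bien."))
#   # -> ['Hola.', '¿Cómo estás?', 'Bien.']
#
#   _, wav_path, metrics = predict(
#       prompt="Hola, esto es una prueba.",
#       language="es",
#       audio_file_pth=None,
#       reference_audio_choice="neutral.wav",
#       use_reference_audio=True,
#   )
#   print(wav_path, metrics)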
# Define the custom theme
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="gray",
).set(
    body_background_fill="*neutral_100",
    body_background_fill_dark="*neutral_900",
)

# Project description
description = """
# Sintetizador de voz de Pedro Labattaglia 🎙️

Sintetizador de voz con la voz del locutor argentino Pedro Labattaglia.

## Cómo usarlo:
- Elija el idioma (Español o Inglés)
- Elija un audio de referencia de la lista o cargue su propio audio
- Escriba el texto a sintetizar
- Presione generar voz
"""

# Gradio interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(description)

    with gr.Row():
        gr.Image(
            "https://i1.sndcdn.com/artworks-000237574740-gwz61j-t500x500.jpg",
            label="",
            show_label=False,
            width=250,
            height=250,
        )

    with gr.Row():
        with gr.Column(scale=2):
            language = gr.Dropdown(label="Idioma", choices=supported_languages, value="es")
            use_reference_audio = gr.Checkbox(label="Usar audio de referencia")
            reference_audio = gr.Dropdown(
                label="Audio de referencia predefinido",
                choices=reference_audios,
                visible=False,
            )
            audio_file = gr.Audio(label="O cargue su propio audio de referencia", type="filepath", visible=False)

            # Show the reference-audio controls only when the checkbox is ticked
            use_reference_audio.change(
                fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
                inputs=[use_reference_audio],
                outputs=[reference_audio, audio_file],
            )

            input_text = gr.Textbox(
                label="Texto a sintetizar",
                placeholder="Escribe aquí el texto que quieres convertir a voz...",
                lines=5,
            )
            generate_button = gr.Button("Generar voz", variant="primary")

        with gr.Column(scale=1):
            output_audio = gr.Audio(label="Audio generado")
            # gr.make_waveform returns an mp4 path, so display it in a Video component
            waveform = gr.Video(label="Forma de onda")
            metrics = gr.Textbox(label="Métricas")

    # Pass the preset dropdown through to predict() so the selection is actually used
    generate_button.click(
        predict,
        inputs=[input_text, language, audio_file, reference_audio, use_reference_audio],
        outputs=[waveform, output_audio, metrics],
    )

if __name__ == "__main__":
    demo.launch(debug=True)
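# How to run (assumed filename app.py; package names are the usual PyPI ones,
# verify against your environment):
#
#   pip install TTS gradio pydub numpy huggingface_hub unidic
#   python app.py
#
# ffmpeg must be on the PATH for gr.make_waveform to render the waveform video.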