Update chat_ai.py
chat_ai.py (+45, -30)
@@ -1,8 +1,5 @@
 # text_to_speech_ai.py
 
-# ruff: noqa: E402
-# Above allows ruff to ignore E402: module level import not at top of file
-
 import re
 import tempfile
 import os
@@ -21,9 +18,6 @@ try:
 except ImportError:
     USING_SPACES = False
 
-# Define the global device
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 def gpu_decorator(func):
     if USING_SPACES:
         return spaces.GPU(func)
@@ -40,19 +34,37 @@ from f5_tts.infer.utils_infer import (
     save_spectrogram,
 )
 
-#
-
-
-
-
-
-
-
-
-
-
-
+# Define the global device (used only inside the functions)
+def get_device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+@gpu_decorator
+def load_models():
+    """Load and return the required models."""
+    device = get_device()
+
+    # Load the vocoder and move it to the device
+    vocoder = load_vocoder().to(device)
+
+    # Configure and load the F5-TTS model
+    F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+    F5TTS_ema_model = load_model(
+        DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
+    ).to(device)
+
+    # Load the Whisper model for transcription
+    whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(device)
+    whisper_model.eval()
+
+    return vocoder, F5TTS_ema_model, whisper_processor, whisper_model, device
+
+# Load the models only once and store them in variables scoped to the function
+# This is achieved using function attributes
+def get_models():
+    if not hasattr(get_models, "vocoder"):
+        get_models.vocoder, get_models.F5TTS_ema_model, get_models.whisper_processor, get_models.whisper_model, get_models.device = load_models()
+    return get_models.vocoder, get_models.F5TTS_ema_model, get_models.whisper_processor, get_models.whisper_model, get_models.device
 
 @gpu_decorator
 def infer(
@@ -61,34 +73,34 @@ def infer(
     """Generate the synthesized audio from the text using the reference voice."""
     try:
         with torch.no_grad():
+            vocoder, F5TTS_ema_model, _, _, device = get_models()
+
             # Preprocess the reference audio and the reference text
             ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
-
-            ema_model = F5TTS_ema_model
-
+
             # Make sure the text to generate is correctly formatted
             if not gen_text.startswith(" "):
                 gen_text = " " + gen_text
             if not gen_text.endswith(". "):
                 gen_text += ". "
-
+
             # The text entered by the user is used directly, without modification
             input_text = gen_text
-
+
             print(f"Texto para generar audio: {input_text}")  # Debug: verify the text
-
+
             # Run inference to generate the audio
            final_wave, final_sample_rate, combined_spectrogram = infer_process(
-                ref_audio.to(DEVICE),
+                ref_audio.to(device),
                 ref_text,
                 input_text,
-                ema_model,
+                F5TTS_ema_model,
                 vocoder,
                 cross_fade_duration=cross_fade_duration,
                 speed=speed,
                 progress=gr.Progress(),
             )
-
+
             # Remove silences if enabled
             if remove_silence:
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
@@ -96,21 +108,24 @@ def infer(
                     remove_silence_for_generated_wav(f.name)
                     final_wave, _ = torchaudio.load(f.name)
                 final_wave = final_wave.squeeze().cpu().numpy()
-
+
             # Save the spectrogram (optional)
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
                 spectrogram_path = tmp_spectrogram.name
                 save_spectrogram(combined_spectrogram, spectrogram_path)
-
+
             return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
         # Log the error for debugging
        print(f"Error en infer: {e}")
         return None, None
 
+@gpu_decorator
 def transcribe_audio(audio_path):
     """Transcribe the reference audio using the Whisper model in Spanish."""
     try:
+        vocoder, F5TTS_ema_model, whisper_processor, whisper_model, device = get_models()
+
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Archivo de audio no encontrado: {audio_path}")
 
@@ -129,7 +144,7 @@ def transcribe_audio(audio_path):
         # Process the audio with the Whisper processor
         inputs = whisper_processor(audio.cpu().numpy(), sampling_rate=16000, return_tensors="pt")
 
-        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
         # Force the language to Spanish (using the English name)
         forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="spanish", task="transcribe")
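The substance of the change: nothing touches the GPU at import time anymore. The removed module-level DEVICE global and model loads are replaced by get_device()/load_models(), and get_models() loads everything on first use, caching the results as attributes on the function object. A minimal, self-contained sketch of that function-attribute caching pattern, with a hypothetical load_expensive_model() standing in for the real loaders:

def load_expensive_model():
    # Hypothetical stand-in for loading the vocoder, F5-TTS, and Whisper.
    print("loading...")  # executes only on the first get_model() call
    return {"weights": "..."}

def get_model():
    # First call: load once and stash the result on the function object itself.
    if not hasattr(get_model, "model"):
        get_model.model = load_expensive_model()
    # Later calls return the cached instance without reloading.
    return get_model.model

m1 = get_model()  # prints "loading..."
m2 = get_model()  # silent: served from the cache
assert m1 is m2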
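The same run-once behavior is often written with functools.lru_cache instead of hand-rolled function attributes. This is an equivalent sketch, not what the commit does; load_models here is a stub standing in for the loader defined in the diff:

from functools import lru_cache

def load_models():
    # Stub for the diff's load_models(), which returns
    # (vocoder, F5TTS_ema_model, whisper_processor, whisper_model, device).
    return ("vocoder", "ema_model", "processor", "whisper", "cpu")

@lru_cache(maxsize=None)  # the body runs once; every later call returns the cached tuple
def get_models():
    return load_models()

assert get_models() is get_models()  # one load, one cached object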
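With the refactor, callers never load models explicitly: whichever of infer or transcribe_audio runs first triggers get_models() and pays the one-time loading cost. A hypothetical usage sketch; the infer signature is inferred from the parameters its body uses, and transcribe_audio is assumed to return the transcription string:

from chat_ai import infer, transcribe_audio  # module name taken from the commit's filename

ref_audio = "ref.wav"                    # hypothetical reference clip on disk
ref_text = transcribe_audio(ref_audio)   # first call loads all models once

audio, spectrogram_path = infer(
    ref_audio,
    ref_text,
    "Hola, esto es una prueba.",         # text to synthesize
    remove_silence=False,
    cross_fade_duration=0.15,
    speed=1.0,
)
if audio is not None:                    # infer returns (None, None) on error
    sample_rate, wave = audio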