Spaces:

alex16052G
/

abi

Paused

App Files Files Community

alex16052G committed on Jan 22

Commit

23c65b5

verified ·

1 Parent(s): 019a00a

Update chat_ai.py

Browse files

Files changed (1) hide show

chat_ai.py +56 -64

chat_ai.py CHANGED Viewed

@@ -21,6 +21,9 @@ try:
 except ImportError:
     USING_SPACES = False
 def gpu_decorator(func):
     if USING_SPACES:
         return spaces.GPU(func)
@@ -38,26 +41,18 @@ from f5_tts.infer.utils_infer import (
 )
 # Cargar el vocoder
-vocoder = load_vocoder()
 # Configuración y carga del modelo F5-TTS
 F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
 F5TTS_ema_model = load_model(
     DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
-)
-# Eliminamos la carga global de WhisperProcessor y WhisperForConditionalGeneration
-# Estos se cargarán dentro de la función de transcripción
-@gr.Caching.cache  # Ajusta según tu versión de Gradio
-def get_whisper_models():
-    """Carga y retorna los modelos Whisper y el procesador."""
-    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-    model.eval()
-    if torch.cuda.is_available():
-        model.to("cuda")
-    return processor, model
 @gpu_decorator
 def infer(
@@ -65,54 +60,54 @@ def infer(
 ):
     """Genera el audio sintetizado a partir del texto utilizando la voz de referencia."""
     try:
-        # Preprocesar el audio de referencia y el texto de referencia
-        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
-        ema_model = F5TTS_ema_model
-        # Asegurar que el texto a generar esté correctamente formateado
-        if not gen_text.startswith(" "):
-            gen_text = " " + gen_text
-        if not gen_text.endswith(". "):
-            gen_text += ". "
-        # El texto ingresado por el usuario se utiliza directamente sin modificaciones
-        input_text = gen_text
-        print(f"Texto para generar audio: {input_text}")  # Debug: Verificar el texto
-        # Procesar la inferencia para generar el audio
-        final_wave, final_sample_rate, combined_spectrogram = infer_process(
-            ref_audio,
-            ref_text,
-            input_text,
-            ema_model,
-            vocoder,
-            cross_fade_duration=cross_fade_duration,
-            speed=speed,
-            progress=gr.Progress(),
-        )
-        # Eliminar silencios si está activado
-        if remove_silence:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-                sf.write(f.name, final_wave, final_sample_rate)
-                remove_silence_for_generated_wav(f.name)
-                final_wave, _ = torchaudio.load(f.name)
-            final_wave = final_wave.squeeze().cpu().numpy()
-        # Guardar el espectrograma (opcional)
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-            spectrogram_path = tmp_spectrogram.name
-            save_spectrogram(combined_spectrogram, spectrogram_path)
-        return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
         # Log del error para depuración
         print(f"Error en infer: {e}")
         return None, None
-@gpu_decorator
 def transcribe_audio(audio_path):
     """Transcribe el audio de referencia usando el modelo Whisper en español."""
     try:
@@ -131,23 +126,20 @@ def transcribe_audio(audio_path):
         if audio.ndim > 1:
             audio = torch.mean(audio, dim=0)
-        # Cargar los modelos Whisper
-        whisper_processor, whisper_model = get_whisper_models()
         # Procesar el audio con el procesador de Whisper
         inputs = whisper_processor(audio.cpu().numpy(), sampling_rate=16000, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
         # Forzar el idioma a español (usando el nombre en inglés)
         forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="spanish", task="transcribe")
         # Generar la transcripción
-        predicted_ids = whisper_model.generate(
-            inputs["input_features"],
-            forced_decoder_ids=forced_decoder_ids
-        )
         transcription = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True)
         print(f"Transcripción: {transcription}")  # Debug: Verificar la transcripción
@@ -155,7 +147,7 @@ def transcribe_audio(audio_path):
         return transcription
     except Exception as e:
         print(f"Error en transcribe_audio: {e}")
-        return "Error al transcribir el audio de referencia."
 def transcribe_and_update(audio_path):
     """Transcribe el audio de referencia y devuelve el texto transcrito."""

 except ImportError:
     USING_SPACES = False
+# Definir el dispositivo global
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def gpu_decorator(func):
     if USING_SPACES:
         return spaces.GPU(func)
 )
 # Cargar el vocoder
+vocoder = load_vocoder().to(DEVICE)
 # Configuración y carga del modelo F5-TTS
 F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
 F5TTS_ema_model = load_model(
     DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
+).to(DEVICE)
+# Cargar el modelo Whisper para transcripción
+whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(DEVICE)
+whisper_model.eval()
 @gpu_decorator
 def infer(
 ):
     """Genera el audio sintetizado a partir del texto utilizando la voz de referencia."""
     try:
+        with torch.no_grad():
+            # Preprocesar el audio de referencia y el texto de referencia
+            ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
+            ema_model = F5TTS_ema_model
+            # Asegurar que el texto a generar esté correctamente formateado
+            if not gen_text.startswith(" "):
+                gen_text = " " + gen_text
+            if not gen_text.endswith(". "):
+                gen_text += ". "
+            # El texto ingresado por el usuario se utiliza directamente sin modificaciones
+            input_text = gen_text
+            print(f"Texto para generar audio: {input_text}")  # Debug: Verificar el texto
+            # Procesar la inferencia para generar el audio
+            final_wave, final_sample_rate, combined_spectrogram = infer_process(
+                ref_audio.to(DEVICE),
+                ref_text,
+                input_text,
+                ema_model,
+                vocoder,
+                cross_fade_duration=cross_fade_duration,
+                speed=speed,
+                progress=gr.Progress(),
+            )
+            # Eliminar silencios si está activado
+            if remove_silence:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+                    sf.write(f.name, final_wave.cpu().numpy(), final_sample_rate)
+                    remove_silence_for_generated_wav(f.name)
+                    final_wave, _ = torchaudio.load(f.name)
+                final_wave = final_wave.squeeze().cpu().numpy()
+            # Guardar el espectrograma (opcional)
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+                spectrogram_path = tmp_spectrogram.name
+                save_spectrogram(combined_spectrogram, spectrogram_path)
+            return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
         # Log del error para depuración
         print(f"Error en infer: {e}")
         return None, None
 def transcribe_audio(audio_path):
     """Transcribe el audio de referencia usando el modelo Whisper en español."""
     try:
         if audio.ndim > 1:
             audio = torch.mean(audio, dim=0)
         # Procesar el audio con el procesador de Whisper
         inputs = whisper_processor(audio.cpu().numpy(), sampling_rate=16000, return_tensors="pt")
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
         # Forzar el idioma a español (usando el nombre en inglés)
         forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="spanish", task="transcribe")
         # Generar la transcripción
+        with torch.no_grad():
+            predicted_ids = whisper_model.generate(
+                inputs["input_features"],
+                forced_decoder_ids=forced_decoder_ids
+            )
         transcription = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True)
         print(f"Transcripción: {transcription}")  # Debug: Verificar la transcripción
         return transcription
     except Exception as e:
         print(f"Error en transcribe_audio: {e}")
+        return None
 def transcribe_and_update(audio_path):
     """Transcribe el audio de referencia y devuelve el texto transcrito."""