Pedro_Lab_XTTS_demo

Paused

App Files Files

Blakus committed on Sep 22, 2024

Commit

0781598

verified ·

1 Parent(s): e0e731e

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -9

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import uuid
 import time
 import torch
 import torchaudio
 # Mantenemos la descarga de MeCab
 os.system('python -m unidic download')
@@ -76,12 +76,12 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
         if len(prompt) < 2 or len(prompt) > 200:
             return None, None, "El texto debe tener entre 2 y 200 caracteres."
-        # Usamos los valores de la configuración predeterminada
-        temperature = config.inference.get("temperature", 0.75)
-        repetition_penalty = config.inference.get("repetition_penalty", 5.0)
-        gpt_cond_len = config.inference.get("gpt_cond_len", 30)
-        gpt_cond_chunk_len = config.inference.get("gpt_cond_chunk_len", 4)
-        max_ref_length = config.inference.get("max_ref_length", 60)
         gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
             audio_path=speaker_wav,
@@ -90,6 +90,8 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
             max_ref_length=max_ref_length
         )
         out = model.inference(
             prompt,
             language,
@@ -98,11 +100,16 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
             temperature=temperature,
             repetition_penalty=repetition_penalty,
         )
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
-        metrics_text = f"Tiempo de generación: {out['inference_time']:.2f} segundos\n"
-        metrics_text += f"Factor de tiempo real: {out['inference_time'] / (len(out['wav']) / 24000):.2f}"
         return gr.make_waveform("output.wav"), "output.wav", metrics_text
@@ -110,6 +117,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
         print(f"Error detallado: {str(e)}")
         return None, None, f"Error: {str(e)}"
 # Interfaz de Gradio actualizada sin sliders
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown("# Sintetizador de Voz XTTS")

 import time
 import torch
 import torchaudio
+import time
 # Mantenemos la descarga de MeCab
 os.system('python -m unidic download')
         if len(prompt) < 2 or len(prompt) > 200:
             return None, None, "El texto debe tener entre 2 y 200 caracteres."
+        # Usamos los valores de la configuración directamente
+        temperature = getattr(config, "temperature", 0.75)
+        repetition_penalty = getattr(config, "repetition_penalty", 5.0)
+        gpt_cond_len = getattr(config, "gpt_cond_len", 30)
+        gpt_cond_chunk_len = getattr(config, "gpt_cond_chunk_len", 4)
+        max_ref_length = getattr(config, "max_ref_len", 60)
         gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
             audio_path=speaker_wav,
             max_ref_length=max_ref_length
         )
+        # Medimos el tiempo de inferencia manualmente
+        start_time = time.time()
         out = model.inference(
             prompt,
             language,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
         )
+        inference_time = time.time() - start_time
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+        # Calculamos las métricas usando el tiempo medido manualmente
+        audio_length = len(out["wav"]) / 24000  # duración del audio en segundos
+        real_time_factor = inference_time / audio_length
+        metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
+        metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"
         return gr.make_waveform("output.wav"), "output.wav", metrics_text
         print(f"Error detallado: {str(e)}")
         return None, None, f"Error: {str(e)}"
 # Interfaz de Gradio actualizada sin sliders
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown("# Sintetizador de Voz XTTS")