Blakus committed
Commit 0b10e6e · verified · 1 Parent(s): dde251d

Update app.py

Files changed (1)
  1. app.py +10 -98
app.py CHANGED
@@ -1,71 +1,5 @@
-import sys
-import io, os, stat
-import subprocess
-import random
-from zipfile import ZipFile
-import uuid
 import time
-import torch
-import torchaudio
 
-# We keep the MeCab download
-os.system('python -m unidic download')
-
-# We keep the CPML agreement
-os.environ["COQUI_TOS_AGREED"] = "1"
-
-import langid
-import base64
-import csv
-from io import StringIO
-import datetime
-import re
-
-import gradio as gr
-from scipy.io.wavfile import write
-from pydub import AudioSegment
-
-from TTS.api import TTS
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-from TTS.utils.generic_utils import get_user_data_dir
-
-HF_TOKEN = os.environ.get("HF_TOKEN")
-
-from huggingface_hub import hf_hub_download
-import os
-from TTS.utils.manage import get_user_data_dir
-
-# We keep the authentication and the model download
-repo_id = "Blakus/Pedro_Lab_XTTS"
-local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
-os.makedirs(local_dir, exist_ok=True)
-files_to_download = ["config.json", "model.pth", "vocab.json"]
-for file_name in files_to_download:
-    print(f"Downloading {file_name} from {repo_id}")
-    local_file_path = os.path.join(local_dir, file_name)
-    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)
-
-# Load configuration and model
-config_path = os.path.join(local_dir, "config.json")
-checkpoint_path = os.path.join(local_dir, "model.pth")
-vocab_path = os.path.join(local_dir, "vocab.json")
-
-config = XttsConfig()
-config.load_json(config_path)
-
-model = Xtts.init_from_config(config)
-model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_path, eval=True, use_deepspeed=False)
-
-print("Modelo cargado en CPU")
-
-# We keep global variables and helper functions
-DEVICE_ASSERT_DETECTED = 0
-DEVICE_ASSERT_PROMPT = None
-DEVICE_ASSERT_LANG = None
-supported_languages = config.languages
-
-# Inference function using default parameters from the configuration file
 def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
     try:
         if use_mic:
@@ -90,6 +24,8 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
             max_ref_length=max_ref_length
         )
 
+        # We measure the inference time manually
+        start_time = time.time()
         out = model.inference(
             prompt,
             language,
@@ -98,43 +34,19 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
             temperature=temperature,
             repetition_penalty=repetition_penalty,
         )
+        inference_time = time.time() - start_time
 
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 
-        metrics_text = f"Tiempo de generación: {out['inference_time']:.2f} segundos\n"
-        metrics_text += f"Factor de tiempo real: {out['inference_time'] / (len(out['wav']) / 24000):.2f}"
+        # We compute the metrics using the manually measured time
+        audio_length = len(out["wav"]) / 24000  # audio duration in seconds
+        real_time_factor = inference_time / audio_length
+
+        metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
+        metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"
 
         return gr.make_waveform("output.wav"), "output.wav", metrics_text
 
     except Exception as e:
         print(f"Error detallado: {str(e)}")
-        return None, None, f"Error: {str(e)}"
-
-# Updated Gradio interface without sliders
-with gr.Blocks(theme=gr.themes.Base()) as demo:
-    gr.Markdown("# Sintetizador de Voz XTTS")
-
-    with gr.Row():
-        with gr.Column():
-            input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
-            language = gr.Dropdown(label="Idioma", choices=supported_languages, value="es")
-            audio_file = gr.Audio(label="Audio de referencia", type="filepath")
-            use_mic = gr.Checkbox(label="Usar micrófono")
-            mic_file = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)
-
-            use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])
-
-            generate_button = gr.Button("Generar voz")
-
-        with gr.Column():
-            output_audio = gr.Audio(label="Audio generado")
-            waveform = gr.Image(label="Forma de onda")
-            metrics = gr.Textbox(label="Métricas")
-
-    generate_button.click(
-        predict,
-        inputs=[input_text, language, audio_file, mic_file, use_mic],
-        outputs=[waveform, output_audio, metrics]
-    )
-
-demo.launch(debug=True)
+        return None, None, f"Error: {str(e)}"
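Note: the change above replaces a metric derived from out['inference_time'] (a key the model's output dict was expected to carry) with a manual time.time() measurement around model.inference, so the reported metrics no longer depend on the model exposing its own timing. Below is a minimal, self-contained sketch of the same pattern; timed_synthesis and the stand-in lambda are illustrative names for this sketch, not part of the commit.

import time

SAMPLE_RATE = 24000  # same 24 kHz rate passed to torchaudio.save in the diff

def timed_synthesis(synthesize, *args, **kwargs):
    """Run a synthesis callable and report generation time and real-time factor.

    Assumes the callable returns a dict with a "wav" key holding raw samples,
    as model.inference does in the diff above.
    """
    start_time = time.time()
    out = synthesize(*args, **kwargs)
    inference_time = time.time() - start_time

    audio_length = len(out["wav"]) / SAMPLE_RATE      # seconds of generated audio
    real_time_factor = inference_time / audio_length  # < 1.0 means faster than playback

    metrics_text = (
        f"Tiempo de generación: {inference_time:.2f} segundos\n"
        f"Factor de tiempo real: {real_time_factor:.2f}"
    )
    return out, metrics_text

# Stand-in synthesizer for a quick check: 48,000 samples = 2 seconds of audio.
if __name__ == "__main__":
    _, metrics = timed_synthesis(lambda: {"wav": [0.0] * 48000})
    print(metrics)

A real-time factor below 1.0 means audio is generated faster than it plays back; for example, 2 seconds of inference for 4 seconds of audio gives a factor of 0.5.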