Blakus committed on
Commit bf42cab · verified · 1 Parent(s): 6b9bc13

Update app.py

Files changed (1):
  1. app.py +94 -58
app.py CHANGED
@@ -7,36 +7,27 @@ import uuid
  import time
  import torch
  import torchaudio
- import time
- # We keep the MeCab download
- os.system('python -m unidic download')
-
- # We keep the CPML agreement
- os.environ["COQUI_TOS_AGREED"] = "1"
-
  import langid
  import base64
  import csv
  from io import StringIO
  import datetime
  import re
-
- import gradio as gr
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  
+ import gradio as gr
  from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
  from TTS.utils.generic_utils import get_user_data_dir
-
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
  from huggingface_hub import hf_hub_download
- import os
- from TTS.utils.manage import get_user_data_dir
  
- # We keep the model authentication and download
+ # Initial setup
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ os.system('python -m unidic download')
+
+ # Model authentication and download
  repo_id = "Blakus/Pedro_Lab_XTTS"
  local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
  os.makedirs(local_dir, exist_ok=True)
@@ -46,7 +37,7 @@ for file_name in files_to_download:
      local_file_path = os.path.join(local_dir, file_name)
      hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)
  
- # We load configuration and model
+ # Loading of configuration and model
  config_path = os.path.join(local_dir, "config.json")
  checkpoint_path = os.path.join(local_dir, "model.pth")
  vocab_path = os.path.join(local_dir, "vocab.json")
@@ -59,24 +50,29 @@ model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_
  
  print("Modelo cargado en CPU")
  
- # We keep global variables and helper functions
- DEVICE_ASSERT_DETECTED = 0
- DEVICE_ASSERT_PROMPT = None
- DEVICE_ASSERT_LANG = None
+ # Global variables
  supported_languages = config.languages
-
- # Inference function using the default parameters from the config file
- def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
+ reference_audios = [
+     "serio.wav",
+     "neutral.wav",
+     "alegre.wav",
+ ]
+
+ # Function to split the text into chunks
+ def split_text(text):
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     return sentences
+
+ # Improved inference function
+ def predict(prompt, language, audio_file_pth, use_reference_audio):
      try:
-         if use_mic:
-             speaker_wav = mic_file_path
-         else:
+         if use_reference_audio:
              speaker_wav = audio_file_pth
+         else:
+             speaker_wav = "neutral.wav"  # Default audio if none is selected
  
-         if len(prompt) < 2 or len(prompt) > 200:
-             return None, None, "El texto debe tener entre 2 y 200 caracteres."
-
-         # We use the config values directly
+         sentences = split_text(prompt)
+
          temperature = getattr(config, "temperature", 0.75)
          repetition_penalty = getattr(config, "repetition_penalty", 5.0)
          gpt_cond_len = getattr(config, "gpt_cond_len", 30)
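
Note: the new split_text helper added above splits on sentence-final punctuation ('.', '!', '?') followed by whitespace. A quick standalone check of its behavior (the sample input is illustrative only, not from the commit):

    import re

    def split_text(text):
        # Same regex as in the commit: split after '.', '!' or '?' plus whitespace
        return re.split(r'(?<=[.!?])\s+', text)

    print(split_text("Hola. ¿Cómo estás? Muy bien."))
    # -> ['Hola.', '¿Cómo estás?', 'Muy bien.']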
@@ -90,59 +86,99 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
              max_ref_length=max_ref_length
          )
  
-         # We time inference manually
          start_time = time.time()
-         out = model.inference(
-             prompt,
-             language,
-             gpt_cond_latent,
-             speaker_embedding,
-             temperature=temperature,
-             repetition_penalty=repetition_penalty,
-         )
+         combined_audio = AudioSegment.empty()
+
+         for sentence in sentences:
+             out = model.inference(
+                 sentence,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 temperature=temperature,
+                 repetition_penalty=repetition_penalty,
+             )
+             audio_segment = AudioSegment(
+                 out["wav"].tobytes(),
+                 frame_rate=24000,
+                 sample_width=2,
+                 channels=1
+             )
+             combined_audio += audio_segment
+             combined_audio += AudioSegment.silent(duration=500)  # 0.5 seconds of silence
+
          inference_time = time.time() - start_time
  
-         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+         output_path = "output.wav"
+         combined_audio.export(output_path, format="wav")
  
-         # We compute the metrics using the manually measured time
-         audio_length = len(out["wav"]) / 24000  # audio duration in seconds
+         audio_length = len(combined_audio) / 1000  # audio duration in seconds
          real_time_factor = inference_time / audio_length
  
          metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
          metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"
  
-         return gr.make_waveform("output.wav"), "output.wav", metrics_text
+         return gr.make_waveform(output_path), output_path, metrics_text
  
      except Exception as e:
          print(f"Error detallado: {str(e)}")
          return None, None, f"Error: {str(e)}"
  
+ # Define the custom theme
+ theme = gr.themes.Soft(
+     primary_hue="blue",
+     secondary_hue="gray",
+ ).set(
+     body_background_fill='*neutral_100',
+     body_background_fill_dark='*neutral_900',
+ )
+
+ # Project description
+ description = """
+ # Sintetizador de voz de Pedro Labattaglia 🎙️
+
+ Sintetizador de voz con la voz del locutor argentino Pedro Labattaglia.
+
+ ## Cómo usarlo:
+ - Elija el idioma (Español o Inglés)
+ - Elija un audio de referencia de la lista o cargue su propio audio
+ - Escriba el texto a sintetizar
+ - Presione generar voz
+ """
+
+ # Gradio interface
+ with gr.Blocks(theme=theme) as demo:
+     gr.Markdown(description)
  
- # Updated Gradio interface without sliders
- with gr.Blocks(theme=gr.themes.Base()) as demo:
-     gr.Markdown("# Sintetizador de Voz XTTS")
-
      with gr.Row():
-         with gr.Column():
-             input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
+         gr.Image("https://i1.sndcdn.com/artworks-000237574740-gwz61j-t500x500.jpg", label="", show_label=False, width=250, height=250)
+
+     with gr.Row():
+         with gr.Column(scale=2):
              language = gr.Dropdown(label="Idioma", choices=supported_languages, value="es")
-             audio_file = gr.Audio(label="Audio de referencia", type="filepath")
-             use_mic = gr.Checkbox(label="Usar micrófono")
-             mic_file = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)
+             use_reference_audio = gr.Checkbox(label="Usar audio de referencia")
+             reference_audio = gr.Dropdown(label="Audio de referencia predefinido", choices=reference_audios, visible=False)
+             audio_file = gr.Audio(label="O cargue su propio audio de referencia", type="filepath", visible=False)
  
-             use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])
+             use_reference_audio.change(
+                 fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
+                 inputs=[use_reference_audio],
+                 outputs=[reference_audio, audio_file]
+             )
  
-             generate_button = gr.Button("Generar voz")
-
-         with gr.Column():
+             input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...", lines=5)
+             generate_button = gr.Button("Generar voz", variant="primary")
+
+         with gr.Column(scale=1):
              output_audio = gr.Audio(label="Audio generado")
              waveform = gr.Image(label="Forma de onda")
             metrics = gr.Textbox(label="Métricas")
  
      generate_button.click(
          predict,
-         inputs=[input_text, language, audio_file, mic_file, use_mic],
+         inputs=[input_text, language, audio_file, use_reference_audio],
          outputs=[waveform, output_audio, metrics]
      )
  
- demo.launch(debug=True)
+ if __name__ == "__main__":
+     demo.launch(debug=True)
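
One caveat in the new chunked-synthesis loop: AudioSegment(out["wav"].tobytes(), sample_width=2, ...) assumes the model already returns 16-bit integer PCM. XTTS inference typically yields floating-point samples (the old code wrapped them with torch.tensor(...) for torchaudio.save), in which case the raw bytes would not be valid int16 audio. A minimal conversion sketch, assuming out["wav"] is a float array in [-1, 1]:

    import numpy as np
    from pydub import AudioSegment

    def to_segment(wav, sample_rate=24000):
        # Scale float samples in [-1, 1] to 16-bit PCM before handing them to pydub
        samples = np.asarray(wav, dtype=np.float32)
        pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        return AudioSegment(pcm16.tobytes(), frame_rate=sample_rate, sample_width=2, channels=1)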
 
 
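A wiring detail worth flagging: the reference_audio dropdown is made visible by the checkbox but is never passed to generate_button.click, so a selected preset never reaches predict, which falls back to "neutral.wav". A minimal sketch of one way to resolve the reference, using a hypothetical helper (choose_speaker_wav and the default path are assumptions, not part of the commit):

    def choose_speaker_wav(audio_file_pth, reference_audio, use_reference_audio,
                           default_wav="neutral.wav"):
        # Hypothetical helper: prefer an uploaded file, then the preset dropdown,
        # then the default; it would also need to be added to the click() inputs.
        if use_reference_audio and audio_file_pth:
            return audio_file_pth
        if use_reference_audio and reference_audio:
            return reference_audio
        return default_wav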