Update app.py
app.py CHANGED
@@ -1,22 +1,8 @@
-import
-import io, os, stat
-import subprocess
-import random
-from zipfile import ZipFile
-import uuid
-import time
-import torch
-import torchaudio
-import langid
-import base64
-import csv
-from io import StringIO
-import datetime
+import os
 import re
-
-from pydub import AudioSegment
-
+import time
 import gradio as gr
+from pydub import AudioSegment
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
@@ -25,18 +11,18 @@ from huggingface_hub import hf_hub_download
 
 # Configuración inicial
 os.environ["COQUI_TOS_AGREED"] = "1"
+os.system('python -m unidic download')
 
-#
+# Descargar y configurar el modelo
 repo_id = "Blakus/Pedro_Lab_XTTS"
 local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
 os.makedirs(local_dir, exist_ok=True)
 files_to_download = ["config.json", "model.pth", "vocab.json"]
+
 for file_name in files_to_download:
     print(f"Downloading {file_name} from {repo_id}")
-    local_file_path = os.path.join(local_dir, file_name)
     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)
 
-# Carga de configuración y modelo
 config_path = os.path.join(local_dir, "config.json")
 checkpoint_path = os.path.join(local_dir, "model.pth")
 vocab_path = os.path.join(local_dir, "vocab.json")
@@ -49,37 +35,25 @@ model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_
 
 print("Modelo cargado en CPU")
 
-#
-supported_languages = [lang for lang in config.languages if lang != "ja"]
-reference_audios = [
-    "serio.wav",
-    "neutral.wav",
-    "alegre.wav",
-]
-
-# Función para dividir el texto en chunks
+# Funciones auxiliares
 def split_text(text):
-
-    return sentences
+    return re.split(r'(?<=[.!?])\s+', text)
 
-
-def predict(prompt, language, audio_file_pth, use_reference_audio):
+def predict(prompt, language, reference_audio):
     try:
-        if
-
-        else:
-            speaker_wav = "neutral.wav"  # Audio por defecto si no se selecciona uno
+        if len(prompt) < 2 or len(prompt) > 600:
+            return None, "El texto debe tener entre 2 y 600 caracteres."
 
         sentences = split_text(prompt)
-
-        temperature =
-        repetition_penalty =
-        gpt_cond_len =
-        gpt_cond_chunk_len =
-        max_ref_length =
+
+        temperature = config.inference.get("temperature", 0.75)
+        repetition_penalty = config.inference.get("repetition_penalty", 5.0)
+        gpt_cond_len = config.inference.get("gpt_cond_len", 30)
+        gpt_cond_chunk_len = config.inference.get("gpt_cond_chunk_len", 4)
+        max_ref_length = config.inference.get("max_ref_length", 60)
 
         gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
-            audio_path=
+            audio_path=reference_audio,
             gpt_cond_len=gpt_cond_len,
            gpt_cond_chunk_len=gpt_cond_chunk_len,
             max_ref_length=max_ref_length
@@ -87,7 +61,7 @@ def predict(prompt, language, audio_file_pth, use_reference_audio):
 
         start_time = time.time()
         combined_audio = AudioSegment.empty()
-
+
         for sentence in sentences:
             out = model.inference(
                 sentence,
@@ -117,13 +91,20 @@ def predict(prompt, language, audio_file_pth, use_reference_audio):
         metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
         metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"
 
-        return
+        return output_path, metrics_text
 
     except Exception as e:
         print(f"Error detallado: {str(e)}")
-        return None,
+        return None, f"Error: {str(e)}"
+
+# Configuración de la interfaz de Gradio
+supported_languages = ["es", "en"]
+reference_audios = [
+    "serio.wav",
+    "neutral.wav",
+    "alegre.wav",
+]
 
-# Definir el tema personalizado
 theme = gr.themes.Soft(
     primary_hue="blue",
     secondary_hue="gray",
@@ -132,16 +113,15 @@ theme = gr.themes.Soft(
     body_background_fill_dark='*neutral_900',
 )
 
-# Descripción del proyecto
 description = """
 # Sintetizador de voz de Pedro Labattaglia 🎙️
 
 Sintetizador de voz con la voz del locutor argentino Pedro Labattaglia.
 
 ## Cómo usarlo:
-- Elija el idioma
-- Elija un audio de referencia de la lista
-- Escriba el texto
+- Elija el idioma (Español o Inglés)
+- Elija un audio de referencia de la lista
+- Escriba el texto que desea sintetizar
 - Presione generar voz
 """
 
@@ -154,30 +134,20 @@ with gr.Blocks(theme=theme) as demo:
 
     with gr.Row():
         with gr.Column(scale=2):
-
-
-
-            audio_file = gr.Audio(label="O cargue su propio audio de referencia", type="filepath", visible=False)
-
-            use_reference_audio.change(
-                fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
-                inputs=[use_reference_audio],
-                outputs=[reference_audio, audio_file]
-            )
-
-            input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...", lines=5)
+            language_selector = gr.Dropdown(label="Idioma", choices=supported_languages)
+            reference_audio = gr.Dropdown(label="Audio de referencia", choices=reference_audios)
+            input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
             generate_button = gr.Button("Generar voz", variant="primary")
 
         with gr.Column(scale=1):
-
-
-
-
+            generated_audio = gr.Audio(label="Audio generado", interactive=False)
+            metrics_output = gr.Textbox(label="Métricas", value="Tiempo de generación: -- segundos\nFactor de tiempo real: --")
+
     generate_button.click(
         predict,
-        inputs=[input_text,
-        outputs=[
+        inputs=[input_text, language_selector, reference_audio],
+        outputs=[generated_audio, metrics_output]
     )
 
 if __name__ == "__main__":
-    demo.launch(
+    demo.launch()
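
Note on the new split_text: re.split with a lookbehind splits after sentence-ending punctuation while keeping that punctuation attached to each sentence. A minimal standalone sketch using the same regex as the diff (the sample text is illustrative):

import re

def split_text(text):
    # Split after ., ! or ? followed by whitespace; the lookbehind keeps the punctuation
    return re.split(r'(?<=[.!?])\s+', text)

print(split_text("Hola. ¿Cómo estás? Muy bien!"))
# -> ['Hola.', '¿Cómo estás?', 'Muy bien!']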
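The new inference-parameter block reads tuned values from the loaded config and falls back to the literals shown in the diff (0.75, 5.0, 30, 4, 60). A hedged sketch of that .get(key, default) fallback pattern, with a plain dict standing in for the config.inference mapping the committed code assumes exists:

# `settings` stands in for config.inference (hypothetical values)
settings = {"temperature": 0.7}

temperature = settings.get("temperature", 0.75)               # 0.7, taken from settings
repetition_penalty = settings.get("repetition_penalty", 5.0)  # 5.0, default used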
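The click wiring at the end of the diff maps the three input components to predict's parameters positionally and routes its (audio, metrics) return pair to the two output components. A minimal runnable Gradio sketch of the same pattern (the stub handler is hypothetical; component names follow the diff):

import gradio as gr

def predict(prompt, language, reference_audio):
    # Stub: a real handler would synthesize speech and return (filepath, metrics)
    return None, f"idioma={language}, referencia={reference_audio}, caracteres={len(prompt)}"

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Texto a sintetizar")
    language_selector = gr.Dropdown(label="Idioma", choices=["es", "en"])
    reference_audio = gr.Dropdown(label="Audio de referencia", choices=["serio.wav", "neutral.wav", "alegre.wav"])
    generated_audio = gr.Audio(label="Audio generado", interactive=False)
    metrics_output = gr.Textbox(label="Métricas")
    gr.Button("Generar voz").click(
        predict,
        inputs=[input_text, language_selector, reference_audio],
        outputs=[generated_audio, metrics_output],
    )

if __name__ == "__main__":
    demo.launch()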