Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -65,7 +65,7 @@ DEVICE_ASSERT_PROMPT = None
|
|
65 |
DEVICE_ASSERT_LANG = None
|
66 |
supported_languages = config.languages
|
67 |
|
68 |
-
def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
|
69 |
try:
|
70 |
if use_mic:
|
71 |
speaker_wav = mic_file_path
|
@@ -75,15 +75,25 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
|
|
75 |
if len(prompt) < 2 or len(prompt) > 200:
|
76 |
return None, None, "El texto debe tener entre 2 y 200 caracteres."
|
77 |
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
out = model.inference(
|
81 |
prompt,
|
82 |
language,
|
83 |
gpt_cond_latent,
|
84 |
speaker_embedding,
|
85 |
-
|
86 |
-
|
87 |
)
|
88 |
|
89 |
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
@@ -94,10 +104,10 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
|
|
94 |
return gr.make_waveform("output.wav"), "output.wav", metrics_text
|
95 |
|
96 |
except Exception as e:
|
97 |
-
print(f"Error: {str(e)}")
|
98 |
return None, None, f"Error: {str(e)}"
|
99 |
|
100 |
-
# Interfaz de Gradio
|
101 |
with gr.Blocks(theme=gr.themes.Base()) as demo:
|
102 |
gr.Markdown("# Sintetizador de Voz XTTS")
|
103 |
|
@@ -111,6 +121,13 @@ with gr.Blocks(theme=gr.themes.Base()) as demo:
|
|
111 |
|
112 |
use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
generate_button = gr.Button("Generar voz")
|
115 |
|
116 |
with gr.Column():
|
@@ -120,7 +137,7 @@ with gr.Blocks(theme=gr.themes.Base()) as demo:
|
|
120 |
|
121 |
generate_button.click(
|
122 |
predict,
|
123 |
-
inputs=[input_text, language, audio_file, mic_file, use_mic],
|
124 |
outputs=[waveform, output_audio, metrics]
|
125 |
)
|
126 |
|
|
|
65 |
DEVICE_ASSERT_LANG = None
|
66 |
supported_languages = config.languages
|
67 |
|
68 |
+
def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, temperature, repetition_penalty, gpt_cond_len, gpt_cond_chunk_len, max_ref_length):
|
69 |
try:
|
70 |
if use_mic:
|
71 |
speaker_wav = mic_file_path
|
|
|
75 |
if len(prompt) < 2 or len(prompt) > 200:
|
76 |
return None, None, "El texto debe tener entre 2 y 200 caracteres."
|
77 |
|
78 |
+
# Ajustamos el tamaño del prompt si es necesario
|
79 |
+
max_prompt_length = model.config.max_text_token_len
|
80 |
+
if len(prompt) > max_prompt_length:
|
81 |
+
prompt = prompt[:max_prompt_length]
|
82 |
+
|
83 |
+
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
|
84 |
+
audio_path=speaker_wav,
|
85 |
+
gpt_cond_len=gpt_cond_len,
|
86 |
+
gpt_cond_chunk_len=gpt_cond_chunk_len,
|
87 |
+
max_ref_length=max_ref_length
|
88 |
+
)
|
89 |
|
90 |
out = model.inference(
|
91 |
prompt,
|
92 |
language,
|
93 |
gpt_cond_latent,
|
94 |
speaker_embedding,
|
95 |
+
temperature=temperature,
|
96 |
+
repetition_penalty=repetition_penalty,
|
97 |
)
|
98 |
|
99 |
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
|
|
104 |
return gr.make_waveform("output.wav"), "output.wav", metrics_text
|
105 |
|
106 |
except Exception as e:
|
107 |
+
print(f"Error detallado: {str(e)}")
|
108 |
return None, None, f"Error: {str(e)}"
|
109 |
|
110 |
+
# Interfaz de Gradio actualizada
|
111 |
with gr.Blocks(theme=gr.themes.Base()) as demo:
|
112 |
gr.Markdown("# Sintetizador de Voz XTTS")
|
113 |
|
|
|
121 |
|
122 |
use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])
|
123 |
|
124 |
+
# Nuevos sliders para parámetros de inferencia
|
125 |
+
temperature = gr.Slider(label="Temperatura", minimum=0.1, maximum=1.0, value=0.75, step=0.01)
|
126 |
+
repetition_penalty = gr.Slider(label="Penalización de repetición", minimum=1.0, maximum=10.0, value=5.0, step=0.1)
|
127 |
+
gpt_cond_len = gr.Slider(label="GPT Cond Len", minimum=10, maximum=50, value=30, step=1)
|
128 |
+
gpt_cond_chunk_len = gr.Slider(label="GPT Cond Chunk Len", minimum=1, maximum=10, value=4, step=1)
|
129 |
+
max_ref_length = gr.Slider(label="Max Ref Length", minimum=30, maximum=120, value=60, step=1)
|
130 |
+
|
131 |
generate_button = gr.Button("Generar voz")
|
132 |
|
133 |
with gr.Column():
|
|
|
137 |
|
138 |
generate_button.click(
|
139 |
predict,
|
140 |
+
inputs=[input_text, language, audio_file, mic_file, use_mic, temperature, repetition_penalty, gpt_cond_len, gpt_cond_chunk_len, max_ref_length],
|
141 |
outputs=[waveform, output_audio, metrics]
|
142 |
)
|
143 |
|