Spaces:
Running
Running
fixed saved voice persistence on reload
Browse files- app.py +95 -122
- voices.json +1 -1
app.py
CHANGED
@@ -16,6 +16,10 @@ import nltk
|
|
16 |
matplotlib.use("Agg") # Use non-interactive backend
|
17 |
import matplotlib.pyplot as plt
|
18 |
|
|
|
|
|
|
|
|
|
19 |
from text2speech import tts_randomized, parse_speed, tts_with_style_vector
|
20 |
|
21 |
# Constants and Paths
|
@@ -47,7 +51,6 @@ print(f"Using device: {device}")
|
|
47 |
# LOAD PCA MODEL AND ANNOTATED FEATURES
|
48 |
##############################################################################
|
49 |
|
50 |
-
# Load PCA model and annotated features
|
51 |
try:
|
52 |
pca = joblib.load(PCA_MODEL_PATH)
|
53 |
print("PCA model loaded successfully.")
|
@@ -90,12 +93,7 @@ def save_voices_json(data, path=VOICES_JSON_PATH):
|
|
90 |
def update_sliders(voice_name):
|
91 |
"""
|
92 |
Update slider values based on the selected predefined voice using reverse PCA.
|
93 |
-
|
94 |
-
Args:
|
95 |
-
voice_name (str): The name of the selected voice.
|
96 |
-
|
97 |
-
Returns:
|
98 |
-
list: A list of PCA component values to set the sliders.
|
99 |
"""
|
100 |
if not voice_name:
|
101 |
# Return default slider values (e.g., zeros) if no voice is selected
|
@@ -124,24 +122,16 @@ def update_sliders(voice_name):
|
|
124 |
def generate_audio_with_voice(text, voice_key, speed_val):
|
125 |
"""
|
126 |
Generate audio using the style vector of the selected predefined voice.
|
127 |
-
|
128 |
-
Args:
|
129 |
-
text (str): The text to synthesize.
|
130 |
-
voice_key (str): The name of the selected voice.
|
131 |
-
speed_val (float): The speed multiplier.
|
132 |
-
|
133 |
-
Returns:
|
134 |
-
tuple: (audio_tuple, style_vector)
|
135 |
"""
|
136 |
try:
|
137 |
# Load voices data
|
138 |
voices_data = load_voices_json()
|
139 |
-
|
140 |
if voice_key not in voices_data:
|
141 |
-
|
142 |
-
|
|
|
143 |
|
144 |
-
# Retrieve the style vector for the selected voice
|
145 |
style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
|
146 |
print(f"Selected Voice: {voice_key}")
|
147 |
print(f"Style Vector (First 6): {style_vector[0][:6]}")
|
@@ -149,7 +139,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
149 |
# Convert to torch tensor and move to device
|
150 |
style_vec_torch = torch.from_numpy(style_vector).float().to(device)
|
151 |
|
152 |
-
# Generate audio
|
153 |
audio_np = tts_with_style_vector(
|
154 |
text,
|
155 |
style_vec=style_vec_torch,
|
@@ -161,14 +151,12 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
161 |
)
|
162 |
|
163 |
if audio_np is None:
|
164 |
-
|
165 |
-
|
|
|
166 |
|
167 |
-
|
168 |
-
sr = 24000 # Adjust based on your actual sampling rate
|
169 |
audio_tuple = (sr, audio_np)
|
170 |
-
|
171 |
-
# Return audio, image, and style vector
|
172 |
return audio_tuple, style_vector.tolist()
|
173 |
|
174 |
except Exception as e:
|
@@ -177,7 +165,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
177 |
|
178 |
|
179 |
def build_modified_vector(voice_key, top6_values):
|
180 |
-
"""
|
181 |
voices_data = load_voices_json()
|
182 |
if voice_key not in voices_data:
|
183 |
print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
|
@@ -189,7 +177,6 @@ def build_modified_vector(voice_key, top6_values):
|
|
189 |
return None
|
190 |
|
191 |
try:
|
192 |
-
# Reconstruct the style vector using inverse PCA
|
193 |
pca_components = np.array(top6_values).reshape(1, -1)
|
194 |
reconstructed_vec = pca.inverse_transform(pca_components)[0]
|
195 |
return reconstructed_vec
|
@@ -198,39 +185,18 @@ def build_modified_vector(voice_key, top6_values):
|
|
198 |
return None
|
199 |
|
200 |
|
201 |
-
def
|
202 |
-
"""
|
203 |
-
Reconstruct the 256-dimensional style vector from PCA components.
|
204 |
-
"""
|
205 |
-
if pca is None:
|
206 |
-
print("PCA model is not loaded.")
|
207 |
-
return None
|
208 |
-
try:
|
209 |
-
return pca.inverse_transform([pca_components])[0]
|
210 |
-
except Exception as e:
|
211 |
-
print(f"Error during inverse PCA transform: {e}")
|
212 |
-
return None
|
213 |
-
|
214 |
-
|
215 |
-
def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values):
|
216 |
"""
|
217 |
-
Generate audio
|
218 |
-
Returns
|
219 |
-
- audio tuple (sr, np_array) for Gradio's Audio
|
220 |
-
- a PIL Image representing the style vector plot
|
221 |
-
- the final style vector as a list for State
|
222 |
"""
|
223 |
try:
|
224 |
-
speed_val = parse_speed(speed_str)
|
225 |
-
print(f"Parsed speed: {speed_val}")
|
226 |
-
|
227 |
if randomize:
|
228 |
# Generate randomized style vector
|
229 |
audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
|
230 |
if random_style_vec is None:
|
231 |
print("Failed to generate randomized style vector.")
|
232 |
-
return None, None
|
233 |
-
# Ensure the style vector is flat and on device
|
234 |
final_vec = (
|
235 |
random_style_vec.cpu().numpy().flatten()
|
236 |
if isinstance(random_style_vec, torch.Tensor)
|
@@ -238,20 +204,15 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
|
|
238 |
)
|
239 |
print("Randomized Style Vector (First 6):", final_vec[:6])
|
240 |
else:
|
241 |
-
# Reconstruct
|
242 |
reconstructed_vec = build_modified_vector(voice_key, slider_values)
|
243 |
if reconstructed_vec is None:
|
244 |
-
print(
|
245 |
-
|
246 |
-
)
|
247 |
-
return None, None, None
|
248 |
|
249 |
-
# Convert to torch tensor and move to device
|
250 |
style_vec_torch = (
|
251 |
torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
|
252 |
)
|
253 |
-
|
254 |
-
# Generate audio with the reconstructed style vector
|
255 |
audio_np = tts_with_style_vector(
|
256 |
text,
|
257 |
style_vec=style_vec_torch,
|
@@ -266,22 +227,22 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
|
|
266 |
|
267 |
if audio_np is None:
|
268 |
print("Audio generation failed.")
|
269 |
-
return None, None
|
270 |
|
271 |
-
|
272 |
-
sr = 24000 # Adjust based on your actual sampling rate
|
273 |
audio_tuple = (sr, audio_np)
|
274 |
-
|
275 |
-
# Return audio, image, and style vector
|
276 |
return audio_tuple, final_vec.tolist()
|
277 |
|
278 |
except Exception as e:
|
279 |
-
print(f"Error generating audio and style
|
280 |
-
return None, None
|
281 |
|
282 |
|
283 |
def save_style_to_json(style_data, style_name):
|
284 |
-
"""
|
|
|
|
|
|
|
285 |
if not style_name.strip():
|
286 |
return "Please enter a new style name before saving."
|
287 |
|
@@ -291,37 +252,37 @@ def save_style_to_json(style_data, style_name):
|
|
291 |
f"Style name '{style_name}' already exists. Please choose a different name."
|
292 |
)
|
293 |
|
294 |
-
# Ensure the style_data has the correct length
|
295 |
if len(style_data) != VECTOR_DIMENSION:
|
296 |
return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
|
297 |
|
298 |
-
# Save the style vector
|
299 |
voices_data[style_name] = style_data
|
300 |
save_voices_json(voices_data)
|
301 |
return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
|
302 |
|
303 |
|
304 |
-
# Gradio Interface Functions
|
305 |
-
|
306 |
-
|
307 |
def rearrange_voices(new_order):
|
308 |
-
"""
|
|
|
|
|
|
|
309 |
voices_data = load_voices_json()
|
310 |
new_order_list = [name.strip() for name in new_order.split(",")]
|
311 |
if not all(name in voices_data for name in new_order_list):
|
312 |
return "Error: New order contains invalid voice names.", list(
|
313 |
voices_data.keys()
|
314 |
)
|
|
|
315 |
ordered_data = OrderedDict()
|
316 |
for name in new_order_list:
|
317 |
ordered_data[name] = voices_data[name]
|
|
|
318 |
save_voices_json(ordered_data)
|
319 |
print(f"Voices rearranged: {list(ordered_data.keys())}")
|
320 |
return "Voices rearranged successfully.", list(ordered_data.keys())
|
321 |
|
322 |
|
323 |
def delete_voice(selected):
|
324 |
-
"""Delete voices from the voices.json."""
|
325 |
if not selected:
|
326 |
return "No voices selected for deletion.", list(load_voices_json().keys())
|
327 |
voices_data = load_voices_json()
|
@@ -334,14 +295,15 @@ def delete_voice(selected):
|
|
334 |
|
335 |
|
336 |
def upload_new_voices(uploaded_file):
|
337 |
-
"""Upload new voices from a JSON file."""
|
338 |
if uploaded_file is None:
|
339 |
return "No file uploaded.", list(load_voices_json().keys())
|
340 |
try:
|
341 |
uploaded_data = json.load(uploaded_file)
|
342 |
if not isinstance(uploaded_data, dict):
|
343 |
-
return
|
344 |
-
|
|
|
345 |
)
|
346 |
voices_data = load_voices_json()
|
347 |
voices_data.update(uploaded_data)
|
@@ -352,10 +314,13 @@ def upload_new_voices(uploaded_file):
|
|
352 |
return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
|
353 |
|
354 |
|
355 |
-
#
|
|
|
|
|
356 |
|
357 |
|
358 |
def create_combined_interface():
|
|
|
359 |
voices_data = load_voices_json()
|
360 |
voice_choices = list(voices_data.keys())
|
361 |
default_voice = voice_choices[0] if voice_choices else None
|
@@ -367,16 +332,12 @@ def create_combined_interface():
|
|
367 |
}
|
368 |
"""
|
369 |
|
370 |
-
def refresh_voices():
|
371 |
-
"""Refresh the voices by reloading the JSON."""
|
372 |
-
new_choices = list(load_voices_json().keys())
|
373 |
-
print(f"Voices refreshed: {new_choices}")
|
374 |
-
return gr.Dropdown(choices=new_choices)
|
375 |
-
|
376 |
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
|
377 |
gr.Markdown("# StyleTTS2 Studio - Build custom voices")
|
378 |
|
379 |
-
#
|
|
|
|
|
380 |
with gr.Tab("Text-to-Speech"):
|
381 |
gr.Markdown("### Generate Speech with Predefined Voices")
|
382 |
|
@@ -399,28 +360,29 @@ def create_combined_interface():
|
|
399 |
label="Speed (%)",
|
400 |
value=120,
|
401 |
)
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
audio_output = gr.Audio(label="Synthesized Audio")
|
406 |
|
407 |
-
# Generate
|
408 |
def on_generate_tts(text, voice, speed):
|
409 |
if not voice:
|
410 |
return None, "No voice selected."
|
411 |
speed_val = speed / 100 # Convert percentage to multiplier
|
412 |
-
|
413 |
-
if
|
414 |
-
return None,
|
415 |
-
return
|
416 |
|
417 |
generate_btn.click(
|
418 |
fn=on_generate_tts,
|
419 |
inputs=[text_input, voice_dropdown, speed_slider],
|
420 |
-
outputs=[audio_output,
|
421 |
)
|
422 |
|
423 |
-
#
|
|
|
|
|
424 |
with gr.Tab("Voice Studio"):
|
425 |
gr.Markdown("### Customize and Create New Voices")
|
426 |
|
@@ -463,18 +425,16 @@ def create_combined_interface():
|
|
463 |
# State to hold the last style vector
|
464 |
style_vector_state_studio = gr.State()
|
465 |
|
466 |
-
# Generate
|
467 |
def on_generate_studio(text, voice, speed, *pca_values):
|
468 |
if not voice:
|
469 |
return None, "No voice selected.", None
|
470 |
-
speed_val = speed / 100
|
471 |
-
|
472 |
text, voice, False, speed_val, *pca_values
|
473 |
)
|
474 |
-
if
|
475 |
return None, "Failed to generate audio.", None
|
476 |
-
audio_tuple, style_vector = result
|
477 |
-
style_vector_state_studio.value = style_vector
|
478 |
return audio_tuple, "Audio generated successfully.", style_vector
|
479 |
|
480 |
generate_btn_studio.click(
|
@@ -484,43 +444,56 @@ def create_combined_interface():
|
|
484 |
outputs=[audio_output_studio, status_text, style_vector_state_studio],
|
485 |
)
|
486 |
|
|
|
487 |
def on_save_style_studio(style_vector, style_name):
|
488 |
-
|
|
|
489 |
return (
|
490 |
-
"Please enter a name for the new voice!",
|
491 |
-
gr.
|
492 |
-
|
493 |
-
), # Return a new Dropdown instance with empty choices
|
494 |
-
gr.Dropdown(
|
495 |
-
choices=[]
|
496 |
-
), # Return a new Dropdown instance with empty choices
|
497 |
)
|
|
|
498 |
result = save_style_to_json(style_vector, style_name)
|
|
|
499 |
new_choices = list(load_voices_json().keys())
|
500 |
-
|
|
|
501 |
return (
|
502 |
-
gr.
|
503 |
-
|
504 |
-
),
|
505 |
-
gr.Dropdown(
|
506 |
-
choices=new_choices
|
507 |
-
), # Return a new Dropdown instance with updated choices
|
508 |
-
result, # Status message
|
509 |
)
|
510 |
|
511 |
save_btn_studio.click(
|
512 |
fn=on_save_style_studio,
|
513 |
inputs=[style_vector_state_studio, new_style_name],
|
514 |
-
|
|
|
515 |
)
|
516 |
|
517 |
-
#
|
518 |
voice_dropdown_studio.change(
|
519 |
fn=update_sliders,
|
520 |
inputs=voice_dropdown_studio,
|
521 |
outputs=pca_sliders,
|
522 |
)
|
523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
524 |
gr.Markdown(
|
525 |
"#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
|
526 |
)
|
@@ -531,6 +504,6 @@ def create_combined_interface():
|
|
531 |
if __name__ == "__main__":
|
532 |
try:
|
533 |
interface = create_combined_interface()
|
534 |
-
interface.launch(share=False)
|
535 |
except Exception as e:
|
536 |
print(f"An error occurred while launching the interface: {e}")
|
|
|
16 |
matplotlib.use("Agg") # Use non-interactive backend
|
17 |
import matplotlib.pyplot as plt
|
18 |
|
19 |
+
# -------------------------------------------------------------------
|
20 |
+
# IMPORT OR DEFINE YOUR TEXT-TO-SPEECH FUNCTIONS
|
21 |
+
# (Adjust these imports to match your local TTS code)
|
22 |
+
# -------------------------------------------------------------------
|
23 |
from text2speech import tts_randomized, parse_speed, tts_with_style_vector
|
24 |
|
25 |
# Constants and Paths
|
|
|
51 |
# LOAD PCA MODEL AND ANNOTATED FEATURES
|
52 |
##############################################################################
|
53 |
|
|
|
54 |
try:
|
55 |
pca = joblib.load(PCA_MODEL_PATH)
|
56 |
print("PCA model loaded successfully.")
|
|
|
93 |
def update_sliders(voice_name):
|
94 |
"""
|
95 |
Update slider values based on the selected predefined voice using reverse PCA.
|
96 |
+
Returns a list of PCA component values to set the sliders.
|
|
|
|
|
|
|
|
|
|
|
97 |
"""
|
98 |
if not voice_name:
|
99 |
# Return default slider values (e.g., zeros) if no voice is selected
|
|
|
122 |
def generate_audio_with_voice(text, voice_key, speed_val):
|
123 |
"""
|
124 |
Generate audio using the style vector of the selected predefined voice.
|
125 |
+
Returns (audio_tuple, style_vector) or (None, error_message).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
"""
|
127 |
try:
|
128 |
# Load voices data
|
129 |
voices_data = load_voices_json()
|
|
|
130 |
if voice_key not in voices_data:
|
131 |
+
msg = f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}."
|
132 |
+
print(msg)
|
133 |
+
return None, msg
|
134 |
|
|
|
135 |
style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
|
136 |
print(f"Selected Voice: {voice_key}")
|
137 |
print(f"Style Vector (First 6): {style_vector[0][:6]}")
|
|
|
139 |
# Convert to torch tensor and move to device
|
140 |
style_vec_torch = torch.from_numpy(style_vector).float().to(device)
|
141 |
|
142 |
+
# Generate audio
|
143 |
audio_np = tts_with_style_vector(
|
144 |
text,
|
145 |
style_vec=style_vec_torch,
|
|
|
151 |
)
|
152 |
|
153 |
if audio_np is None:
|
154 |
+
msg = "Audio generation failed."
|
155 |
+
print(msg)
|
156 |
+
return None, msg
|
157 |
|
158 |
+
sr = 24000
|
|
|
159 |
audio_tuple = (sr, audio_np)
|
|
|
|
|
160 |
return audio_tuple, style_vector.tolist()
|
161 |
|
162 |
except Exception as e:
|
|
|
165 |
|
166 |
|
167 |
def build_modified_vector(voice_key, top6_values):
|
168 |
+
"""Reconstruct a style vector by applying inverse PCA on the given 6 slider values."""
|
169 |
voices_data = load_voices_json()
|
170 |
if voice_key not in voices_data:
|
171 |
print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
|
|
|
177 |
return None
|
178 |
|
179 |
try:
|
|
|
180 |
pca_components = np.array(top6_values).reshape(1, -1)
|
181 |
reconstructed_vec = pca.inverse_transform(pca_components)[0]
|
182 |
return reconstructed_vec
|
|
|
185 |
return None
|
186 |
|
187 |
|
188 |
+
def generate_custom_audio(text, voice_key, randomize, speed_val, *slider_values):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
"""
|
190 |
+
Generate audio with either a random style vector or a reconstructed vector
|
191 |
+
from the 6 PCA sliders. Returns (audio_tuple, style_vector) or (None, None).
|
|
|
|
|
|
|
192 |
"""
|
193 |
try:
|
|
|
|
|
|
|
194 |
if randomize:
|
195 |
# Generate randomized style vector
|
196 |
audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
|
197 |
if random_style_vec is None:
|
198 |
print("Failed to generate randomized style vector.")
|
199 |
+
return None, None
|
|
|
200 |
final_vec = (
|
201 |
random_style_vec.cpu().numpy().flatten()
|
202 |
if isinstance(random_style_vec, torch.Tensor)
|
|
|
204 |
)
|
205 |
print("Randomized Style Vector (First 6):", final_vec[:6])
|
206 |
else:
|
207 |
+
# Reconstruct vector from PCA sliders
|
208 |
reconstructed_vec = build_modified_vector(voice_key, slider_values)
|
209 |
if reconstructed_vec is None:
|
210 |
+
print("No reconstructed vector. Skipping audio generation.")
|
211 |
+
return None, None
|
|
|
|
|
212 |
|
|
|
213 |
style_vec_torch = (
|
214 |
torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
|
215 |
)
|
|
|
|
|
216 |
audio_np = tts_with_style_vector(
|
217 |
text,
|
218 |
style_vec=style_vec_torch,
|
|
|
227 |
|
228 |
if audio_np is None:
|
229 |
print("Audio generation failed.")
|
230 |
+
return None, None
|
231 |
|
232 |
+
sr = 24000
|
|
|
233 |
audio_tuple = (sr, audio_np)
|
|
|
|
|
234 |
return audio_tuple, final_vec.tolist()
|
235 |
|
236 |
except Exception as e:
|
237 |
+
print(f"Error generating audio and style: {e}")
|
238 |
+
return None, None
|
239 |
|
240 |
|
241 |
def save_style_to_json(style_data, style_name):
|
242 |
+
"""
|
243 |
+
Saves the provided style_data (list of floats) into voices.json under style_name.
|
244 |
+
Returns a status message.
|
245 |
+
"""
|
246 |
if not style_name.strip():
|
247 |
return "Please enter a new style name before saving."
|
248 |
|
|
|
252 |
f"Style name '{style_name}' already exists. Please choose a different name."
|
253 |
)
|
254 |
|
|
|
255 |
if len(style_data) != VECTOR_DIMENSION:
|
256 |
return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
|
257 |
|
|
|
258 |
voices_data[style_name] = style_data
|
259 |
save_voices_json(voices_data)
|
260 |
return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
|
261 |
|
262 |
|
|
|
|
|
|
|
263 |
def rearrange_voices(new_order):
|
264 |
+
"""
|
265 |
+
Rearrange the voices in voices.json based on the comma-separated `new_order`.
|
266 |
+
Returns (status_msg, updated_list_of_voices).
|
267 |
+
"""
|
268 |
voices_data = load_voices_json()
|
269 |
new_order_list = [name.strip() for name in new_order.split(",")]
|
270 |
if not all(name in voices_data for name in new_order_list):
|
271 |
return "Error: New order contains invalid voice names.", list(
|
272 |
voices_data.keys()
|
273 |
)
|
274 |
+
|
275 |
ordered_data = OrderedDict()
|
276 |
for name in new_order_list:
|
277 |
ordered_data[name] = voices_data[name]
|
278 |
+
|
279 |
save_voices_json(ordered_data)
|
280 |
print(f"Voices rearranged: {list(ordered_data.keys())}")
|
281 |
return "Voices rearranged successfully.", list(ordered_data.keys())
|
282 |
|
283 |
|
284 |
def delete_voice(selected):
|
285 |
+
"""Delete voices from the voices.json. Returns (status_msg, updated_list_of_voices)."""
|
286 |
if not selected:
|
287 |
return "No voices selected for deletion.", list(load_voices_json().keys())
|
288 |
voices_data = load_voices_json()
|
|
|
295 |
|
296 |
|
297 |
def upload_new_voices(uploaded_file):
|
298 |
+
"""Upload new voices from a JSON file. Returns (status_msg, updated_list_of_voices)."""
|
299 |
if uploaded_file is None:
|
300 |
return "No file uploaded.", list(load_voices_json().keys())
|
301 |
try:
|
302 |
uploaded_data = json.load(uploaded_file)
|
303 |
if not isinstance(uploaded_data, dict):
|
304 |
+
return (
|
305 |
+
"Invalid JSON format. Expected a dictionary of voices.",
|
306 |
+
list(load_voices_json().keys()),
|
307 |
)
|
308 |
voices_data = load_voices_json()
|
309 |
voices_data.update(uploaded_data)
|
|
|
314 |
return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
|
315 |
|
316 |
|
317 |
+
# -------------------------------------------------------------------
|
318 |
+
# GRADIO INTERFACE
|
319 |
+
# -------------------------------------------------------------------
|
320 |
|
321 |
|
322 |
def create_combined_interface():
|
323 |
+
# We'll initially load the voices to get a default set for the dropdown
|
324 |
voices_data = load_voices_json()
|
325 |
voice_choices = list(voices_data.keys())
|
326 |
default_voice = voice_choices[0] if voice_choices else None
|
|
|
332 |
}
|
333 |
"""
|
334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
|
336 |
gr.Markdown("# StyleTTS2 Studio - Build custom voices")
|
337 |
|
338 |
+
# -------------------------------------------------------
|
339 |
+
# 1) Text-to-Speech Tab
|
340 |
+
# -------------------------------------------------------
|
341 |
with gr.Tab("Text-to-Speech"):
|
342 |
gr.Markdown("### Generate Speech with Predefined Voices")
|
343 |
|
|
|
360 |
label="Speed (%)",
|
361 |
value=120,
|
362 |
)
|
363 |
+
generate_btn = gr.Button("Generate Audio")
|
364 |
+
status_tts = gr.Textbox(label="Status", visible=False)
|
|
|
365 |
audio_output = gr.Audio(label="Synthesized Audio")
|
366 |
|
367 |
+
# Generate TTS callback
|
368 |
def on_generate_tts(text, voice, speed):
|
369 |
if not voice:
|
370 |
return None, "No voice selected."
|
371 |
speed_val = speed / 100 # Convert percentage to multiplier
|
372 |
+
audio_result, msg = generate_audio_with_voice(text, voice, speed_val)
|
373 |
+
if audio_result is None:
|
374 |
+
return None, msg
|
375 |
+
return audio_result, "Audio generated successfully."
|
376 |
|
377 |
generate_btn.click(
|
378 |
fn=on_generate_tts,
|
379 |
inputs=[text_input, voice_dropdown, speed_slider],
|
380 |
+
outputs=[audio_output, status_tts],
|
381 |
)
|
382 |
|
383 |
+
# -------------------------------------------------------
|
384 |
+
# 2) Voice Studio Tab
|
385 |
+
# -------------------------------------------------------
|
386 |
with gr.Tab("Voice Studio"):
|
387 |
gr.Markdown("### Customize and Create New Voices")
|
388 |
|
|
|
425 |
# State to hold the last style vector
|
426 |
style_vector_state_studio = gr.State()
|
427 |
|
428 |
+
# Generate customized audio callback
|
429 |
def on_generate_studio(text, voice, speed, *pca_values):
|
430 |
if not voice:
|
431 |
return None, "No voice selected.", None
|
432 |
+
speed_val = speed / 100
|
433 |
+
audio_tuple, style_vector = generate_custom_audio(
|
434 |
text, voice, False, speed_val, *pca_values
|
435 |
)
|
436 |
+
if audio_tuple is None:
|
437 |
return None, "Failed to generate audio.", None
|
|
|
|
|
438 |
return audio_tuple, "Audio generated successfully.", style_vector
|
439 |
|
440 |
generate_btn_studio.click(
|
|
|
444 |
outputs=[audio_output_studio, status_text, style_vector_state_studio],
|
445 |
)
|
446 |
|
447 |
+
# Save customized voice callback
|
448 |
def on_save_style_studio(style_vector, style_name):
|
449 |
+
"""Save the new style, then update the dropdown choices."""
|
450 |
+
if not style_vector or not style_name:
|
451 |
return (
|
452 |
+
gr.update(value="Please enter a name for the new voice!"),
|
453 |
+
gr.update(),
|
454 |
+
gr.update(),
|
|
|
|
|
|
|
|
|
455 |
)
|
456 |
+
# Save the style
|
457 |
result = save_style_to_json(style_vector, style_name)
|
458 |
+
# Reload the voices to get the new list
|
459 |
new_choices = list(load_voices_json().keys())
|
460 |
+
|
461 |
+
# Return dictionary updates to existing components
|
462 |
return (
|
463 |
+
gr.update(value=result),
|
464 |
+
gr.update(choices=new_choices),
|
465 |
+
gr.update(choices=new_choices),
|
|
|
|
|
|
|
|
|
466 |
)
|
467 |
|
468 |
save_btn_studio.click(
|
469 |
fn=on_save_style_studio,
|
470 |
inputs=[style_vector_state_studio, new_style_name],
|
471 |
+
# We update: status_text, voice_dropdown, voice_dropdown_studio
|
472 |
+
outputs=[status_text, voice_dropdown, voice_dropdown_studio],
|
473 |
)
|
474 |
|
475 |
+
# Update sliders callback
|
476 |
voice_dropdown_studio.change(
|
477 |
fn=update_sliders,
|
478 |
inputs=voice_dropdown_studio,
|
479 |
outputs=pca_sliders,
|
480 |
)
|
481 |
|
482 |
+
# -------------------------------------------------------
|
483 |
+
# Optionally: Reload voices on page load
|
484 |
+
# -------------------------------------------------------
|
485 |
+
def on_page_load():
|
486 |
+
new_choices = list(load_voices_json().keys())
|
487 |
+
return {
|
488 |
+
voice_dropdown: gr.update(choices=new_choices),
|
489 |
+
voice_dropdown_studio: gr.update(choices=new_choices),
|
490 |
+
}
|
491 |
+
|
492 |
+
# This automatically refreshes dropdowns every time the user loads/refreshes the page
|
493 |
+
demo.load(
|
494 |
+
on_page_load, inputs=None, outputs=[voice_dropdown, voice_dropdown_studio]
|
495 |
+
)
|
496 |
+
|
497 |
gr.Markdown(
|
498 |
"#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
|
499 |
)
|
|
|
504 |
if __name__ == "__main__":
|
505 |
try:
|
506 |
interface = create_combined_interface()
|
507 |
+
interface.launch(share=False) # or share=True if you want a public share link
|
508 |
except Exception as e:
|
509 |
print(f"An error occurred while launching the interface: {e}")
|
voices.json
CHANGED
@@ -2837,4 +2837,4 @@
|
|
2837 |
0.057131367030820654,
|
2838 |
-0.0762246848122452
|
2839 |
]
|
2840 |
-
}
|
|
|
2837 |
0.057131367030820654,
|
2838 |
-0.0762246848122452
|
2839 |
]
|
2840 |
+
}
|