Blakus committed (verified)
Commit aa65645
1 Parent(s): 2cf7c96

Update app.py

Files changed (1)
  1. app.py +38 -695
app.py CHANGED
@@ -1,707 +1,50 @@
- import sys
- import io, os, stat
- import subprocess
- import random
- from zipfile import ZipFile
- import uuid
- import time
- import torch
- import torchaudio
-
- #download for mecab
- os.system('python -m unidic download')
-
- # By using XTTS you agree to CPML license https://coqui.ai/cpml
- os.environ["COQUI_TOS_AGREED"] = "1"
-
- # langid is used to detect language for longer text
- # Most users expect text to be their own language, there is checkbox to disable it
- import langid
- import base64
- import csv
- from io import StringIO
- import datetime
- import re
-
  import gradio as gr
- from scipy.io.wavfile import write
- from pydub import AudioSegment
-
  from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
- from TTS.utils.generic_utils import get_user_data_dir
-
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- from huggingface_hub import hf_hub_download
- import os
- from TTS.utils.manage import get_user_data_dir
-
- # Authentication with the Hugging Face API
- repo_id = "Blakus/Pedro_Lab_XTTS"
- local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
-
- # Create the directory if it does not exist
- os.makedirs(local_dir, exist_ok=True)
-
- # List of required files
- files_to_download = ["config.json", "model.pth", "vocab.json"]
-
- # Download each file from the repository
- for file_name in files_to_download:
-     print(f"Downloading {file_name} from {repo_id}")
-     local_file_path = os.path.join(local_dir, file_name)
-     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)
-
- # Load configuration and model
- config_path = os.path.join(local_dir, "config.json")
- checkpoint_path = os.path.join(local_dir, "model.pth")
- vocab_path = os.path.join(local_dir, "vocab.json")
 
  config = XttsConfig()
- config.load_json(config_path)
-
  model = Xtts.init_from_config(config)
- model.load_checkpoint(
-     config,
-     checkpoint_path=checkpoint_path,
-     vocab_path=vocab_path,
-     eval=True,
-     use_deepspeed=False,  # We disable DeepSpeed for CPU execution
- )
-
- # Do not transfer the model to the GPU
- print("Modelo cargado en CPU")
-
- # This is for debugging purposes only
- DEVICE_ASSERT_DETECTED = 0
- DEVICE_ASSERT_PROMPT = None
- DEVICE_ASSERT_LANG = None
-
- supported_languages = config.languages
-
- def predict(
-     prompt,
-     language,
-     audio_file_pth,
-     mic_file_path,
-     use_mic,
-     voice_cleanup,
-     no_lang_auto_detect,
-     agree,
- ):
-     if agree == True:
-         if language not in supported_languages:
-             gr.Warning(
-                 f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
-             )
-
-             return (
-                 None,
-                 None,
-                 None,
-                 None,
-             )
-
-         language_predicted = langid.classify(prompt)[
-             0
-         ].strip()  # strip need as there is space at end!
-
-         # tts expects chinese as zh-cn
-         if language_predicted == "zh":
-             # we use zh-cn
-             language_predicted = "zh-cn"
-
-         print(f"Detected language:{language_predicted}, Chosen language:{language}")
-
-         # After text character length 15 trigger language detection
-         if len(prompt) > 15:
-             # allow any language for short text as some may be common
-             # If user unchecks language autodetection it will not trigger
-             # You may remove this completely for own use
-             if language_predicted != language and not no_lang_auto_detect:
-                 # Please duplicate and remove this check if you really want this
-                 # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
-                 gr.Warning(
-                     f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
-                 )
-
-                 return (
-                     None,
-                     None,
-                     None,
-                     None,
-                 )
-
-         if use_mic == True:
-             if mic_file_path is not None:
-                 speaker_wav = mic_file_path
-             else:
-                 gr.Warning(
-                     "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
-                 )
-                 return (
-                     None,
-                     None,
-                     None,
-                     None,
-                 )
-
-         else:
-             speaker_wav = audio_file_pth
-
-         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
-         # This is fast filtering not perfect
-
-         # Apply all on demand
-         lowpassfilter = denoise = trim = loudness = True
-
-         if lowpassfilter:
-             lowpass_highpass = "lowpass=8000,highpass=75,"
-         else:
-             lowpass_highpass = ""
-
-         if trim:
-             # better to remove silence in beginning and end for microphone
-             trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
-         else:
-             trim_silence = ""
-
-         if voice_cleanup:
-             try:
-                 out_filename = (
-                     speaker_wav + str(uuid.uuid4()) + ".wav"
-                 ) # ffmpeg to know output format
-
-                 # we will use newer ffmpeg as that has afftn denoise filter
-                 shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
-                     " "
-                 )
-
-                 command_result = subprocess.run(
-                     [item for item in shell_command],
-                     capture_output=False,
-                     text=True,
-                     check=True,
-                 )
-                 speaker_wav = out_filename
-                 print("Filtered microphone input")
-             except subprocess.CalledProcessError:
-                 # There was an error - command exited with non-zero code
-                 print("Error: failed filtering, use original microphone input")
-         else:
-             speaker_wav = speaker_wav
-
-         if len(prompt) < 2:
-             gr.Warning("Please give a longer prompt text")
-             return (
-                 None,
-                 None,
-                 None,
-                 None,
-             )
-         if len(prompt) > 200:
-             gr.Warning(
-                 "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
-             )
-             return (
-                 None,
-                 None,
-                 None,
-                 None,
-             )
-         global DEVICE_ASSERT_DETECTED
-         if DEVICE_ASSERT_DETECTED:
-             global DEVICE_ASSERT_PROMPT
-             global DEVICE_ASSERT_LANG
-             # It will likely never come here as we restart space on first unrecoverable error now
-             print(
-                 f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
-             )
-
-             # HF Space specific.. This error is unrecoverable need to restart space
-             space = api.get_space_runtime(repo_id=repo_id)
-             if space.stage!="BUILDING":
-                 api.restart_space(repo_id=repo_id)
-             else:
-                 print("TRIED TO RESTART but space is building")
-
-         try:
-             metrics_text = ""
-             t_latent = time.time()
-
-             # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
-             try:
-                 (
-                     gpt_cond_latent,
-                     speaker_embedding,
-                 ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
-             except Exception as e:
-                 print("Speaker encoding error", str(e))
-                 gr.Warning(
-                     "It appears something wrong with reference, did you unmute your microphone?"
-                 )
-                 return (
-                     None,
-                     None,
-                     None,
-                     None,
-                 )
-
-             latent_calculation_time = time.time() - t_latent
-             # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
-
-             # temporary comma fix
-             prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
-
-             wav_chunks = []
-             ## Direct mode
-
-             print("I: Generating new audio...")
-             t0 = time.time()
-             out = model.inference(
-                 prompt,
-                 language,
-                 gpt_cond_latent,
-                 speaker_embedding,
-                 repetition_penalty=5.0,
-                 temperature=0.75,
-             )
-             inference_time = time.time() - t0
-             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
-             metrics_text+=f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
-             real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
-             print(f"Real-time factor (RTF): {real_time_factor}")
-             metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
-             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
-
 
- """
281
- print("I: Generating new audio in streaming mode...")
282
- t0 = time.time()
283
- chunks = model.inference_stream(
284
- prompt,
285
- language,
286
- gpt_cond_latent,
287
- speaker_embedding,
288
- repetition_penalty=7.0,
289
- temperature=0.85,
290
- )
291
-
292
- first_chunk = True
293
- for i, chunk in enumerate(chunks):
294
- if first_chunk:
295
- first_chunk_time = time.time() - t0
296
- metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
297
- first_chunk = False
298
- wav_chunks.append(chunk)
299
- print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
300
- inference_time = time.time() - t0
301
- print(
302
- f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
303
- )
304
- #metrics_text += (
305
- # f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
306
- #)
307
-
308
- wav = torch.cat(wav_chunks, dim=0)
309
- print(wav.shape)
310
- real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
311
- print(f"Real-time factor (RTF): {real_time_factor}")
312
- metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
313
-
314
- torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
315
- """
316
-
317
- except RuntimeError as e:
318
- if "device-side assert" in str(e):
319
- # cannot do anything on cuda device side error, need tor estart
320
- print(
321
- f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
322
- flush=True,
323
- )
324
- gr.Warning("Unhandled Exception encounter, please retry in a minute")
325
- print("Cuda device-assert Runtime encountered need restart")
326
- if not DEVICE_ASSERT_DETECTED:
327
- DEVICE_ASSERT_DETECTED = 1
328
- DEVICE_ASSERT_PROMPT = prompt
329
- DEVICE_ASSERT_LANG = language
330
-
331
- # just before restarting save what caused the issue so we can handle it in future
332
- # Uploading Error data only happens for unrecovarable error
333
- error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
334
- error_data = [
335
- error_time,
336
- prompt,
337
- language,
338
- audio_file_pth,
339
- mic_file_path,
340
- use_mic,
341
- voice_cleanup,
342
- no_lang_auto_detect,
343
- agree,
344
- ]
345
- error_data = [str(e) if type(e) != str else e for e in error_data]
346
- print(error_data)
347
- print(speaker_wav)
348
- write_io = StringIO()
349
- csv.writer(write_io).writerows([error_data])
350
- csv_upload = write_io.getvalue().encode()
351
-
352
- filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
353
- print("Writing error csv")
354
- error_api = HfApi()
355
- error_api.upload_file(
356
- path_or_fileobj=csv_upload,
357
- path_in_repo=filename,
358
- repo_id="coqui/xtts-flagged-dataset",
359
- repo_type="dataset",
360
- )
361
-
362
- # speaker_wav
363
- print("Writing error reference audio")
364
- speaker_filename = (
365
- error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
366
- )
367
- error_api = HfApi()
368
- error_api.upload_file(
369
- path_or_fileobj=speaker_wav,
370
- path_in_repo=speaker_filename,
371
- repo_id="coqui/xtts-flagged-dataset",
372
- repo_type="dataset",
373
- )
374
-
375
- # HF Space specific.. This error is unrecoverable need to restart space
376
- space = api.get_space_runtime(repo_id=repo_id)
377
- if space.stage!="BUILDING":
378
- api.restart_space(repo_id=repo_id)
379
- else:
380
- print("TRIED TO RESTART but space is building")
381
-
382
- else:
383
- if "Failed to decode" in str(e):
384
- print("Speaker encoding error", str(e))
385
- gr.Warning(
386
- "It appears something wrong with reference, did you unmute your microphone?"
387
- )
388
- else:
389
- print("RuntimeError: non device-side assert error:", str(e))
390
- gr.Warning("Something unexpected happened please retry again.")
391
- return (
392
- None,
393
- None,
394
- None,
395
- None,
396
- )
397
- return (
398
- gr.make_waveform(
399
- audio="output.wav",
400
- ),
401
- "output.wav",
402
- metrics_text,
403
- speaker_wav,
404
- )
405
      else:
-         gr.Warning("Please accept the Terms & Condition!")
-         return (
-             None,
-             None,
-             None,
-             None,
-         )
-
-
- title = "Coqui🐸 XTTS"
-
- description = """
-
- <br/>
-
- This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
-
- <br/>
-
- Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
-
- <br/>
- """
-
- links = """
- <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
-
- | | |
- | ------------------------------- | --------------------------------------- |
- | 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
- | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
- | 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
- | 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
-
-
- """
-
- article = """
- <div style='margin:20px auto;'>
- <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
- <p>We collect data only for error cases for improvement.</p>
- </div>
- """
- examples = [
-     [
-         "Once when I was six years old I saw a magnificent picture",
-         "en",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
-         "fr",
-         "examples/male.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
-         "de",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Cuando tenía seis años, vi una vez una imagen magnífica",
-         "es",
-         "examples/male.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
-         "pt",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
-         "pl",
-         "examples/male.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
-         "it",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
-         "tr",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Когда мне было шесть лет, я увидел однажды удивительную картинку",
-         "ru",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
-         "nl",
-         "examples/male.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
-         "cs",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "当我还只有六岁的时候, 看到了一副精彩的插画",
-         "zh-cn",
-         "examples/female.wav",
-         None,
-         False,
-         False,
-         False,
-         True,
-     ],
-     [
-         "かつて 六歳のとき、素晴らしい絵を見ました",
-         "ja",
-         "examples/female.wav",
-         None,
-         False,
-         True,
-         False,
-         True,
-     ],
-     [
-         "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
-         "ko",
-         "examples/female.wav",
-         None,
-         False,
-         True,
-         False,
-         True,
-     ],
-     [
-         "Egyszer hat éves koromban láttam egy csodálatos képet",
-         "hu",
-         "examples/male.wav",
-         None,
-         False,
-         True,
-         False,
-         True,
-     ],
- ]
-
-
-
- with gr.Blocks(analytics_enabled=False) as demo:
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown(
-                 """
-                 ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
-                 """
-             )
-         with gr.Column():
-             # placeholder to align the image
-             pass
-
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown(description)
-         with gr.Column():
-             gr.Markdown(links)
-
      with gr.Row():
          with gr.Column():
-             input_text_gr = gr.Textbox(
-                 label="Text Prompt",
-                 info="One or two sentences at a time is better. Up to 200 text characters.",
-                 value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
-             )
-             language_gr = gr.Dropdown(
-                 label="Language",
-                 info="Select an output language for the synthesised speech",
-                 choices=[
-                     "en",
-                     "es",
-                     "fr",
-                     "de",
-                     "it",
-                     "pt",
-                     "pl",
-                     "tr",
-                     "ru",
-                     "nl",
-                     "cs",
-                     "ar",
-                     "zh-cn",
-                     "ja",
-                     "ko",
-                     "hu",
-                     "hi"
-                 ],
-                 max_choices=1,
-                 value="en",
-             )
-             ref_gr = gr.Audio(
-                 label="Reference Audio",
-                 info="Click on the ✎ button to upload your own target speaker audio",
-                 type="filepath",
-                 value="examples/female.wav",
-             )
-             mic_gr = gr.Audio(
-                 source="microphone",
-                 type="filepath",
-                 info="Use your microphone to record audio",
-                 label="Use Microphone for Reference",
-             )
-             use_mic_gr = gr.Checkbox(
-                 label="Use Microphone",
-                 value=False,
-                 info="Notice: Microphone input may not work properly under traffic",
-             )
-             clean_ref_gr = gr.Checkbox(
-                 label="Cleanup Reference Voice",
-                 value=False,
-                 info="This check can improve output if your microphone or reference voice is noisy",
-             )
-             auto_det_lang_gr = gr.Checkbox(
-                 label="Do not use language auto-detect",
-                 value=False,
-                 info="Check to disable language auto-detection",
-             )
-             tos_gr = gr.Checkbox(
-                 label="Agree",
-                 value=False,
-                 info="I agree to the terms of the CPML: https://coqui.ai/cpml",
-             )
-
-             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-
-
          with gr.Column():
-             video_gr = gr.Video(label="Waveform Visual")
-             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
-             out_text_gr = gr.Text(label="Metrics")
-             ref_audio_gr = gr.Audio(label="Reference Audio Used")
-
-     with gr.Row():
-         gr.Examples(examples,
-                     label="Examples",
-                     inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
-                     outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
-                     fn=predict,
-                     cache_examples=False,)
-
-     tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
-
- demo.queue()
- demo.launch(debug=True, show_api=True)

  import gradio as gr
  from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts

+ # Load the XTTS model (assuming it is already downloaded and configured)
  config = XttsConfig()
+ config.load_json("ruta/al/config.json")
  model = Xtts.init_from_config(config)
+ model.load_checkpoint(config, checkpoint_path="ruta/al/modelo.pth")

+ def sintetizar_voz(texto, idioma, audio_referencia, usar_microfono, audio_microfono):
+     if usar_microfono:
+         audio_entrada = audio_microfono
      else:
+         audio_entrada = audio_referencia
+
+     # The voice-synthesis logic using the XTTS model would go here
+     # For simplicity, this is a placeholder
+     audio_salida = model.tts(texto, speaker_wav=audio_entrada, language=idioma)
+
+     return audio_salida, "Métricas de síntesis irían aquí"
+
+ with gr.Blocks(theme=gr.themes.Base()) as demo:
+     gr.Markdown("# Sintetizador de Voz XTTS")
+
      with gr.Row():
          with gr.Column():
+             texto_entrada = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
+             idioma = gr.Dropdown(label="Idioma", choices=["es", "en", "fr", "de", "it"], value="es")
+             audio_referencia = gr.Audio(label="Audio de referencia", type="filepath")
+             usar_microfono = gr.Checkbox(label="Usar micrófono")
+             audio_microfono = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)
+
+             usar_microfono.change(fn=lambda x: gr.update(visible=x), inputs=[usar_microfono], outputs=[audio_microfono])
+
+             boton_sintetizar = gr.Button("Sintetizar")
+
          with gr.Column():
+             audio_salida = gr.Audio(label="Audio sintetizado")
+             waveform = gr.Image(label="Forma de onda")
+             metricas = gr.Textbox(label="Métricas")
+
+     boton_sintetizar.click(
+         sintetizar_voz,
+         inputs=[texto_entrada, idioma, audio_referencia, usar_microfono, audio_microfono],
+         outputs=[audio_salida, metricas]
+     )
+
+ demo.launch()
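
Note on the placeholder in `sintetizar_voz`: the new file leaves the synthesis step as a stub, and the `model.tts(...)` call matches the high-level `TTS.api` wrapper rather than the low-level `Xtts` model loaded here. Below is a minimal, hedged sketch of how the stub could be filled in by reusing the download, loading, and inference calls from the version of app.py that this commit removes; the repo id, file names, and generation parameters are copied from that old code and should be treated as assumptions, not as part of this commit.

```python
# Sketch only (not part of this commit): fill in the placeholder synthesis step
# with the same Coqui XTTS calls the removed app.py used.
import os
import time

import torch
import torchaudio
from huggingface_hub import hf_hub_download
from TTS.utils.manage import get_user_data_dir
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

repo_id = "Blakus/Pedro_Lab_XTTS"  # model repo used by the previous app.py
local_dir = os.path.join(get_user_data_dir("tts"),
                         "tts_models--multilingual--multi-dataset--xtts_v2")
os.makedirs(local_dir, exist_ok=True)

# Download config, checkpoint and vocab, as the old script did
for file_name in ["config.json", "model.pth", "vocab.json"]:
    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

config = XttsConfig()
config.load_json(os.path.join(local_dir, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(local_dir, "model.pth"),
    vocab_path=os.path.join(local_dir, "vocab.json"),
    eval=True,
)

def sintetizar_voz(texto, idioma, audio_referencia, usar_microfono, audio_microfono):
    # Pick the reference audio exactly as the committed stub does
    audio_entrada = audio_microfono if usar_microfono else audio_referencia

    # Conditioning latents from the reference audio, then direct inference,
    # mirroring the predict() function removed by this commit
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=audio_entrada, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60
    )
    t0 = time.time()
    out = model.inference(
        texto,
        idioma,
        gpt_cond_latent,
        speaker_embedding,
        repetition_penalty=5.0,
        temperature=0.75,
    )
    # XTTS produces 24 kHz audio; save it so gr.Audio can play the file back
    torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
    metricas = f"Tiempo de síntesis: {time.time() - t0:.2f} s"
    return "output.wav", metricas
```

Because this sketch keeps the same signature as the committed stub, the existing `boton_sintetizar.click(...)` wiring would not need to change.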