Blakus committed on
Commit d59fc80 · verified · 1 Parent(s): 293a3de

Update app.py

Files changed (1):
  1. app.py +669 -50

app.py CHANGED
@@ -1,77 +1,696 @@
  import gradio as gr
  from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
  from TTS.utils.generic_utils import get_user_data_dir
- import os
  from huggingface_hub import hf_hub_download

- # Path configuration and model download
  repo_id = "Blakus/Pedro_Lab_XTTS"
  local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
  os.makedirs(local_dir, exist_ok=True)

  files_to_download = ["config.json", "model.pth", "vocab.json"]
  for file_name in files_to_download:
      hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

  config_path = os.path.join(local_dir, "config.json")
  checkpoint_path = os.path.join(local_dir, "model.pth")
  vocab_path = os.path.join(local_dir, "vocab.json")

- # Load the XTTS model
  config = XttsConfig()
  config.load_json(config_path)
  model = Xtts.init_from_config(config)
- model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_path, eval=True, use_deepspeed=False)

- def sintetizar_voz(texto, idioma, audio_referencia, usar_microfono, audio_microfono):
-     if usar_microfono:
-         audio_entrada = audio_microfono
      else:
-         audio_entrada = audio_referencia
- 
-     # Voice synthesis logic using the XTTS model
-     gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=audio_entrada, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
-     out = model.inference(
-         texto,
-         language=idioma,
-         gpt_cond_latent=gpt_cond_latent,
-         speaker_embedding=speaker_embedding,
-         repetition_penalty=5.0,
-         temperature=0.75,
-     )
- 
-     # Save the generated audio
-     output_path = "output.wav"
-     model.save_wav(wav=out["wav"], path=output_path)
- 
-     return output_path, f"Tiempo de generación: {out['inference_time']:.2f} segundos"
- 
- with gr.Blocks(theme=gr.themes.Base()) as demo:
-     gr.Markdown("# Sintetizador de Voz XTTS")
- 
      with gr.Row():
          with gr.Column():
-             texto_entrada = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
-             idioma = gr.Dropdown(label="Idioma", choices=config.languages, value="es")
-             audio_referencia = gr.Audio(label="Audio de referencia", type="filepath")
-             usar_microfono = gr.Checkbox(label="Usar micrófono")
-             audio_microfono = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)
- 
-             usar_microfono.change(fn=lambda x: gr.update(visible=x), inputs=[usar_microfono], outputs=[audio_microfono])
- 
-             boton_sintetizar = gr.Button("Sintetizar")
- 
          with gr.Column():
-             audio_salida = gr.Audio(label="Audio sintetizado")
-             waveform = gr.Image(label="Forma de onda")
-             metricas = gr.Textbox(label="Métricas")
- 
-     boton_sintetizar.click(
-         sintetizar_voz,
-         inputs=[texto_entrada, idioma, audio_referencia, usar_microfono, audio_microfono],
-         outputs=[audio_salida, metricas]
-     )
- 
- demo.launch()
+ import sys
+ import io, os, stat
+ import subprocess
+ import random
+ from zipfile import ZipFile
+ import uuid
+ import time
+ import torch
+ import torchaudio
+
+ # Download the unidic dictionary needed by MeCab (Japanese text processing)
+ os.system('python -m unidic download')
+
+ # By using XTTS you agree to the CPML license: https://coqui.ai/cpml
+ os.environ["COQUI_TOS_AGREED"] = "1"
+
+ # langid is used to detect the language of longer text.
+ # Most users expect the text to be in their own language; there is a checkbox to disable detection.
+ import langid
+ import base64
+ import csv
+ from io import StringIO
+ import datetime
+ import re
+
  import gradio as gr
+ from scipy.io.wavfile import write
+ from pydub import AudioSegment
+
  from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
  from TTS.utils.generic_utils import get_user_data_dir
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ from huggingface_hub import HfApi  # needed by the Space restart and error-upload paths below
+ api = HfApi(token=HF_TOKEN)
+
  from huggingface_hub import hf_hub_download
+ import os
+ from TTS.utils.manage import get_user_data_dir

+ # Download the model files from the Hugging Face Hub
  repo_id = "Blakus/Pedro_Lab_XTTS"
  local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
+
+ # Create the directory if it does not exist
  os.makedirs(local_dir, exist_ok=True)

+ # List of required files
  files_to_download = ["config.json", "model.pth", "vocab.json"]
+
+ # Download each file from the repository
  for file_name in files_to_download:
+     print(f"Downloading {file_name} from {repo_id}")
+     local_file_path = os.path.join(local_dir, file_name)
      hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir)

+ # Load configuration and model
  config_path = os.path.join(local_dir, "config.json")
  checkpoint_path = os.path.join(local_dir, "model.pth")
  vocab_path = os.path.join(local_dir, "vocab.json")

  config = XttsConfig()
  config.load_json(config_path)
+
  model = Xtts.init_from_config(config)
+ model.load_checkpoint(
+     config,
+     checkpoint_path=checkpoint_path,
+     vocab_path=vocab_path,
+     eval=True,
+     use_deepspeed=False,  # Disable DeepSpeed for CPU execution
+ )
+
+ # Do not move the model to the GPU
+ print("Model loaded on CPU")
+
+ # This is for debugging purposes only
+ DEVICE_ASSERT_DETECTED = 0
+ DEVICE_ASSERT_PROMPT = None
+ DEVICE_ASSERT_LANG = None
+
+ supported_languages = config.languages
+
+ def predict(
+     prompt,
+     language,
+     audio_file_pth,
+     mic_file_path,
+     use_mic,
+     voice_cleanup,
+     no_lang_auto_detect,
+     agree,
+ ):
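+     # Returns (waveform video, output wav path, metrics text, reference wav used),
+     # or four Nones when validation or synthesis fails.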
+     if agree == True:
+         if language not in supported_languages:
+             gr.Warning(
+                 f"The language you selected ({language}) is not in our supported languages; please choose one from the dropdown."
+             )
+
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+
+         language_predicted = langid.classify(prompt)[
+             0
+         ].strip()  # strip needed as there is a space at the end!
+
+         # tts expects Chinese as zh-cn
+         if language_predicted == "zh":
+             # we use zh-cn
+             language_predicted = "zh-cn"
+
+         print(f"Detected language:{language_predicted}, Chosen language:{language}")
+
+         # Trigger language detection once the text is longer than 15 characters
+         if len(prompt) > 15:
+             # allow any language for short text as some may be common
+             # If the user unchecks language auto-detection it will not trigger
+             # You may remove this completely for your own use
+             if language_predicted != language and not no_lang_auto_detect:
+                 # Please duplicate and remove this check if you really want this
+                 # Or the auto-detector fails to identify the language (which it can on pretty short or mixed text)
+                 gr.Warning(
+                     f"It looks like your text is not in the language you chose. If you are sure it is, please check the 'disable language auto-detection' checkbox."
+                 )
+
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+         if use_mic == True:
+             if mic_file_path is not None:
+                 speaker_wav = mic_file_path
+             else:
+                 gr.Warning(
+                     "Please record your voice with the microphone, or uncheck Use Microphone to use the reference audios"
+                 )
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )
+
+         else:
+             speaker_wav = audio_file_pth
+
+         # Filtering for microphone input, as it has background noise and possibly silence at the beginning and end
+         # This is fast filtering, not perfect
+
+         # Apply all on demand
+         lowpassfilter = denoise = trim = loudness = True
+
+         if lowpassfilter:
+             lowpass_highpass = "lowpass=8000,highpass=75,"
+         else:
+             lowpass_highpass = ""
+
+         if trim:
+             # better to remove silence at the beginning and end for microphone input
+             trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+         else:
+             trim_silence = ""
+
+         if voice_cleanup:
+             try:
+                 out_filename = (
+                     speaker_wav + str(uuid.uuid4()) + ".wav"
+                 )  # so ffmpeg knows the output format
+
+                 # we use a newer ffmpeg build since it has the afftdn denoise filter
+                 shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
+                     " "
+                 )
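+                 # NOTE: splitting on spaces assumes the reference and output paths contain no spaces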
+
+                 command_result = subprocess.run(
+                     [item for item in shell_command],
+                     capture_output=False,
+                     text=True,
+                     check=True,
+                 )
+                 speaker_wav = out_filename
+                 print("Filtered microphone input")
+             except subprocess.CalledProcessError:
+                 # There was an error - the command exited with a non-zero code
+                 print("Error: failed filtering, using original microphone input")
+         else:
+             speaker_wav = speaker_wav
+
+         if len(prompt) < 2:
+             gr.Warning("Please give a longer prompt text")
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         if len(prompt) > 200:
+             gr.Warning(
+                 "Text length is limited to 200 characters for this demo, please try a shorter text. You can clone this space and edit the code for your own usage"
+             )
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         global DEVICE_ASSERT_DETECTED
+         if DEVICE_ASSERT_DETECTED:
+             global DEVICE_ASSERT_PROMPT
+             global DEVICE_ASSERT_LANG
+             # It will likely never get here, as we now restart the Space on the first unrecoverable error
+             print(
+                 f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
+             )
+
+             # HF Space specific: this error is unrecoverable, we need to restart the Space
+             space = api.get_space_runtime(repo_id=repo_id)
+             if space.stage != "BUILDING":
+                 api.restart_space(repo_id=repo_id)
+             else:
+                 print("TRIED TO RESTART but space is building")
+
+         try:
+             metrics_text = ""
+             t_latent = time.time()
+
+             # note: diffusion_conditioning is not used with hifigan (the default mode); it will be empty but still needs to be passed to model.inference
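+             # Compute speaker conditioning (GPT latents + speaker embedding) from the reference audio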
+             try:
+                 (
+                     gpt_cond_latent,
+                     speaker_embedding,
+                 ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
+             except Exception as e:
+                 print("Speaker encoding error", str(e))
+                 gr.Warning(
+                     "It appears something is wrong with the reference audio, did you unmute your microphone?"
+                 )
+                 return (
+                     None,
+                     None,
+                     None,
+                     None,
+                 )

+             latent_calculation_time = time.time() - t_latent
+             # metrics_text = f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
+             # temporary comma fix: insert a space before sentence-final punctuation (., 。, ?) and double it
+             prompt = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
+
+             wav_chunks = []
+             ## Direct mode
+
+             print("I: Generating new audio...")
+             t0 = time.time()
+             out = model.inference(
+                 prompt,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 repetition_penalty=5.0,
+                 temperature=0.75,
+             )
+             inference_time = time.time() - t0
+             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+             metrics_text += f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+             real_time_factor = (time.time() - t0) / out['wav'].shape[-1] * 24000
+             print(f"Real-time factor (RTF): {real_time_factor}")
+             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+
+             # Streaming mode, kept for reference but disabled in this demo
+             """
+             print("I: Generating new audio in streaming mode...")
+             t0 = time.time()
+             chunks = model.inference_stream(
+                 prompt,
+                 language,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 repetition_penalty=7.0,
+                 temperature=0.85,
+             )
+             first_chunk = True
+             for i, chunk in enumerate(chunks):
+                 if first_chunk:
+                     first_chunk_time = time.time() - t0
+                     metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                     first_chunk = False
+                 wav_chunks.append(chunk)
+                 print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+             inference_time = time.time() - t0
+             print(
+                 f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
+             )
+             #metrics_text += (
+             #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+             #)
+             wav = torch.cat(wav_chunks, dim=0)
+             print(wav.shape)
+             real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
+             print(f"Real-time factor (RTF): {real_time_factor}")
+             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+             torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+             """
+
315
+ if "device-side assert" in str(e):
316
+ # cannot do anything on cuda device side error, need tor estart
317
+ print(
318
+ f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
319
+ flush=True,
320
+ )
321
+ gr.Warning("Unhandled Exception encounter, please retry in a minute")
322
+ print("Cuda device-assert Runtime encountered need restart")
323
+ if not DEVICE_ASSERT_DETECTED:
324
+ DEVICE_ASSERT_DETECTED = 1
325
+ DEVICE_ASSERT_PROMPT = prompt
326
+ DEVICE_ASSERT_LANG = language
327
+
328
+ # just before restarting save what caused the issue so we can handle it in future
329
+ # Uploading Error data only happens for unrecovarable error
330
+ error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
331
+ error_data = [
332
+ error_time,
333
+ prompt,
334
+ language,
335
+ audio_file_pth,
336
+ mic_file_path,
337
+ use_mic,
338
+ voice_cleanup,
339
+ no_lang_auto_detect,
340
+ agree,
341
+ ]
342
+ error_data = [str(e) if type(e) != str else e for e in error_data]
343
+ print(error_data)
344
+ print(speaker_wav)
345
+ write_io = StringIO()
346
+ csv.writer(write_io).writerows([error_data])
347
+ csv_upload = write_io.getvalue().encode()
348
+
349
+ filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
350
+ print("Writing error csv")
351
+ error_api = HfApi()
352
+ error_api.upload_file(
353
+ path_or_fileobj=csv_upload,
354
+ path_in_repo=filename,
355
+ repo_id="coqui/xtts-flagged-dataset",
356
+ repo_type="dataset",
357
+ )
358
+
359
+ # speaker_wav
360
+ print("Writing error reference audio")
361
+ speaker_filename = (
362
+ error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
363
+ )
364
+ error_api = HfApi()
365
+ error_api.upload_file(
366
+ path_or_fileobj=speaker_wav,
367
+ path_in_repo=speaker_filename,
368
+ repo_id="coqui/xtts-flagged-dataset",
369
+ repo_type="dataset",
370
+ )
371
+
372
+ # HF Space specific.. This error is unrecoverable need to restart space
373
+ space = api.get_space_runtime(repo_id=repo_id)
374
+ if space.stage!="BUILDING":
375
+ api.restart_space(repo_id=repo_id)
376
+ else:
377
+ print("TRIED TO RESTART but space is building")
378
+
379
+ else:
380
+ if "Failed to decode" in str(e):
381
+ print("Speaker encoding error", str(e))
382
+ gr.Warning(
383
+ "It appears something wrong with reference, did you unmute your microphone?"
384
+ )
385
+ else:
386
+ print("RuntimeError: non device-side assert error:", str(e))
387
+ gr.Warning("Something unexpected happened please retry again.")
388
+ return (
389
+ None,
390
+ None,
391
+ None,
392
+ None,
393
+ )
394
+ return (
395
+ gr.make_waveform(
396
+ audio="output.wav",
397
+ ),
398
+ "output.wav",
399
+ metrics_text,
400
+ speaker_wav,
401
+ )
402
  else:
403
+ gr.Warning("Please accept the Terms & Condition!")
404
+ return (
405
+ None,
406
+ None,
407
+ None,
408
+ None,
409
+ )
410
+
411
+
412
+ title = "Coqui🐸 XTTS"
413
+
414
+ description = """
415
+ <br/>
416
+ This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
417
+ <br/>
418
+ Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
419
+ <br/>
420
+ """
421
+
422
+ links = """
423
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
424
+ | | |
425
+ | ------------------------------- | --------------------------------------- |
426
+ | 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
427
+ | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
428
+ | 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
429
+ | 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
430
+ """
431
+
432
+ article = """
433
+ <div style='margin:20px auto;'>
434
+ <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
435
+ <p>We collect data only for error cases for improvement.</p>
436
+ </div>
437
+ """
+ examples = [
+     ["Once when I was six years old I saw a magnificent picture", "en", "examples/female.wav", None, False, False, False, True],
+     ["Lorsque j'avais six ans j'ai vu, une fois, une magnifique image", "fr", "examples/male.wav", None, False, False, False, True],
+     ["Als ich sechs war, sah ich einmal ein wunderbares Bild", "de", "examples/female.wav", None, False, False, False, True],
+     ["Cuando tenía seis años, vi una vez una imagen magnífica", "es", "examples/male.wav", None, False, False, False, True],
+     ["Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica", "pt", "examples/female.wav", None, False, False, False, True],
+     ["Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek", "pl", "examples/male.wav", None, False, False, False, True],
+     ["Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno", "it", "examples/female.wav", None, False, False, False, True],
+     ["Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm", "tr", "examples/female.wav", None, False, False, False, True],
+     ["Когда мне было шесть лет, я увидел однажды удивительную картинку", "ru", "examples/female.wav", None, False, False, False, True],
+     ["Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat", "nl", "examples/male.wav", None, False, False, False, True],
+     ["Když mi bylo šest let, viděl jsem jednou nádherný obrázek", "cs", "examples/female.wav", None, False, False, False, True],
+     ["当我还只有六岁的时候, 看到了一副精彩的插画", "zh-cn", "examples/female.wav", None, False, False, False, True],
+     ["かつて 六歳のとき、素晴らしい絵を見ました", "ja", "examples/female.wav", None, False, True, False, True],
+     ["한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.", "ko", "examples/female.wav", None, False, True, False, True],
+     ["Egyszer hat éves koromban láttam egy csodálatos képet", "hu", "examples/male.wav", None, False, True, False, True],
+ ]
+
+
+
+ with gr.Blocks(analytics_enabled=False) as demo:
      with gr.Row():
          with gr.Column():
+             gr.Markdown(
+                 """
+ ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+ """
+             )
+         with gr.Column():
+             # placeholder to align the image
+             pass
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(description)
+         with gr.Column():
+             gr.Markdown(links)
+
+     with gr.Row():
          with gr.Column():
+             input_text_gr = gr.Textbox(
+                 label="Text Prompt",
+                 info="One or two sentences at a time is better. Up to 200 text characters.",
+                 value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
+             )
+             language_gr = gr.Dropdown(
+                 label="Language",
+                 info="Select an output language for the synthesised speech",
+                 choices=[
+                     "en",
+                     "es",
+                     "fr",
+                     "de",
+                     "it",
+                     "pt",
+                     "pl",
+                     "tr",
+                     "ru",
+                     "nl",
+                     "cs",
+                     "ar",
+                     "zh-cn",
+                     "ja",
+                     "ko",
+                     "hu",
+                     "hi",
+                 ],
+                 max_choices=1,
+                 value="en",
+             )
+             ref_gr = gr.Audio(
+                 label="Reference Audio",
+                 info="Click on the ✎ button to upload your own target speaker audio",
+                 type="filepath",
+                 value="examples/female.wav",
+             )
+             mic_gr = gr.Audio(
+                 source="microphone",
+                 type="filepath",
+                 info="Use your microphone to record audio",
+                 label="Use Microphone for Reference",
+             )
+             use_mic_gr = gr.Checkbox(
+                 label="Use Microphone",
+                 value=False,
+                 info="Notice: Microphone input may not work properly under traffic",
+             )
+             clean_ref_gr = gr.Checkbox(
+                 label="Cleanup Reference Voice",
+                 value=False,
+                 info="This check can improve output if your microphone or reference voice is noisy",
+             )
+             auto_det_lang_gr = gr.Checkbox(
+                 label="Do not use language auto-detect",
+                 value=False,
+                 info="Check to disable language auto-detection",
+             )
+             tos_gr = gr.Checkbox(
+                 label="Agree",
+                 value=False,
+                 info="I agree to the terms of the CPML: https://coqui.ai/cpml",
+             )
+
+             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+         with gr.Column():
+             video_gr = gr.Video(label="Waveform Visual")
+             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+             out_text_gr = gr.Text(label="Metrics")
+             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+     with gr.Row():
+         gr.Examples(
+             examples,
+             label="Examples",
+             inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+             outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
+             fn=predict,
+             cache_examples=False,
+         )
+
+     tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
+
+ demo.queue()
+ demo.launch(debug=True, show_api=True)
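Because the new app launches with `show_api=True`, the `predict` endpoint can also be exercised programmatically once the Space is running. Below is a minimal sketch using the `gradio_client` package; the Space name is hypothetical and the call assumes the endpoint keeps the eight inputs in the order shown above.

    from gradio_client import Client

    # Hypothetical Space name; replace with the actual repo that hosts this demo.
    client = Client("Blakus/pedro-xtts-demo")

    result = client.predict(
        "Hola, esta es una prueba del clon de voz.",  # text prompt
        "es",                    # language code
        "examples/female.wav",   # reference audio path
        None,                    # microphone recording (unused)
        False,                   # use microphone
        False,                   # cleanup reference voice
        False,                   # disable language auto-detect
        True,                    # agree to the CPML
        api_name="/predict",
    )
    print(result)  # (waveform video, wav path, metrics text, reference wav used)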