antoniomae committed on
Commit
75244eb
·
verified ·
1 Parent(s): bf1392c

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +12 -0
  2. app.py +563 -0
  3. gitattributes +36 -0
  4. index.html +15 -0
  5. packages.txt +1 -0
  6. requirements.txt +62 -0
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: XTTS-streaming
3
+ emoji: 🐸
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: static
7
+ pinned: false
8
+ models:
9
+ - coqui/XTTS-v1
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# XTTS streaming demo — module-level setup: environment flags, bundled
# ffmpeg binary, model download, and checkpoint loading.

import sys
import io, os, stat
import subprocess
import random
from zipfile import ZipFile
import uuid

import time
import torch
import torchaudio

# By using XTTS you agree to the CPML license: https://coqui.ai/cpml
# (must be set before the TTS model download is triggered).
os.environ["COQUI_TOS_AGREED"] = "1"

# langid detects the language of longer prompts; most users expect the text
# to be their own language, and a UI checkbox can disable the check.
import langid

import base64
import csv
from io import StringIO
import datetime

import gradio as gr
from scipy.io.wavfile import write
from pydub import AudioSegment

from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import HfApi

# The HF API client is used to restart this Space on an unrecoverable error.
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts-streaming"

# Unpack a newer ffmpeg binary (bundled in ffmpeg.zip) for Ubuntu 20 so the
# denoise filtering for microphone input is available.
print("Export newer ffmpeg binary for denoise filter")
ZipFile("ffmpeg.zip").extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

# Trigger the model download (no-op when already cached).
print("Downloading if not downloaded Coqui XTTS V1.1")
from TTS.utils.manage import ModelManager

model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")

# Load config + checkpoint and move the model to GPU.
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
model.cuda()

# Make sure Japanese is advertised even if the shipped config omits it.
if "ja" not in config.languages:
    config.languages.append("ja")

# Debugging state for CUDA device-side asserts (unrecoverable errors).
DEVICE_ASSERT_DETECTED = 0
DEVICE_ASSERT_PROMPT = None
DEVICE_ASSERT_LANG = None

supported_languages = config.languages
def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
    """Synthesise ``prompt`` in ``language`` with the reference speaker's voice.

    Generator consumed by the Gradio interface. Yields tuples matching the
    four outputs: (waveform_video, streamed_audio_chunk_path, metrics_text,
    reference_audio_used) — one tuple per streamed chunk and a final one
    with the combined waveform. On any validation failure it yields a
    4-tuple of Nones (the original early ``return (None, ...)`` statements
    were discarded inside the generator and had mismatched arity).
    """
    global DEVICE_ASSERT_DETECTED
    global DEVICE_ASSERT_PROMPT
    global DEVICE_ASSERT_LANG

    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        yield (None, None, None, None)
        return

    if language not in supported_languages:
        # NOTE: original message repeated "is not in" — fixed.
        gr.Warning("Language you put in is not in our Supported Languages, please choose from dropdown")
        yield (None, None, None, None)
        return

    language_predicted = langid.classify(prompt)[0].strip()  # strip needed: langid leaves a trailing space

    # langid reports Chinese as "zh"; the TTS model expects "zh-cn".
    if language_predicted == "zh":
        language_predicted = "zh-cn"
    print(f"Detected language:{language_predicted}, Chosen language:{language}")

    # Only run the auto-detect check on text longer than 15 characters —
    # short snippets are too ambiguous for reliable language identification.
    # The user can opt out via the "disable auto-detect" checkbox.
    if len(prompt) > 15:
        if language_predicted != language and not no_lang_auto_detect:
            gr.Warning(f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox" )
            yield (None, None, None, None)
            return

    if use_mic:
        if mic_file_path is None:
            gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
            yield (None, None, None, None)
            return
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    # Fast (not perfect) ffmpeg cleanup for microphone input: band-pass to
    # drop rumble/hiss and trim leading/trailing silence. All applied on demand.
    lowpassfilter = denoise = trim = loudness = True

    if lowpassfilter:
        lowpass_highpass = "lowpass=8000,highpass=75,"
    else:
        lowpass_highpass = ""

    if trim:
        # better to remove silence in beginning and end for microphone
        trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    else:
        trim_silence = ""

    if voice_cleanup:
        try:
            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # .wav suffix tells ffmpeg the output format

            # Use the bundled newer ffmpeg (has the denoise-capable filters).
            # BUGFIX: pass the command as an argument list instead of
            # splitting a formatted string on spaces — the old approach
            # broke whenever the input path contained a space.
            shell_command = [
                "./ffmpeg", "-y",
                "-i", speaker_wav,
                "-af", f"{lowpass_highpass}{trim_silence}",
                out_filename,
            ]
            subprocess.run(shell_command, capture_output=False, text=True, check=True)
            speaker_wav = out_filename
            print("Filtered microphone input")
        except subprocess.CalledProcessError:
            # Command exited non-zero — fall back to the raw reference.
            print("Error: failed filtering, use original microphone input")

    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        yield (None, None, None, None)
        return
    if len(prompt) > 200:
        gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
        yield (None, None, None, None)
        return

    if DEVICE_ASSERT_DETECTED:
        # Unlikely to be reached: the Space restarts on the first
        # unrecoverable error; kept for debugging.
        print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")

    metrics_text = ""

    try:
        t_latent = time.time()
        try:
            gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
        except Exception as e:
            print("Speaker encoding error", str(e))
            gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
            yield (None, None, None, None)
            return

        latent_calculation_time = time.time() - t_latent

        wav_chunks = []
        t_inference = time.time()

        chunks = model.inference_stream(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
        )

        first_chunk = True
        for i, chunk in enumerate(chunks):
            if first_chunk:
                # Time-to-first-chunk is the headline streaming metric.
                first_chunk_time = time.time() - t_inference
                metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                first_chunk = False

            wav_chunks.append(chunk)
            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")

            # Write the chunk at the model's 24 kHz rate and re-export via
            # pydub so the streamed file has clean WAV headers.
            out_file = f"{i}.wav"
            write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
            audio = AudioSegment.from_file(out_file)
            audio.export(out_file, format="wav")

            yield (None, out_file, metrics_text, None)

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Nothing can be done after a CUDA device-side assert: record
            # the failing inputs for analysis, then restart the Space.
            print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            if not DEVICE_ASSERT_DETECTED:
                DEVICE_ASSERT_DETECTED = 1
                DEVICE_ASSERT_PROMPT = prompt
                DEVICE_ASSERT_LANG = language

            # Just before restarting, save what caused the issue so it can
            # be handled in future. Upload happens only for unrecoverable
            # errors. (Renamed the comprehension variable so it no longer
            # shadows the caught exception ``e``.)
            error_time = datetime.datetime.now().strftime('%d-%m-%Y-%H:%M:%S')
            error_data = [error_time, prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree]
            error_data = [str(item) if type(item) != str else item for item in error_data]
            print(error_data)
            print(speaker_wav)
            write_io = StringIO()
            csv.writer(write_io).writerows([error_data])
            csv_upload = write_io.getvalue().encode()

            filename = error_time + "_xtts-stream_" + str(uuid.uuid4()) + ".csv"
            print("Writing error csv")
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=csv_upload,
                path_in_repo=filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # Also upload the reference audio that triggered the assert.
            print("Writing error reference audio")
            speaker_filename = error_time + "_reference_xtts-stream_" + str(uuid.uuid4()) + ".wav"
            error_api.upload_file(
                path_or_fileobj=speaker_wav,
                path_in_repo=speaker_filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # HF Space specific: this error is unrecoverable, restart.
            api.restart_space(repo_id=repo_id)
        else:
            if "Failed to decode" in str(e):
                print("Speaker encoding error", str(e))
                gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
            else:
                print("RuntimeError: non device-side assert error:", str(e))
                gr.Warning("Something unexpected happened please retry again.")

        yield (None, None, None, None)
        return

    # Success: stitch all chunks into one waveform for the video output.
    wav = torch.cat(wav_chunks, dim=0)
    torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)

    second_of_silence = AudioSegment.silent()  # default: one second
    second_of_silence.export("sil.wav", format="wav")

    yield (
        gr.make_waveform(
            audio="output.wav",
        ),
        "sil.wav",
        metrics_text,
        speaker_wav,
    )
# UI copy for the Gradio interface (title, HTML description, footer article).
# The HTML is rendered verbatim by Gradio.

title = "Coqui🐸 XTTS - Streaming"

description = """
<div>
<a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
<a style='display:inline-block' href='https://discord.gg/5eXr5seRrv'><img src='https://discord.com/api/guilds/1037326658807533628/widget.png?style=shield' /></a>
<a href="https://huggingface.co/spaces/coqui/xtts-streaming?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</div>
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
<br/>
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
<br/>
This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
<br/>
Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
<br/>
<p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
<br/>

</p>
<p>Language Selectors:
Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
Russian: ru, Spanish: es, Turkish: tr, Japanese: ja <br/>
</p>
<p> Notice: Autoplay may not work on mobile, if you see black waveform image on mobile click it your Audio is there</p>
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
"""

article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
<p>We collect data only for error cases for improvement.</p>
</div>
"""
# Showcase prompts: one (text, language code, reference wav) triple per
# language. The remaining four interface inputs are identical for every
# example — mic file None, use_mic/cleanup/no-auto-detect False, agree True —
# so the full 8-field rows are expanded from the triples below.
_example_sentences = [
    ("Once when I was six years old I saw a magnificent picture", "en", "examples/female.wav"),
    ("Lorsque j'avais six ans j'ai vu, une fois, une magnifique image", "fr", "examples/male.wav"),
    ("Als ich sechs war, sah ich einmal ein wunderbares Bild", "de", "examples/female.wav"),
    ("Cuando tenía seis años, vi una vez una imagen magnífica", "es", "examples/male.wav"),
    ("Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica", "pt", "examples/female.wav"),
    ("Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek", "pl", "examples/male.wav"),
    ("Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno", "it", "examples/female.wav"),
    ("Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm", "tr", "examples/female.wav"),
    ("Когда мне было шесть лет, я увидел однажды удивительную картинку", "ru", "examples/female.wav"),
    ("Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat", "nl", "examples/male.wav"),
    ("Když mi bylo šest let, viděl jsem jednou nádherný obrázek", "cs", "examples/female.wav"),
    ("当我还只有六岁的时候, 看到了一副精彩的插画", "zh-cn", "examples/female.wav"),
    ("かつて 六歳のとき、素晴らしい絵を見ました", "ja", "examples/female.wav"),
]

examples = [
    [text, lang_code, ref_wav, None, False, False, False, True]
    for (text, lang_code, ref_wav) in _example_sentences
]
# Build and launch the Gradio demo. queue() is required for the streaming
# generator outputs; launch(debug=True) keeps logs visible in the Space.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(
            label="Text Prompt",
            info="One or two sentences at a time is better. Up to 200 text characters.",
            value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
        ),
        gr.Dropdown(
            label="Language",
            info="Select an output language for the synthesised speech",
            choices=[
                "en", "es", "fr", "de", "it", "pt", "pl",
                "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja",
            ],
            max_choices=1,
            value="en",
        ),
        gr.Audio(
            label="Reference Audio",
            info="Click on the ✎ button to upload your own target speaker audio",
            type="filepath",
            value="examples/female.wav",
        ),
        gr.Audio(
            source="microphone",
            type="filepath",
            info="Use your microphone to record audio",
            label="Use Microphone for Reference",
        ),
        gr.Checkbox(
            label="Use Microphone",
            value=False,
            info="Notice: Microphone input may not work properly under traffic",
        ),
        gr.Checkbox(
            label="Cleanup Reference Voice",
            value=False,
            info="This check can improve output if your microphone or reference voice is noisy",
        ),
        gr.Checkbox(
            label="Do not use language auto-detect",
            value=False,
            info="Check to disable language auto-detection",
        ),
        gr.Checkbox(
            label="Agree",
            value=False,
            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
        ),
    ],
    outputs=[
        gr.Video(label="Waveform Visual"),
        gr.Audio(label="Synthesised Audio", streaming=True, autoplay=True),
        gr.Text(label="Metrics"),
        gr.Audio(label="Reference Audio Used"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    cache_examples=False,
).queue().launch(debug=True, show_api=True)
gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/female.wav filter=lfs diff=lfs merge=lfs -text
index.html ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html lang="en">
<!-- Redirect visitors to the full XTTS space. The <script> redirect runs
     immediately; the meta refresh is a fallback when JavaScript is off. -->
<meta http-equiv="refresh" content="5; URL=javascript:window.open('https://huggingface.co/spaces/coqui/xtts','_parent');">
<script>window.top.location.href = "https://huggingface.co/spaces/coqui/xtts";</script>
<body>
Please use <a href="https://huggingface.co/spaces/coqui/xtts">https://huggingface.co/spaces/coqui/xtts</a> for full experience.
<!-- Full-viewport embed of the space while the redirect is pending. -->
<iframe src="https://coqui-xtts.hf.space" style="position:fixed; top:10; left:0; bottom:0; right:0; width:100%; height:100%; border:none; margin:0; padding:0; overflow:hidden; z-index:999999;">
Your browser doesn't support iframes
</iframe>
</body>
</html>
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ unzip
requirements.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Preinstall requirements from TTS
2
+ torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
3
+ torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118
4
+ torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
5
+ numpy==1.22.0;python_version<="3.10"
6
+ numpy==1.24.3;python_version>"3.10"
7
+ cython==0.29.30
8
+ scipy>=1.11.2
9
+ soundfile==0.12.*
10
+ librosa==0.10.*
11
+ scikit-learn==1.3.0
12
+ numba==0.55.1;python_version<"3.9"
13
+ numba==0.57.0;python_version>="3.9"
14
+ inflect==5.6.*
15
+ tqdm==4.64.*
16
+ anyascii==0.3.*
17
+ pyyaml==6.*
18
+ fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
19
+ aiohttp==3.8.*
20
+ packaging==23.1
21
+ # deps for examples
22
+ flask==2.*
23
+ # deps for inference
24
+ pysbd==0.3.4
25
+ # deps for notebooks
26
+ umap-learn==0.5.*
27
+ pandas>=1.4,<2.0
28
+ # deps for training
29
+ matplotlib==3.7.*
30
+ # coqui stack
31
+ trainer
32
+ # config management
33
+ coqpit>=0.0.16
34
+ # chinese g2p deps
35
+ jieba
36
+ pypinyin==0.47.1
37
+ # gruut+supported langs
38
+ gruut[de,es,fr]==2.2.3
39
+ # deps for korean
40
+ jamo
41
+ nltk
42
+ g2pkk>=0.1.1
43
+ # deps for bangla
44
+ bangla
45
+ bnnumerizer
46
+ bnunicodenormalizer
47
+ #deps for tortoise
48
+ k_diffusion
49
+ einops==0.6.*
50
+ transformers==4.33.*
51
+ #deps for bark
52
+ encodec==0.1.*
53
+ # deps for XTTS
54
+ unidecode==1.3.*
55
+ langid
56
+ # Install tts
57
+ git+https://github.com/coqui-ai/[email protected]
58
+ deepspeed==0.8.3
59
+ pydub
60
+ cutlet
61
+ mecab-python3==1.0.6
62
+ unidic-lite==1.0.8