antoniomae committed on
Commit
7d6047f
1 Parent(s): 8aecc5d

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile +28 -0
  2. README.md +12 -0
  3. app.py +518 -0
  4. build.py +17 -0
  5. gitattributes +36 -0
  6. packages.txt +1 -0
  7. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# By using XTTS you agree to CPML license https://coqui.ai/cpml
ENV COQUI_TOS_AGREED=1

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Install dependencies
COPY --chown=user:user requirements.txt .
RUN pip install -r requirements.txt
# unidic dictionary download is required by the Japanese tokenizer (mecab/cutlet)
RUN python -m unidic download

# Install model weights
COPY --chown=user:user . .
# build.py unpacks the bundled ffmpeg binary and pre-downloads the XTTS v2 model
RUN python build.py

# Print the interpreter version for the build log, then start the Gradio app
CMD ["bash", "-c", "python --version && python app.py"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: XTTS
3
+ emoji: 🐸
4
+ colorFrom: green
5
+ colorTo: red
6
+ pinned: false
7
+ sdk: docker
8
+ models:
9
+ - coqui/XTTS-v2
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio demo for Coqui XTTS v2 voice cloning (Hugging Face Space)."""
import os
import subprocess
import uuid
import time
import torch
import torchaudio

# langid is used to detect language for longer text
# Most users expect text to be their own language, there is checkbox to disable it
import langid
import csv
from io import StringIO
import datetime
import re

import gradio as gr

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

print("application starting")

# Hugging Face access token; may be None when the env var is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import HfApi

# will use api to restart space on an unrecoverable error
api = HfApi(token=HF_TOKEN)
# Space repository this app runs in (target of restart_space calls).
repo_id = "JacobLinCool/xtts-v2"

# Lazily-initialised XTTS model and its language list; set by load_model().
model = None
supported_languages = None
34
+
35
+
36
def load_model():
    """Load the XTTS v2 checkpoint into the module-level globals.

    Populates ``model`` with an initialised ``Xtts`` instance (moved to GPU
    when CUDA is available, otherwise kept on CPU) and ``supported_languages``
    with the language codes declared in the model config.
    """
    global model, supported_languages

    print("loading model")

    # The TTS downloader caches weights under the user data dir, with "/"
    # in the model name replaced by "--".
    xtts_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    weights_dir = os.path.join(
        get_user_data_dir("tts"), xtts_name.replace("/", "--")
    )

    xtts_config = XttsConfig()
    xtts_config.load_json(os.path.join(weights_dir, "config.json"))

    loaded = Xtts.init_from_config(xtts_config)
    loaded.load_checkpoint(
        xtts_config,
        checkpoint_path=os.path.join(weights_dir, "model.pth"),
        vocab_path=os.path.join(weights_dir, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )

    # Prefer GPU inference when available.
    move_to_device = loaded.cuda if torch.cuda.is_available() else loaded.cpu
    move_to_device()

    model = loaded
    supported_languages = xtts_config.languages

    print("Model loaded")
65
+
66
+
67
# This is for debugging purposes only: remember the first CUDA device-side
# assert so the offending inputs can be reported before the space restarts.
DEVICE_ASSERT_DETECTED = 0  # set to 1 once a device-side assert has been seen
DEVICE_ASSERT_PROMPT = None  # prompt text that triggered the assert
DEVICE_ASSERT_LANG = None  # language code that triggered the assert
71
+
72
+
73
def predict(
    prompt,
    language,
    audio_file_pth,
    voice_cleanup,
    no_lang_auto_detect,
    agree,
):
    """Synthesise `prompt` in `language`, cloning the voice in `audio_file_pth`.

    Parameters map positionally onto the Gradio inputs: text prompt, language
    code, reference-audio file path, cleanup checkbox, "disable language
    auto-detect" checkbox, and the terms-of-service checkbox.

    Returns a 4-tuple (waveform video, output wav path, metrics text,
    reference wav actually used); all four are None when validation fails.
    """
    # Lazy model initialisation on first request.
    if model is None:
        load_model()

    # Guard clause (was `if agree == True:` wrapping the whole body).
    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        return (None, None, None, None)

    if language not in supported_languages:
        # FIX: original message contained a duplicated "is not in is not in".
        gr.Warning(
            f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
        )
        return (None, None, None, None)

    language_predicted = langid.classify(prompt)[
        0
    ].strip()  # strip need as there is space at end!

    # tts expects chinese as zh-cn
    if language_predicted == "zh":
        language_predicted = "zh-cn"

    print(f"Detected language:{language_predicted}, Chosen language:{language}")

    # Only run the language-mismatch guard for text longer than 15 chars;
    # very short snippets are often ambiguous across languages.
    if len(prompt) > 15:
        # If user unchecks language autodetection it will not trigger.
        # The auto-detector can fail on short or mixed-language text.
        if language_predicted != language and not no_lang_auto_detect:
            gr.Warning(
                f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
            )
            return (None, None, None, None)

    speaker_wav = audio_file_pth

    # Fast (imperfect) cleanup for microphone input: band-pass filter plus
    # silence trimming at both ends. Applied only when voice_cleanup is set.
    lowpassfilter = denoise = trim = loudness = True

    lowpass_highpass = "lowpass=8000,highpass=75," if lowpassfilter else ""

    if trim:
        # better to remove silence in beginning and end for microphone
        trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    else:
        trim_silence = ""

    if voice_cleanup:
        try:
            out_filename = (
                speaker_wav + str(uuid.uuid4()) + ".wav"
            )  # ffmpeg to know output format

            # we will use newer bundled ffmpeg as that has the denoise filter.
            # NOTE(review): splitting on spaces breaks for paths that contain
            # spaces; Gradio temp paths normally have none — confirm.
            shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
                " "
            )

            # FIX: pass the list directly instead of a redundant copy
            # ([item for item in shell_command]).
            subprocess.run(
                shell_command,
                capture_output=False,
                text=True,
                check=True,
            )
            speaker_wav = out_filename
            print("Filtered microphone input")
        except subprocess.CalledProcessError:
            # ffmpeg exited non-zero: fall back to the unfiltered reference.
            print("Error: failed filtering, use original microphone input")

    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return (None, None, None, None)
    if len(prompt) > 200:
        gr.Warning(
            "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
        )
        return (None, None, None, None)

    global DEVICE_ASSERT_DETECTED
    if DEVICE_ASSERT_DETECTED:
        global DEVICE_ASSERT_PROMPT
        global DEVICE_ASSERT_LANG
        # It will likely never come here as we restart space on first unrecoverable error now
        print(
            f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
        )

        # HF Space specific: this error is unrecoverable, restart the space
        # unless it is already rebuilding.
        space = api.get_space_runtime(repo_id=repo_id)
        if space.stage != "BUILDING":
            api.restart_space(repo_id=repo_id)
        else:
            print("TRIED TO RESTART but space is building")

    try:
        metrics_text = ""
        t_latent = time.time()

        # note diffusion_conditioning not used on hifigan (default mode)
        try:
            (
                gpt_cond_latent,
                speaker_embedding,
            ) = model.get_conditioning_latents(
                audio_path=speaker_wav, gpt_cond_len=30, max_ref_length=60
            )
        except Exception as e:
            print("Speaker encoding error", str(e))
            gr.Warning(
                "It appears something wrong with reference, did you unmute your microphone?"
            )
            return (None, None, None, None)

        latent_calculation_time = time.time() - t_latent  # kept for metrics

        # temporary comma fix: double sentence-final punctuation so the model
        # pauses correctly after non-ASCII / word characters
        prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)

        wav_chunks = []

        print("I: Generating new audio in streaming mode...")
        t0 = time.time()
        chunks = model.inference_stream(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=7.0,
            temperature=0.85,
        )

        first_chunk = True
        for i, chunk in enumerate(chunks):
            if first_chunk:
                first_chunk_time = time.time() - t0
                metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                first_chunk = False
            wav_chunks.append(chunk)
            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        inference_time = time.time() - t0
        print(
            f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
        )

        wav = torch.cat(wav_chunks, dim=0)
        print(wav.shape)
        # Real-time factor relative to the model's 24 kHz output rate.
        real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
        print(f"Real-time factor (RTF): {real_time_factor}")
        metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"

        torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # cannot do anything on cuda device side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            if not DEVICE_ASSERT_DETECTED:
                DEVICE_ASSERT_DETECTED = 1
                DEVICE_ASSERT_PROMPT = prompt
                DEVICE_ASSERT_LANG = language

            # Just before restarting, upload the offending inputs so the
            # failure can be analysed later. Uploading error data only
            # happens for unrecoverable errors.
            error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
            error_data = [
                error_time,
                prompt,
                language,
                audio_file_pth,
                voice_cleanup,
                no_lang_auto_detect,
                agree,
            ]
            # FIX: loop variable renamed so it no longer shadows the
            # exception variable `e`.
            error_data = [str(v) if type(v) != str else v for v in error_data]
            print(error_data)
            print(speaker_wav)
            write_io = StringIO()
            csv.writer(write_io).writerows([error_data])
            csv_upload = write_io.getvalue().encode()

            filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
            print("Writing error csv")
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=csv_upload,
                path_in_repo=filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # Upload the reference audio that triggered the failure as well.
            print("Writing error reference audio")
            speaker_filename = (
                error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
            )
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=speaker_wav,
                path_in_repo=speaker_filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # HF Space specific: restart unless the space is mid-build.
            space = api.get_space_runtime(repo_id=repo_id)
            if space.stage != "BUILDING":
                api.restart_space(repo_id=repo_id)
            else:
                print("TRIED TO RESTART but space is building")

        else:
            if "Failed to decode" in str(e):
                print("Speaker encoding error", str(e))
                gr.Warning(
                    "It appears something wrong with reference, did you unmute your microphone?"
                )
            else:
                print("RuntimeError: non device-side assert error:", str(e))
                gr.Warning("Something unexpected happened please retry again.")
            return (None, None, None, None)

    return (
        gr.make_waveform(
            audio="output.wav",
        ),
        "output.wav",
        metrics_text,
        speaker_wav,
    )
384
+
385
+
386
# Page title (not referenced in the UI layout below).
title = "Coqui🐸 XTTS"

# Intro markdown/HTML rendered in the left column of the header row.
description = """

<br/>

<a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a text-to-speech model that lets you clone voices into different languages.

<br/>

This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.

<br/>

There are 16 languages.

<p>
Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
</p>

<br/>

Leave a star 🌟 on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.

<br/>
"""

# Analytics pixel plus community/documentation links, rendered in the right
# column as a markdown table.
links = """
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />

| | |
| ------------------------------- | --------------------------------------- |
| 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
| 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
| 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |


"""

# License / data-collection notice (not referenced in the UI layout below).
article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
<p>We collect data only for error cases for improvement.</p>
</div>
"""
432
+
433
# ---- UI layout: header row, description row, then inputs | outputs ----
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            # Logo banner.
            gr.Markdown(
                """
                ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
                """
            )
        with gr.Column():
            # placeholder to align the image
            pass

    with gr.Row():
        with gr.Column():
            gr.Markdown(description)
        with gr.Column():
            gr.Markdown(links)

    with gr.Row():
        # Left column: all inputs, in the order predict() expects them.
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
            )
            language_gr = gr.Dropdown(
                label="Language",
                info="Select an output language for the synthesised speech",
                choices=[
                    "en",
                    "es",
                    "fr",
                    "de",
                    "it",
                    "pt",
                    "pl",
                    "tr",
                    "ru",
                    "nl",
                    "cs",
                    "ar",
                    "zh-cn",
                    "ja",
                    "ko",
                    "hu",
                ],
                value="en",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/female.wav",
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
                info="This check can improve output if your microphone or reference voice is noisy",
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Do not use language auto-detect",
                value=False,
                info="Check to disable language auto-detection",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        # Right column: outputs filled in by predict().
        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            out_text_gr = gr.Text(label="Metrics")
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    # Wire the Send button: input components map positionally onto
    # predict(prompt, language, audio_file_pth, voice_cleanup,
    #         no_lang_auto_detect, agree).
    tts_button.click(
        predict,
        [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
    )

print("Starting server")
demo.queue().launch(debug=True, show_api=True)
build.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build-time setup: unpack the bundled ffmpeg binary and pre-download XTTS v2."""
import os, stat
from zipfile import ZipFile

# Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
print("Export newer ffmpeg binary for denoise filter")
ZipFile("ffmpeg.zip").extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
# Add the owner-execute bit so the extracted binary can be invoked as ./ffmpeg
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

# This will trigger downloading model
print("Downloading if not downloaded Coqui XTTS V2")
from TTS.utils.manage import ModelManager  # deferred: TTS import is heavy

model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
print("XTTS downloaded")
gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/female.wav filter=lfs diff=lfs merge=lfs -text
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ unzip
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Preinstall requirements from TTS
2
+ TTS @ git+https://github.com/coqui-ai/[email protected]
3
+ pydantic==1.10.13
4
+ python-multipart==0.0.6
5
+ typing-extensions>=4.8.0
6
+ cutlet
7
+ mecab-python3==1.0.6
8
+ unidic-lite==1.0.8
9
+ unidic==1.1.0
10
+ langid
11
+ pydub
12
+ gradio