Spaces:
Running
on
Zero
Running
on
Zero
New TTS: LLaSA 1B added; LLaSA 3B and 8B disabled (runtime error); MaskGCT disabled (too demanding on ZeroGPU)
Browse files
- app/models.py +34 -14
app/models.py
CHANGED
@@ -68,7 +68,7 @@ AVAILABLE_MODELS = {
|
|
68 |
|
69 |
# MaskGCT (by Amphion)
|
70 |
# 'amphion/maskgct': 'amphion/maskgct', # DEMANDS 300 seconds of ZeroGPU!
|
71 |
-
'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab', # DEMANDS 300 seconds of ZeroGPU!
|
72 |
|
73 |
# GPT-SoVITS
|
74 |
'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
|
@@ -79,10 +79,12 @@ AVAILABLE_MODELS = {
|
|
79 |
# OuteTTS 1B
|
80 |
# 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
|
81 |
|
|
|
|
|
82 |
# llasa 3b TTS
|
83 |
-
'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts',
|
84 |
# llasa 8b TTS
|
85 |
-
'srinivasbilla/llasa-8b-tts': 'srinivasbilla/llasa-8b-tts',
|
86 |
|
87 |
# Mars5
|
88 |
# 'CAMB-AI/mars5_space': 'CAMB-AI/mars5_space', # slow inference; Unstable
|
@@ -255,7 +257,8 @@ HF_SPACES = {
|
|
255 |
'text_param_index': 'gen_text_input',
|
256 |
'return_audio_index': 0,
|
257 |
'is_zero_gpu_space': True,
|
258 |
-
'series': '
|
|
|
259 |
},
|
260 |
|
261 |
# E2 TTS TODO: call switch model function
|
@@ -265,7 +268,8 @@ HF_SPACES = {
|
|
265 |
'text_param_index': 'gen_text_input',
|
266 |
'return_audio_index': 0,
|
267 |
'is_zero_gpu_space': True,
|
268 |
-
'series': '
|
|
|
269 |
},
|
270 |
|
271 |
# IMS-Toucan
|
@@ -338,7 +342,7 @@ HF_SPACES = {
|
|
338 |
'return_audio_index': 0,
|
339 |
'is_zero_gpu_space': True,
|
340 |
'series': 'MaskGCT',
|
341 |
-
|
342 |
},
|
343 |
'Svngoku/maskgct-audio-lab': {
|
344 |
'name': 'MaskGCT',
|
@@ -347,7 +351,7 @@ HF_SPACES = {
|
|
347 |
'return_audio_index': 0,
|
348 |
'is_zero_gpu_space': True,
|
349 |
'series': 'MaskGCT',
|
350 |
-
|
351 |
},
|
352 |
|
353 |
# GPT-SoVITS v2
|
@@ -362,7 +366,7 @@ HF_SPACES = {
|
|
362 |
|
363 |
# OuteTTS v0.2 500M
|
364 |
'ameerazam08/OuteTTS-0.2-500M-Demo': {
|
365 |
-
'name': 'OuteTTS
|
366 |
'function': '/generate_tts',
|
367 |
'text_param_index': 0,
|
368 |
'return_audio_index': 0,
|
@@ -372,7 +376,7 @@ HF_SPACES = {
|
|
372 |
},
|
373 |
# OuteTTS v0.3 1B
|
374 |
'OuteAI/OuteTTS-0.3-1B-Demo': {
|
375 |
-
'name': 'OuteTTS
|
376 |
'function': '/generate_tts',
|
377 |
'text_param_index': 'text',
|
378 |
'return_audio_index': 0,
|
@@ -381,6 +385,17 @@ HF_SPACES = {
|
|
381 |
'emoji': '🥵', # requires 300s reserved ZeroGPU!
|
382 |
},
|
383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
# LlaSa 3B
|
385 |
'srinivasbilla/llasa-3b-tts': {
|
386 |
'name': 'LLaSA 3B',
|
@@ -388,7 +403,8 @@ HF_SPACES = {
|
|
388 |
'text_param_index': 'target_text',
|
389 |
'return_audio_index': 0,
|
390 |
'is_zero_gpu_space': True,
|
391 |
-
'series': '
|
|
|
392 |
},
|
393 |
|
394 |
# LlaSa 8B
|
@@ -398,7 +414,8 @@ HF_SPACES = {
|
|
398 |
'text_param_index': 'target_text',
|
399 |
'return_audio_index': 0,
|
400 |
'is_zero_gpu_space': True,
|
401 |
-
'series': '
|
|
|
402 |
},
|
403 |
|
404 |
# Mars5
|
@@ -648,11 +665,14 @@ OVERRIDE_INPUTS = {
|
|
648 |
'speaker_selection': "en_female_1",
|
649 |
'reference_audio': None,
|
650 |
},
|
|
|
|
|
|
|
651 |
'srinivasbilla/llasa-3b-tts': {
|
652 |
-
'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3')
|
653 |
},
|
654 |
'srinivasbilla/llasa-8b-tts': {
|
655 |
-
'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3')
|
656 |
},
|
657 |
|
658 |
# MARS 5
|
@@ -774,7 +794,7 @@ closed_source = [
|
|
774 |
]
|
775 |
|
776 |
# top five models in order to always have one of them picked and scrutinized
|
777 |
-
top_five = ['
|
778 |
|
779 |
# prioritize low vote models
|
780 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|
|
|
68 |
|
69 |
# MaskGCT (by Amphion)
|
70 |
# 'amphion/maskgct': 'amphion/maskgct', # DEMANDS 300 seconds of ZeroGPU!
|
71 |
+
# 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab', # DEMANDS 300 seconds of ZeroGPU!
|
72 |
|
73 |
# GPT-SoVITS
|
74 |
'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
|
|
|
79 |
# OuteTTS 1B
|
80 |
# 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
|
81 |
|
82 |
+
# llasa 1b TTS
|
83 |
+
'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': 'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers',
|
84 |
# llasa 3b TTS
|
85 |
+
# 'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts', # ZeroGPU Pro account expired
|
86 |
# llasa 8b TTS
|
87 |
+
# 'srinivasbilla/llasa-8b-tts': 'srinivasbilla/llasa-8b-tts', # ZeroGPU Pro account expired
|
88 |
|
89 |
# Mars5
|
90 |
# 'CAMB-AI/mars5_space': 'CAMB-AI/mars5_space', # slow inference; Unstable
|
|
|
257 |
'text_param_index': 'gen_text_input',
|
258 |
'return_audio_index': 0,
|
259 |
'is_zero_gpu_space': True,
|
260 |
+
# 'series': 'E2 TTS',
|
261 |
+
'series': 'E2/F5 TTS',
|
262 |
},
|
263 |
|
264 |
# E2 TTS TODO: call switch model function
|
|
|
268 |
'text_param_index': 'gen_text_input',
|
269 |
'return_audio_index': 0,
|
270 |
'is_zero_gpu_space': True,
|
271 |
+
# 'series': 'F5 TTS',
|
272 |
+
'series': 'E2/F5 TTS',
|
273 |
},
|
274 |
|
275 |
# IMS-Toucan
|
|
|
342 |
'return_audio_index': 0,
|
343 |
'is_zero_gpu_space': True,
|
344 |
'series': 'MaskGCT',
|
345 |
+
'emoji': '🥵', # requires 300s reserved ZeroGPU!
|
346 |
},
|
347 |
'Svngoku/maskgct-audio-lab': {
|
348 |
'name': 'MaskGCT',
|
|
|
351 |
'return_audio_index': 0,
|
352 |
'is_zero_gpu_space': True,
|
353 |
'series': 'MaskGCT',
|
354 |
+
'emoji': '🥵', # requires 300s reserved ZeroGPU!
|
355 |
},
|
356 |
|
357 |
# GPT-SoVITS v2
|
|
|
366 |
|
367 |
# OuteTTS v0.2 500M
|
368 |
'ameerazam08/OuteTTS-0.2-500M-Demo': {
|
369 |
+
'name': 'OuteTTS v0.2 500M',
|
370 |
'function': '/generate_tts',
|
371 |
'text_param_index': 0,
|
372 |
'return_audio_index': 0,
|
|
|
376 |
},
|
377 |
# OuteTTS v0.3 1B
|
378 |
'OuteAI/OuteTTS-0.3-1B-Demo': {
|
379 |
+
'name': 'OuteTTS v0.3 1B',
|
380 |
'function': '/generate_tts',
|
381 |
'text_param_index': 'text',
|
382 |
'return_audio_index': 0,
|
|
|
385 |
'emoji': '🥵', # requires 300s reserved ZeroGPU!
|
386 |
},
|
387 |
|
388 |
+
# LlaSa 1B
|
389 |
+
'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': {
|
390 |
+
'name': 'LLaSA 1B',
|
391 |
+
'function': '/predict',
|
392 |
+
'text_param_index': 'input_text',
|
393 |
+
'return_audio_index': 0,
|
394 |
+
'is_zero_gpu_space': True,
|
395 |
+
'series': 'LLaSA',
|
396 |
+
# 'emoji': '😷', # broken space
|
397 |
+
},
|
398 |
+
|
399 |
# LlaSa 3B
|
400 |
'srinivasbilla/llasa-3b-tts': {
|
401 |
'name': 'LLaSA 3B',
|
|
|
403 |
'text_param_index': 'target_text',
|
404 |
'return_audio_index': 0,
|
405 |
'is_zero_gpu_space': True,
|
406 |
+
'series': 'LLaSA',
|
407 |
+
'emoji': '😷', # broken space
|
408 |
},
|
409 |
|
410 |
# LlaSa 8B
|
|
|
414 |
'text_param_index': 'target_text',
|
415 |
'return_audio_index': 0,
|
416 |
'is_zero_gpu_space': True,
|
417 |
+
'series': 'LLaSA',
|
418 |
+
'emoji': '😷', # broken space
|
419 |
},
|
420 |
|
421 |
# Mars5
|
|
|
665 |
'speaker_selection': "en_female_1",
|
666 |
'reference_audio': None,
|
667 |
},
|
668 |
+
'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': {
|
669 |
+
'speaker_choice': 'kore',
|
670 |
+
},
|
671 |
'srinivasbilla/llasa-3b-tts': {
|
672 |
+
'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
|
673 |
},
|
674 |
'srinivasbilla/llasa-8b-tts': {
|
675 |
+
'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
|
676 |
},
|
677 |
|
678 |
# MARS 5
|
|
|
794 |
]
|
795 |
|
796 |
# top five models in order to always have one of them picked and scrutinized
|
797 |
+
top_five = ['HKUST-Audio/Llasa-1B-finetuned-for-two-speakers']
|
798 |
|
799 |
# prioritize low vote models
|
800 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|