Pendrokar committed on
Commit
7ce9101
·
1 Parent(s): e285443

New TTS: LLaSA 1B; LLaSA 3B and 8B disabled (runtime error); MaskGCT disabled (too demanding on ZeroGPU)

Browse files
Files changed (1) hide show
  1. app/models.py +34 -14
app/models.py CHANGED
@@ -68,7 +68,7 @@ AVAILABLE_MODELS = {
68
 
69
  # MaskGCT (by Amphion)
70
  # 'amphion/maskgct': 'amphion/maskgct', # DEMANDS 300 seconds of ZeroGPU!
71
- 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab', # DEMANDS 300 seconds of ZeroGPU!
72
 
73
  # GPT-SoVITS
74
  'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
@@ -79,10 +79,12 @@ AVAILABLE_MODELS = {
79
  # OuteTTS 1B
80
  # 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
81
 
 
 
82
  # llasa 3b TTS
83
- 'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts',
84
  # llasa 8b TTS
85
- 'srinivasbilla/llasa-8b-tts': 'srinivasbilla/llasa-8b-tts',
86
 
87
  # Mars5
88
  # 'CAMB-AI/mars5_space': 'CAMB-AI/mars5_space', # slow inference; Unstable
@@ -255,7 +257,8 @@ HF_SPACES = {
255
  'text_param_index': 'gen_text_input',
256
  'return_audio_index': 0,
257
  'is_zero_gpu_space': True,
258
- 'series': 'F5 TTS',
 
259
  },
260
 
261
  # E2 TTS TODO: call switch model function
@@ -265,7 +268,8 @@ HF_SPACES = {
265
  'text_param_index': 'gen_text_input',
266
  'return_audio_index': 0,
267
  'is_zero_gpu_space': True,
268
- 'series': 'E2 TTS',
 
269
  },
270
 
271
  # IMS-Toucan
@@ -338,7 +342,7 @@ HF_SPACES = {
338
  'return_audio_index': 0,
339
  'is_zero_gpu_space': True,
340
  'series': 'MaskGCT',
341
- # 'emoji': '🥵', # requires 300s reserved ZeroGPU!
342
  },
343
  'Svngoku/maskgct-audio-lab': {
344
  'name': 'MaskGCT',
@@ -347,7 +351,7 @@ HF_SPACES = {
347
  'return_audio_index': 0,
348
  'is_zero_gpu_space': True,
349
  'series': 'MaskGCT',
350
- # 'emoji': '🥵', # requires 300s reserved ZeroGPU!
351
  },
352
 
353
  # GPT-SoVITS v2
@@ -362,7 +366,7 @@ HF_SPACES = {
362
 
363
  # OuteTTS v0.2 500M
364
  'ameerazam08/OuteTTS-0.2-500M-Demo': {
365
- 'name': 'OuteTTS v2 500M',
366
  'function': '/generate_tts',
367
  'text_param_index': 0,
368
  'return_audio_index': 0,
@@ -372,7 +376,7 @@ HF_SPACES = {
372
  },
373
  # OuteTTS v0.3 1B
374
  'OuteAI/OuteTTS-0.3-1B-Demo': {
375
- 'name': 'OuteTTS v3 1B',
376
  'function': '/generate_tts',
377
  'text_param_index': 'text',
378
  'return_audio_index': 0,
@@ -381,6 +385,17 @@ HF_SPACES = {
381
  'emoji': '🥵', # requires 300s reserved ZeroGPU!
382
  },
383
 
 
 
 
 
 
 
 
 
 
 
 
384
  # LlaSa 3B
385
  'srinivasbilla/llasa-3b-tts': {
386
  'name': 'LLaSA 3B',
@@ -388,7 +403,8 @@ HF_SPACES = {
388
  'text_param_index': 'target_text',
389
  'return_audio_index': 0,
390
  'is_zero_gpu_space': True,
391
- 'series': 'llasa',
 
392
  },
393
 
394
  # LlaSa 8B
@@ -398,7 +414,8 @@ HF_SPACES = {
398
  'text_param_index': 'target_text',
399
  'return_audio_index': 0,
400
  'is_zero_gpu_space': True,
401
- 'series': 'llasa',
 
402
  },
403
 
404
  # Mars5
@@ -648,11 +665,14 @@ OVERRIDE_INPUTS = {
648
  'speaker_selection': "en_female_1",
649
  'reference_audio': None,
650
  },
 
 
 
651
  'srinivasbilla/llasa-3b-tts': {
652
- 'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3')
653
  },
654
  'srinivasbilla/llasa-8b-tts': {
655
- 'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3')
656
  },
657
 
658
  # MARS 5
@@ -774,7 +794,7 @@ closed_source = [
774
  ]
775
 
776
  # top five models in order to always have one of them picked and scrutinized
777
- top_five = ['Steveeeeeeen/Zonos', 'Steveeeeeeen/Zonos/hybrid']
778
 
779
  # prioritize low vote models
780
  sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
 
68
 
69
  # MaskGCT (by Amphion)
70
  # 'amphion/maskgct': 'amphion/maskgct', # DEMANDS 300 seconds of ZeroGPU!
71
+ # 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab', # DEMANDS 300 seconds of ZeroGPU!
72
 
73
  # GPT-SoVITS
74
  'lj1995/GPT-SoVITS-v2': 'lj1995/GPT-SoVITS-v2',
 
79
  # OuteTTS 1B
80
  # 'OuteAI/OuteTTS-0.3-1B-Demo': 'OuteAI/OuteTTS-0.3-1B-Demo',
81
 
82
+ # llasa 1b TTS
83
+ 'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': 'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers',
84
  # llasa 3b TTS
85
+ # 'srinivasbilla/llasa-3b-tts': 'srinivasbilla/llasa-3b-tts', # ZeroGPU Pro account expired
86
  # llasa 8b TTS
87
+ # 'srinivasbilla/llasa-8b-tts': 'srinivasbilla/llasa-8b-tts', # ZeroGPU Pro account expired
88
 
89
  # Mars5
90
  # 'CAMB-AI/mars5_space': 'CAMB-AI/mars5_space', # slow inference; Unstable
 
257
  'text_param_index': 'gen_text_input',
258
  'return_audio_index': 0,
259
  'is_zero_gpu_space': True,
260
+ # 'series': 'F5 TTS',
261
+ 'series': 'E2/F5 TTS',
262
  },
263
 
264
  # E2 TTS TODO: call switch model function
 
268
  'text_param_index': 'gen_text_input',
269
  'return_audio_index': 0,
270
  'is_zero_gpu_space': True,
271
+ # 'series': 'E2 TTS',
272
+ 'series': 'E2/F5 TTS',
273
  },
274
 
275
  # IMS-Toucan
 
342
  'return_audio_index': 0,
343
  'is_zero_gpu_space': True,
344
  'series': 'MaskGCT',
345
+ 'emoji': '🥵', # requires 300s reserved ZeroGPU!
346
  },
347
  'Svngoku/maskgct-audio-lab': {
348
  'name': 'MaskGCT',
 
351
  'return_audio_index': 0,
352
  'is_zero_gpu_space': True,
353
  'series': 'MaskGCT',
354
+ 'emoji': '🥵', # requires 300s reserved ZeroGPU!
355
  },
356
 
357
  # GPT-SoVITS v2
 
366
 
367
  # OuteTTS v0.2 500M
368
  'ameerazam08/OuteTTS-0.2-500M-Demo': {
369
+ 'name': 'OuteTTS v0.2 500M',
370
  'function': '/generate_tts',
371
  'text_param_index': 0,
372
  'return_audio_index': 0,
 
376
  },
377
  # OuteTTS v0.3 1B
378
  'OuteAI/OuteTTS-0.3-1B-Demo': {
379
+ 'name': 'OuteTTS v0.3 1B',
380
  'function': '/generate_tts',
381
  'text_param_index': 'text',
382
  'return_audio_index': 0,
 
385
  'emoji': '🥵', # requires 300s reserved ZeroGPU!
386
  },
387
 
388
+ # LlaSa 1B
389
+ 'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': {
390
+ 'name': 'LLaSA 1B',
391
+ 'function': '/predict',
392
+ 'text_param_index': 'input_text',
393
+ 'return_audio_index': 0,
394
+ 'is_zero_gpu_space': True,
395
+ 'series': 'LLaSA',
396
+ # 'emoji': '😷', # broken space
397
+ },
398
+
399
  # LlaSa 3B
400
  'srinivasbilla/llasa-3b-tts': {
401
  'name': 'LLaSA 3B',
 
403
  'text_param_index': 'target_text',
404
  'return_audio_index': 0,
405
  'is_zero_gpu_space': True,
406
+ 'series': 'LLaSA',
407
+ 'emoji': '😷', # broken space
408
  },
409
 
410
  # LlaSa 8B
 
414
  'text_param_index': 'target_text',
415
  'return_audio_index': 0,
416
  'is_zero_gpu_space': True,
417
+ 'series': 'LLaSA',
418
+ 'emoji': '😷', # broken space
419
  },
420
 
421
  # Mars5
 
665
  'speaker_selection': "en_female_1",
666
  'reference_audio': None,
667
  },
668
+ 'HKUST-Audio/Llasa-1B-finetuned-for-two-speakers': {
669
+ 'speaker_choice': 'kore',
670
+ },
671
  'srinivasbilla/llasa-3b-tts': {
672
+ 'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
673
  },
674
  'srinivasbilla/llasa-8b-tts': {
675
+ 'sample_audio_path': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
676
  },
677
 
678
  # MARS 5
 
794
  ]
795
 
796
  # top five models in order to always have one of them picked and scrutinized
797
+ top_five = ['HKUST-Audio/Llasa-1B-finetuned-for-two-speakers']
798
 
799
  # prioritize low vote models
800
  sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'