Spaces: Running on Zero
new TTS: sesame
Browse files
- app/leaderboard.py +1 -1
- app/models.py +22 -2
app/leaderboard.py
CHANGED
@@ -54,7 +54,7 @@ def get_leaderboard(reveal_prelim = False):
|
|
54 |
and '/' in orig_name
|
55 |
):
|
56 |
style = 'text-decoration: underline;text-decoration-style: dotted; color: var(--link-text-color);'
|
57 |
-
title = '
|
58 |
# win rate dataset
|
59 |
df.at[i, 'Win Rate'] = f'<a target="_blank" style="{style}" title="{title}" href="https://huggingface.co/datasets/{DB_DATASET_ID}/viewer/summary/rejections?f[rejected][value]=%27{orig_name}%27">' + df['Win Rate'].iloc[i] + '</a>'
|
60 |
df['Elo'] = round(df['Elo'])
|
|
|
54 |
and '/' in orig_name
|
55 |
):
|
56 |
style = 'text-decoration: underline;text-decoration-style: dotted; color: var(--link-text-color);'
|
57 |
+
title = 'See rejections'
|
58 |
# win rate dataset
|
59 |
df.at[i, 'Win Rate'] = f'<a target="_blank" style="{style}" title="{title}" href="https://huggingface.co/datasets/{DB_DATASET_ID}/viewer/summary/rejections?f[rejected][value]=%27{orig_name}%27">' + df['Win Rate'].iloc[i] + '</a>'
|
60 |
df['Elo'] = round(df['Elo'])
|
app/models.py
CHANGED
@@ -101,6 +101,9 @@ AVAILABLE_MODELS = {
|
|
101 |
# Spark
|
102 |
'thunnai/SparkTTS': 'thunnai/SparkTTS',
|
103 |
|
|
|
|
|
|
|
104 |
# HF TTS w issues
|
105 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
106 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
@@ -261,7 +264,7 @@ HF_SPACES = {
|
|
261 |
'text_param_index': 'text',
|
262 |
'return_audio_index': 0,
|
263 |
'series': 'Fish Speech',
|
264 |
-
'emoji': '😷',
|
265 |
},
|
266 |
|
267 |
# F5 TTS
|
@@ -481,6 +484,15 @@ HF_SPACES = {
|
|
481 |
'is_zero_gpu_space': True,
|
482 |
'series': 'Spark-TTS',
|
483 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
}
|
485 |
|
486 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
@@ -765,7 +777,15 @@ OVERRIDE_INPUTS = {
|
|
765 |
'prompt_text': DEFAULT_VOICE_TRANSCRIPT,
|
766 |
'prompt_wav_upload': DEFAULT_VOICE_SAMPLE,
|
767 |
'prompt_wav_record': None,
|
768 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
769 |
}
|
770 |
|
771 |
# minor mods to model from the same space
|
|
|
101 |
# Spark
|
102 |
'thunnai/SparkTTS': 'thunnai/SparkTTS',
|
103 |
|
104 |
+
# Sesame
|
105 |
+
'sesame/csm-1b' : 'sesame/csm-1b',
|
106 |
+
|
107 |
# HF TTS w issues
|
108 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
109 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
264 |
'text_param_index': 'text',
|
265 |
'return_audio_index': 0,
|
266 |
'series': 'Fish Speech',
|
267 |
+
# 'emoji': '😷',
|
268 |
},
|
269 |
|
270 |
# F5 TTS
|
|
|
484 |
'is_zero_gpu_space': True,
|
485 |
'series': 'Spark-TTS',
|
486 |
},
|
487 |
+
|
488 |
+
'sesame/csm-1b' : {
|
489 |
+
'name': 'sesame/csm-1b',
|
490 |
+
'function': '/infer',
|
491 |
+
'text_param_index': 'gen_conversation_input',
|
492 |
+
'return_audio_index': 0,
|
493 |
+
'is_zero_gpu_space': True,
|
494 |
+
'series': 'Spark-TTS',
|
495 |
+
},
|
496 |
}
|
497 |
|
498 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
777 |
'prompt_text': DEFAULT_VOICE_TRANSCRIPT,
|
778 |
'prompt_wav_upload': DEFAULT_VOICE_SAMPLE,
|
779 |
'prompt_wav_record': None,
|
780 |
+
},
|
781 |
+
|
782 |
+
# sesame/csm-1b
|
783 |
+
'sesame/csm-1b' : {
|
784 |
+
"text_prompt_speaker_a": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.",
|
785 |
+
"text_prompt_speaker_b": "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little like those of the sea eagle, and the ghost of his smile that flickered on his singularly pale face, with a stern and insidious look, confronted me.", #second speaker unused
|
786 |
+
"audio_prompt_speaker_a": handle_file('voice_samples/read_speech_a.wav'),
|
787 |
+
"audio_prompt_speaker_b": handle_file('voice_samples/read_speech_a.wav'), #second speaker unused
|
788 |
+
},
|
789 |
}
|
790 |
|
791 |
# minor mods to model from the same space
|