Staticaliza committed: Update app.py
Commit • 6f22a09 • Parent(s): cd84a0c

app.py CHANGED
@@ -1,3 +1,4 @@
+import spaces
 import re
 import torch
 import torchaudio
@@ -19,18 +20,6 @@ from transformers import pipeline
 import click
 import soundfile as sf
 
-try:
-    import spaces
-    USING_SPACES = True
-except ImportError:
-    USING_SPACES = False
-
-def gpu_decorator(func):
-    if USING_SPACES:
-        return spaces.GPU(func)
-    else:
-        return func
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -86,8 +75,8 @@ def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
 
 
 # load models
-F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {dim: 1024, depth: 22, heads: 16, ff_mult: 2, text_dim: 512, conv_layers: 4}, 1200000)
-E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {dim: 1024, depth: 24, heads: 16, ff_mult: 4}, 1200000)
+F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}, 1200000)
+E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {"dim": 1024, "depth": 24, "heads": 16, "ff_mult": 4}, 1200000)
 
 def chunk_text(text, max_chars=135):
     """
@@ -116,7 +105,7 @@ def chunk_text(text, max_chars=135):
 
     return chunks
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -237,7 +226,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     return (target_sample_rate, final_wave), spectrogram_path
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
@@ -294,7 +283,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
 
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
     # Split the script into speaker blocks
     speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
@@ -630,7 +619,8 @@ with gr.Blocks() as app_emotional:
 
     # Output audio
     audio_output_emotional = gr.Audio(label="Synthesized Audio")
-
+
+    @spaces.GPU(duration=30)
     def generate_emotional_speech(
         regular_audio,
        regular_ref_text,
@@ -748,27 +738,5 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
 )
 gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
-@click.command()
-@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
-@click.option("--host", "-H", default=None, help="Host to run the app on")
-@click.option(
-    "--share",
-    "-s",
-    default=False,
-    is_flag=True,
-    help="Share the app via Gradio share link",
-)
-@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
-def main(port, host, share, api):
-    global app
-    print(f"Starting app...")
-    app.queue(api_open=api).launch(
-        server_name=host, server_port=port, share=share, show_api=api
-    )
-
-
 if __name__ == "__main__":
-
-    main()
-else:
-    app.queue().launch()
+    app.queue().launch()
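
For context, a minimal sketch of the ZeroGPU pattern this commit adopts. It assumes the spaces package that Hugging Face provides on ZeroGPU Spaces; synthesize and its body are illustrative placeholders, not the app's actual inference code:

    import spaces
    import torch

    @spaces.GPU(duration=30)  # request a ZeroGPU slot for up to 30 seconds per call
    def synthesize(text):
        # On ZeroGPU, the GPU is attached only while a @spaces.GPU-decorated
        # call is running, so CUDA work belongs inside the decorated function.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.zeros(1, device=device)  # stand-in for real TTS synthesis

Note that import spaces is now unconditional at the top of app.py, so the old try/except gpu_decorator fallback is gone: the app will raise ImportError instead of degrading to CPU when run outside a Space.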