Staticaliza committed
Commit 6f22a09
Parent: cd84a0c

Update app.py

Files changed (1): app.py (+9 -41)
app.py CHANGED
@@ -1,3 +1,4 @@
+import spaces
 import re
 import torch
 import torchaudio
@@ -19,18 +20,6 @@ from transformers import pipeline
 import click
 import soundfile as sf
 
-try:
-    import spaces
-    USING_SPACES = True
-except ImportError:
-    USING_SPACES = False
-
-def gpu_decorator(func):
-    if USING_SPACES:
-        return spaces.GPU(func)
-    else:
-        return func
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -86,8 +75,8 @@ def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
 
 
 # load models
-F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {dim: 1024, depth: 22, heads: 16, ff_mult: 2, text_dim: 512, conv_layers: 4}, 1200000)
-E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {dim: 1024, depth: 24, heads: 16, ff_mult: 4}, 1200000)
+F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}, 1200000)
+E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {"dim": 1024, "depth": 24, "heads": 16, "ff_mult": 4}, 1200000)
 
 def chunk_text(text, max_chars=135):
     """
@@ -116,7 +105,7 @@ def chunk_text(text, max_chars=135):
 
     return chunks
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -237,7 +226,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     return (target_sample_rate, final_wave), spectrogram_path
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
@@ -294,7 +283,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
 
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
     # Split the script into speaker blocks
     speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
@@ -630,7 +619,8 @@ with gr.Blocks() as app_emotional:
 
     # Output audio
    audio_output_emotional = gr.Audio(label="Synthesized Audio")
-    @gpu_decorator
+
+    @spaces.GPU(duration=30)
    def generate_emotional_speech(
        regular_audio,
        regular_ref_text,
@@ -748,27 +738,5 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     )
    gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
-@click.command()
-@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
-@click.option("--host", "-H", default=None, help="Host to run the app on")
-@click.option(
-    "--share",
-    "-s",
-    default=False,
-    is_flag=True,
-    help="Share the app via Gradio share link",
-)
-@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
-def main(port, host, share, api):
-    global app
-    print(f"Starting app...")
-    app.queue(api_open=api).launch(
-        server_name=host, server_port=port, share=share, show_api=api
-    )
-
-
 if __name__ == "__main__":
-    if not USING_SPACES:
-        main()
-    else:
-        app.queue().launch()
+    app.queue().launch()
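
For context on the decorator swap: spaces.GPU is the Hugging Face ZeroGPU decorator, and its duration argument caps how long a single call may hold the GPU (here 30 seconds). Below is a minimal sketch of the adopted pattern next to the fallback it replaces; heavy_inference and its body are hypothetical stand-ins for illustration, not code from app.py:

    import spaces  # hard requirement now; on a machine without the package this import fails
    import torch

    # Removed fallback: decorate only when `spaces` is importable, so the
    # same script could also run outside Hugging Face Spaces.
    #
    # try:
    #     import spaces
    #     USING_SPACES = True
    # except ImportError:
    #     USING_SPACES = False
    #
    # def gpu_decorator(func):
    #     return spaces.GPU(func) if USING_SPACES else func

    # Adopted pattern: request a GPU lease of at most 30 s for each call.
    @spaces.GPU(duration=30)
    def heavy_inference(text):
        # Hypothetical stand-in for infer()/infer_batch(); inside the
        # decorated call a GPU is attached, so tensors can live on CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.zeros(1, device=device)  # placeholder for real synthesis

The same assumption shows up in the last hunk: with the optional-import guard gone there is no standalone-CLI path left, so the click-based main() is dropped and the app always launches via app.queue().launch().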