Staticaliza committed
Commit 6f22a09
Parent: cd84a0c

Update app.py

Files changed (1): app.py (+9 -41)
app.py CHANGED
@@ -1,3 +1,4 @@
+import spaces
 import re
 import torch
 import torchaudio
@@ -19,18 +20,6 @@ from transformers import pipeline
 import click
 import soundfile as sf
 
-try:
-    import spaces
-    USING_SPACES = True
-except ImportError:
-    USING_SPACES = False
-
-def gpu_decorator(func):
-    if USING_SPACES:
-        return spaces.GPU(func)
-    else:
-        return func
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -86,8 +75,8 @@ def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
 
 
 # load models
-F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {dim: 1024, depth: 22, heads: 16, ff_mult: 2, text_dim: 512, conv_layers: 4}, 1200000)
-E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {dim: 1024, depth: 24, heads: 16, ff_mult: 4}, 1200000)
+F5TTS_ema_model = load_model("F5-TTS", "F5TTS_Base", DiT, {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}, 1200000)
+E2TTS_ema_model = load_model("E2-TTS", "E2TTS_Base", UNetT, {"dim": 1024, "depth": 24, "heads": 16, "ff_mult": 4}, 1200000)
 
 def chunk_text(text, max_chars=135):
     """
@@ -116,7 +105,7 @@ def chunk_text(text, max_chars=135):
 
     return chunks
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -237,7 +226,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     return (target_sample_rate, final_wave), spectrogram_path
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
@@ -294,7 +283,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
 
 
-@gpu_decorator
+@spaces.GPU(duration=30)
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
     # Split the script into speaker blocks
     speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
@@ -630,7 +619,8 @@ with gr.Blocks() as app_emotional:
 
     # Output audio
    audio_output_emotional = gr.Audio(label="Synthesized Audio")
-    @gpu_decorator
+
+    @spaces.GPU(duration=30)
    def generate_emotional_speech(
        regular_audio,
        regular_ref_text,
@@ -748,27 +738,5 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     )
    gr.TabbedInterface([app_tts, app_podcast, app_emotional, app_credits], ["TTS", "Podcast", "Multi-Style", "Credits"])
 
-@click.command()
-@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
-@click.option("--host", "-H", default=None, help="Host to run the app on")
-@click.option(
-    "--share",
-    "-s",
-    default=False,
-    is_flag=True,
-    help="Share the app via Gradio share link",
-)
-@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
-def main(port, host, share, api):
-    global app
-    print(f"Starting app...")
-    app.queue(api_open=api).launch(
-        server_name=host, server_port=port, share=share, show_api=api
-    )
-
-
 if __name__ == "__main__":
-    if not USING_SPACES:
-        main()
-    else:
-        app.queue().launch()
+    app.queue().launch()
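
For context on the decorator swap: spaces.GPU is the Hugging Face ZeroGPU decorator, and its duration argument caps how long a single call may hold the GPU (here 30 seconds). Below is a minimal sketch of the adopted pattern next to the fallback it replaces; heavy_inference and its body are hypothetical stand-ins for illustration, not code from app.py:

    import spaces  # hard requirement now; on a machine without the package this import fails
    import torch

    # Removed fallback: decorate only when `spaces` is importable, so the
    # same script could also run outside Hugging Face Spaces.
    #
    # try:
    #     import spaces
    #     USING_SPACES = True
    # except ImportError:
    #     USING_SPACES = False
    #
    # def gpu_decorator(func):
    #     return spaces.GPU(func) if USING_SPACES else func

    # Adopted pattern: request a GPU lease of at most 30 s for each call.
    @spaces.GPU(duration=30)
    def heavy_inference(text):
        # Hypothetical stand-in for infer()/infer_batch(); inside the
        # decorated call a GPU is attached, so tensors can live on CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.zeros(1, device=device)  # placeholder for real synthesis

The same assumption shows up in the last hunk: with the optional-import guard gone there is no standalone-CLI path left, so the click-based main() is dropped and the app always launches via app.queue().launch().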