Shamik88 committed on
Commit
8850539
·
1 Parent(s): 9ec61b3
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. app.py +0 -65
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -386,17 +386,6 @@ def update_speed(new_speed):
386
  def process_audio(ref_audio_path):
387
  return ref_audio_path
388
 
389
- <<<<<<< HEAD
390
- =======
391
- with gr.Blocks() as app_credits:
392
- gr.Markdown("""
393
- # Credits
394
-
395
- * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
396
- * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
397
- * [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
398
- """)
399
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
400
  with gr.Blocks(theme='gstaff/sketch') as app_tts:
401
  gr.Markdown("# Batched TTS")
402
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
@@ -519,12 +508,9 @@ def parse_emotional_text(gen_text):
519
 
520
  return segments
521
 
522
- <<<<<<< HEAD
523
  def get_audio_file(audio_path):
524
  return audio_path
525
 
526
- =======
527
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
528
  with gr.Blocks() as app_emotional:
529
  # New section for emotional generation
530
  gr.Markdown(
@@ -535,11 +521,7 @@ with gr.Blocks() as app_emotional:
535
 
536
  **Example Input:**
537
 
538
- <<<<<<< HEAD
539
  (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
540
- =======
541
- (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
542
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
543
  """
544
  )
545
 
@@ -550,7 +532,6 @@ with gr.Blocks() as app_emotional:
550
  regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
551
  regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
552
  regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
553
- <<<<<<< HEAD
554
  download_regular_audio = gr.File(label="Download Regular Reference Audio")
555
 
556
  regular_audio.change(
@@ -558,8 +539,6 @@ with gr.Blocks() as app_emotional:
558
  inputs=regular_audio,
559
  outputs=download_regular_audio
560
  )
561
- =======
562
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
563
 
564
  # Additional speech types (up to 99 more)
565
  max_speech_types = 100
@@ -567,10 +546,7 @@ with gr.Blocks() as app_emotional:
567
  speech_type_audios = []
568
  speech_type_ref_texts = []
569
  speech_type_delete_btns = []
570
- <<<<<<< HEAD
571
  download_speech_type_audios = []
572
- =======
573
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
574
 
575
  for i in range(max_speech_types - 1):
576
  with gr.Row():
@@ -578,15 +554,11 @@ with gr.Blocks() as app_emotional:
578
  audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
579
  ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
580
  delete_btn = gr.Button("Delete", variant="secondary", visible=False)
581
- <<<<<<< HEAD
582
  download_audio_input = gr.File(label="Download Reference Audio", visible=False)
583
- =======
584
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
585
  speech_type_names.append(name_input)
586
  speech_type_audios.append(audio_input)
587
  speech_type_ref_texts.append(ref_text_input)
588
  speech_type_delete_btns.append(delete_btn)
589
- <<<<<<< HEAD
590
  download_speech_type_audios.append(download_audio_input)
591
 
592
  audio_input.change(
@@ -594,8 +566,6 @@ with gr.Blocks() as app_emotional:
594
  inputs=audio_input,
595
  outputs=download_audio_input
596
  )
597
- =======
598
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
599
 
600
  # Button to add speech type
601
  add_speech_type_btn = gr.Button("Add Speech Type")
@@ -612,29 +582,20 @@ with gr.Blocks() as app_emotional:
612
  audio_updates = []
613
  ref_text_updates = []
614
  delete_btn_updates = []
615
- <<<<<<< HEAD
616
  download_btn_updates = []
617
- =======
618
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
619
  for i in range(max_speech_types - 1):
620
  if i < speech_type_count:
621
  name_updates.append(gr.update(visible=True))
622
  audio_updates.append(gr.update(visible=True))
623
  ref_text_updates.append(gr.update(visible=True))
624
  delete_btn_updates.append(gr.update(visible=True))
625
- <<<<<<< HEAD
626
  download_btn_updates.append(gr.update(visible=True))
627
- =======
628
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
629
  else:
630
  name_updates.append(gr.update())
631
  audio_updates.append(gr.update())
632
  ref_text_updates.append(gr.update())
633
  delete_btn_updates.append(gr.update())
634
- <<<<<<< HEAD
635
  download_btn_updates.append(gr.update())
636
- =======
637
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
638
  else:
639
  # Optionally, show a warning
640
  # gr.Warning("Maximum number of speech types reached.")
@@ -642,21 +603,13 @@ with gr.Blocks() as app_emotional:
642
  audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
643
  ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
644
  delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
645
- <<<<<<< HEAD
646
  download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
647
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
648
- =======
649
- return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
650
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
651
 
652
  add_speech_type_btn.click(
653
  add_speech_type_fn,
654
  inputs=speech_type_count,
655
- <<<<<<< HEAD
656
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
657
- =======
658
- outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
659
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
660
  )
661
 
662
  # Function to delete a speech type
@@ -817,7 +770,6 @@ with gr.Blocks() as app_emotional:
817
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
818
  outputs=generate_emotional_btn
819
  )
820
- <<<<<<< HEAD
821
 
822
  with gr.Blocks() as app:
823
  gr.Markdown(
@@ -825,23 +777,6 @@ with gr.Blocks() as app:
825
  # TTS
826
 
827
  This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
828
- =======
829
- with gr.Blocks() as app:
830
- gr.Markdown(
831
- """
832
- # E2/F5 TTS
833
-
834
- This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
835
-
836
- * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
837
- * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
838
-
839
- The checkpoints support English and Chinese.
840
-
841
- If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
842
-
843
- **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
844
- >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
845
  """
846
  )
847
  gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])
 
386
  def process_audio(ref_audio_path):
387
  return ref_audio_path
388
 
 
 
 
 
 
 
 
 
 
 
 
389
  with gr.Blocks(theme='gstaff/sketch') as app_tts:
390
  gr.Markdown("# Batched TTS")
391
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
 
508
 
509
  return segments
510
 
 
511
  def get_audio_file(audio_path):
512
  return audio_path
513
 
 
 
514
  with gr.Blocks() as app_emotional:
515
  # New section for emotional generation
516
  gr.Markdown(
 
521
 
522
  **Example Input:**
523
 
 
524
  (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
 
 
 
525
  """
526
  )
527
 
 
532
  regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
533
  regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
534
  regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
 
535
  download_regular_audio = gr.File(label="Download Regular Reference Audio")
536
 
537
  regular_audio.change(
 
539
  inputs=regular_audio,
540
  outputs=download_regular_audio
541
  )
 
 
542
 
543
  # Additional speech types (up to 99 more)
544
  max_speech_types = 100
 
546
  speech_type_audios = []
547
  speech_type_ref_texts = []
548
  speech_type_delete_btns = []
 
549
  download_speech_type_audios = []
 
 
550
 
551
  for i in range(max_speech_types - 1):
552
  with gr.Row():
 
554
  audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
555
  ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
556
  delete_btn = gr.Button("Delete", variant="secondary", visible=False)
 
557
  download_audio_input = gr.File(label="Download Reference Audio", visible=False)
 
 
558
  speech_type_names.append(name_input)
559
  speech_type_audios.append(audio_input)
560
  speech_type_ref_texts.append(ref_text_input)
561
  speech_type_delete_btns.append(delete_btn)
 
562
  download_speech_type_audios.append(download_audio_input)
563
 
564
  audio_input.change(
 
566
  inputs=audio_input,
567
  outputs=download_audio_input
568
  )
 
 
569
 
570
  # Button to add speech type
571
  add_speech_type_btn = gr.Button("Add Speech Type")
 
582
  audio_updates = []
583
  ref_text_updates = []
584
  delete_btn_updates = []
 
585
  download_btn_updates = []
 
 
586
  for i in range(max_speech_types - 1):
587
  if i < speech_type_count:
588
  name_updates.append(gr.update(visible=True))
589
  audio_updates.append(gr.update(visible=True))
590
  ref_text_updates.append(gr.update(visible=True))
591
  delete_btn_updates.append(gr.update(visible=True))
 
592
  download_btn_updates.append(gr.update(visible=True))
 
 
593
  else:
594
  name_updates.append(gr.update())
595
  audio_updates.append(gr.update())
596
  ref_text_updates.append(gr.update())
597
  delete_btn_updates.append(gr.update())
 
598
  download_btn_updates.append(gr.update())
 
 
599
  else:
600
  # Optionally, show a warning
601
  # gr.Warning("Maximum number of speech types reached.")
 
603
  audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
604
  ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
605
  delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
 
606
  download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
607
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
 
 
 
608
 
609
  add_speech_type_btn.click(
610
  add_speech_type_fn,
611
  inputs=speech_type_count,
 
612
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
 
 
 
613
  )
614
 
615
  # Function to delete a speech type
 
770
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
771
  outputs=generate_emotional_btn
772
  )
 
773
 
774
  with gr.Blocks() as app:
775
  gr.Markdown(
 
777
  # TTS
778
 
779
  This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
  """
781
  )
782
  gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])