Shamik88 committed on
Commit
0f307eb
2 Parent(s): 0913f3e 92604a3

Resolved merge conflicts

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. .gitignore +3 -0
  3. app.py +65 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore CHANGED
@@ -4,6 +4,7 @@
4
  # Python cache files
5
  __pycache__/
6
  *.pyc
 
7
 
8
 
9
  echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
@@ -11,3 +12,5 @@ echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
11
 
12
  echo "demo_audio/" >> .gitignore
13
  demo_audio/
 
 
 
4
  # Python cache files
5
  __pycache__/
6
  *.pyc
7
+ <<<<<<< HEAD
8
 
9
 
10
  echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
 
12
 
13
  echo "demo_audio/" >> .gitignore
14
  demo_audio/
15
+ =======
16
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
app.py CHANGED
@@ -386,6 +386,17 @@ def update_speed(new_speed):
386
  def process_audio(ref_audio_path):
387
  return ref_audio_path
388
 
 
 
 
 
 
 
 
 
 
 
 
389
  with gr.Blocks(theme='gstaff/sketch') as app_tts:
390
  gr.Markdown("# Batched TTS")
391
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
@@ -508,9 +519,12 @@ def parse_emotional_text(gen_text):
508
 
509
  return segments
510
 
 
511
  def get_audio_file(audio_path):
512
  return audio_path
513
 
 
 
514
  with gr.Blocks() as app_emotional:
515
  # New section for emotional generation
516
  gr.Markdown(
@@ -521,7 +535,11 @@ with gr.Blocks() as app_emotional:
521
 
522
  **Example Input:**
523
 
 
524
  (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
 
 
 
525
  """
526
  )
527
 
@@ -532,6 +550,7 @@ with gr.Blocks() as app_emotional:
532
  regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
533
  regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
534
  regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
 
535
  download_regular_audio = gr.File(label="Download Regular Reference Audio")
536
 
537
  regular_audio.change(
@@ -539,6 +558,8 @@ with gr.Blocks() as app_emotional:
539
  inputs=regular_audio,
540
  outputs=download_regular_audio
541
  )
 
 
542
 
543
  # Additional speech types (up to 99 more)
544
  max_speech_types = 100
@@ -546,7 +567,10 @@ with gr.Blocks() as app_emotional:
546
  speech_type_audios = []
547
  speech_type_ref_texts = []
548
  speech_type_delete_btns = []
 
549
  download_speech_type_audios = []
 
 
550
 
551
  for i in range(max_speech_types - 1):
552
  with gr.Row():
@@ -554,11 +578,15 @@ with gr.Blocks() as app_emotional:
554
  audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
555
  ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
556
  delete_btn = gr.Button("Delete", variant="secondary", visible=False)
 
557
  download_audio_input = gr.File(label="Download Reference Audio", visible=False)
 
 
558
  speech_type_names.append(name_input)
559
  speech_type_audios.append(audio_input)
560
  speech_type_ref_texts.append(ref_text_input)
561
  speech_type_delete_btns.append(delete_btn)
 
562
  download_speech_type_audios.append(download_audio_input)
563
 
564
  audio_input.change(
@@ -566,6 +594,8 @@ with gr.Blocks() as app_emotional:
566
  inputs=audio_input,
567
  outputs=download_audio_input
568
  )
 
 
569
 
570
  # Button to add speech type
571
  add_speech_type_btn = gr.Button("Add Speech Type")
@@ -582,20 +612,29 @@ with gr.Blocks() as app_emotional:
582
  audio_updates = []
583
  ref_text_updates = []
584
  delete_btn_updates = []
 
585
  download_btn_updates = []
 
 
586
  for i in range(max_speech_types - 1):
587
  if i < speech_type_count:
588
  name_updates.append(gr.update(visible=True))
589
  audio_updates.append(gr.update(visible=True))
590
  ref_text_updates.append(gr.update(visible=True))
591
  delete_btn_updates.append(gr.update(visible=True))
 
592
  download_btn_updates.append(gr.update(visible=True))
 
 
593
  else:
594
  name_updates.append(gr.update())
595
  audio_updates.append(gr.update())
596
  ref_text_updates.append(gr.update())
597
  delete_btn_updates.append(gr.update())
 
598
  download_btn_updates.append(gr.update())
 
 
599
  else:
600
  # Optionally, show a warning
601
  # gr.Warning("Maximum number of speech types reached.")
@@ -603,13 +642,21 @@ with gr.Blocks() as app_emotional:
603
  audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
604
  ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
605
  delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
 
606
  download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
607
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
 
 
 
608
 
609
  add_speech_type_btn.click(
610
  add_speech_type_fn,
611
  inputs=speech_type_count,
 
612
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
 
 
 
613
  )
614
 
615
  # Function to delete a speech type
@@ -770,6 +817,7 @@ with gr.Blocks() as app_emotional:
770
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
771
  outputs=generate_emotional_btn
772
  )
 
773
 
774
  with gr.Blocks() as app:
775
  gr.Markdown(
@@ -777,6 +825,23 @@ with gr.Blocks() as app:
777
  # TTS
778
 
779
  This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
  """
781
  )
782
  gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])
 
386
  def process_audio(ref_audio_path):
387
  return ref_audio_path
388
 
389
+ <<<<<<< HEAD
390
+ =======
391
+ with gr.Blocks() as app_credits:
392
+ gr.Markdown("""
393
+ # Credits
394
+
395
+ * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
396
+ * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
397
+ * [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
398
+ """)
399
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
400
  with gr.Blocks(theme='gstaff/sketch') as app_tts:
401
  gr.Markdown("# Batched TTS")
402
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
 
519
 
520
  return segments
521
 
522
+ <<<<<<< HEAD
523
  def get_audio_file(audio_path):
524
  return audio_path
525
 
526
+ =======
527
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
528
  with gr.Blocks() as app_emotional:
529
  # New section for emotional generation
530
  gr.Markdown(
 
535
 
536
  **Example Input:**
537
 
538
+ <<<<<<< HEAD
539
  (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
540
+ =======
541
+ (Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
542
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
543
  """
544
  )
545
 
 
550
  regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
551
  regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
552
  regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
553
+ <<<<<<< HEAD
554
  download_regular_audio = gr.File(label="Download Regular Reference Audio")
555
 
556
  regular_audio.change(
 
558
  inputs=regular_audio,
559
  outputs=download_regular_audio
560
  )
561
+ =======
562
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
563
 
564
  # Additional speech types (up to 99 more)
565
  max_speech_types = 100
 
567
  speech_type_audios = []
568
  speech_type_ref_texts = []
569
  speech_type_delete_btns = []
570
+ <<<<<<< HEAD
571
  download_speech_type_audios = []
572
+ =======
573
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
574
 
575
  for i in range(max_speech_types - 1):
576
  with gr.Row():
 
578
  audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
579
  ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
580
  delete_btn = gr.Button("Delete", variant="secondary", visible=False)
581
+ <<<<<<< HEAD
582
  download_audio_input = gr.File(label="Download Reference Audio", visible=False)
583
+ =======
584
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
585
  speech_type_names.append(name_input)
586
  speech_type_audios.append(audio_input)
587
  speech_type_ref_texts.append(ref_text_input)
588
  speech_type_delete_btns.append(delete_btn)
589
+ <<<<<<< HEAD
590
  download_speech_type_audios.append(download_audio_input)
591
 
592
  audio_input.change(
 
594
  inputs=audio_input,
595
  outputs=download_audio_input
596
  )
597
+ =======
598
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
599
 
600
  # Button to add speech type
601
  add_speech_type_btn = gr.Button("Add Speech Type")
 
612
  audio_updates = []
613
  ref_text_updates = []
614
  delete_btn_updates = []
615
+ <<<<<<< HEAD
616
  download_btn_updates = []
617
+ =======
618
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
619
  for i in range(max_speech_types - 1):
620
  if i < speech_type_count:
621
  name_updates.append(gr.update(visible=True))
622
  audio_updates.append(gr.update(visible=True))
623
  ref_text_updates.append(gr.update(visible=True))
624
  delete_btn_updates.append(gr.update(visible=True))
625
+ <<<<<<< HEAD
626
  download_btn_updates.append(gr.update(visible=True))
627
+ =======
628
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
629
  else:
630
  name_updates.append(gr.update())
631
  audio_updates.append(gr.update())
632
  ref_text_updates.append(gr.update())
633
  delete_btn_updates.append(gr.update())
634
+ <<<<<<< HEAD
635
  download_btn_updates.append(gr.update())
636
+ =======
637
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
638
  else:
639
  # Optionally, show a warning
640
  # gr.Warning("Maximum number of speech types reached.")
 
642
  audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
643
  ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
644
  delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
645
+ <<<<<<< HEAD
646
  download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
647
  return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
648
+ =======
649
+ return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
650
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
651
 
652
  add_speech_type_btn.click(
653
  add_speech_type_fn,
654
  inputs=speech_type_count,
655
+ <<<<<<< HEAD
656
  outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
657
+ =======
658
+ outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
659
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
660
  )
661
 
662
  # Function to delete a speech type
 
817
  inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
818
  outputs=generate_emotional_btn
819
  )
820
+ <<<<<<< HEAD
821
 
822
  with gr.Blocks() as app:
823
  gr.Markdown(
 
825
  # TTS
826
 
827
  This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
828
+ =======
829
+ with gr.Blocks() as app:
830
+ gr.Markdown(
831
+ """
832
+ # E2/F5 TTS
833
+
834
+ This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
835
+
836
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
837
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
838
+
839
+ The checkpoints support English and Chinese.
840
+
841
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
842
+
843
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
844
+ >>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
845
  """
846
  )
847
  gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])