Resolved merge conflicts
Browse files- .DS_Store +0 -0
- .gitignore +3 -0
- app.py +65 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
.gitignore
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
# Python cache files
|
5 |
__pycache__/
|
6 |
*.pyc
|
|
|
7 |
|
8 |
|
9 |
echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
|
@@ -11,3 +12,5 @@ echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
|
|
11 |
|
12 |
echo "demo_audio/" >> .gitignore
|
13 |
demo_audio/
|
|
|
|
|
|
4 |
# Python cache files
|
5 |
__pycache__/
|
6 |
*.pyc
|
7 |
+
<<<<<<< HEAD
|
8 |
|
9 |
|
10 |
echo "demo_audio/notebookllm_starhealth_demo.wav" >> .gitignore
|
|
|
12 |
|
13 |
echo "demo_audio/" >> .gitignore
|
14 |
demo_audio/
|
15 |
+
=======
|
16 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
app.py
CHANGED
@@ -386,6 +386,17 @@ def update_speed(new_speed):
|
|
386 |
def process_audio(ref_audio_path):
|
387 |
return ref_audio_path
|
388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
with gr.Blocks(theme='gstaff/sketch') as app_tts:
|
390 |
gr.Markdown("# Batched TTS")
|
391 |
ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
@@ -508,9 +519,12 @@ def parse_emotional_text(gen_text):
|
|
508 |
|
509 |
return segments
|
510 |
|
|
|
511 |
def get_audio_file(audio_path):
|
512 |
return audio_path
|
513 |
|
|
|
|
|
514 |
with gr.Blocks() as app_emotional:
|
515 |
# New section for emotional generation
|
516 |
gr.Markdown(
|
@@ -521,7 +535,11 @@ with gr.Blocks() as app_emotional:
|
|
521 |
|
522 |
**Example Input:**
|
523 |
|
|
|
524 |
(Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
|
|
|
|
|
|
|
525 |
"""
|
526 |
)
|
527 |
|
@@ -532,6 +550,7 @@ with gr.Blocks() as app_emotional:
|
|
532 |
regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
|
533 |
regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
|
534 |
regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
|
|
|
535 |
download_regular_audio = gr.File(label="Download Regular Reference Audio")
|
536 |
|
537 |
regular_audio.change(
|
@@ -539,6 +558,8 @@ with gr.Blocks() as app_emotional:
|
|
539 |
inputs=regular_audio,
|
540 |
outputs=download_regular_audio
|
541 |
)
|
|
|
|
|
542 |
|
543 |
# Additional speech types (up to 99 more)
|
544 |
max_speech_types = 100
|
@@ -546,7 +567,10 @@ with gr.Blocks() as app_emotional:
|
|
546 |
speech_type_audios = []
|
547 |
speech_type_ref_texts = []
|
548 |
speech_type_delete_btns = []
|
|
|
549 |
download_speech_type_audios = []
|
|
|
|
|
550 |
|
551 |
for i in range(max_speech_types - 1):
|
552 |
with gr.Row():
|
@@ -554,11 +578,15 @@ with gr.Blocks() as app_emotional:
|
|
554 |
audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
|
555 |
ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
|
556 |
delete_btn = gr.Button("Delete", variant="secondary", visible=False)
|
|
|
557 |
download_audio_input = gr.File(label="Download Reference Audio", visible=False)
|
|
|
|
|
558 |
speech_type_names.append(name_input)
|
559 |
speech_type_audios.append(audio_input)
|
560 |
speech_type_ref_texts.append(ref_text_input)
|
561 |
speech_type_delete_btns.append(delete_btn)
|
|
|
562 |
download_speech_type_audios.append(download_audio_input)
|
563 |
|
564 |
audio_input.change(
|
@@ -566,6 +594,8 @@ with gr.Blocks() as app_emotional:
|
|
566 |
inputs=audio_input,
|
567 |
outputs=download_audio_input
|
568 |
)
|
|
|
|
|
569 |
|
570 |
# Button to add speech type
|
571 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
@@ -582,20 +612,29 @@ with gr.Blocks() as app_emotional:
|
|
582 |
audio_updates = []
|
583 |
ref_text_updates = []
|
584 |
delete_btn_updates = []
|
|
|
585 |
download_btn_updates = []
|
|
|
|
|
586 |
for i in range(max_speech_types - 1):
|
587 |
if i < speech_type_count:
|
588 |
name_updates.append(gr.update(visible=True))
|
589 |
audio_updates.append(gr.update(visible=True))
|
590 |
ref_text_updates.append(gr.update(visible=True))
|
591 |
delete_btn_updates.append(gr.update(visible=True))
|
|
|
592 |
download_btn_updates.append(gr.update(visible=True))
|
|
|
|
|
593 |
else:
|
594 |
name_updates.append(gr.update())
|
595 |
audio_updates.append(gr.update())
|
596 |
ref_text_updates.append(gr.update())
|
597 |
delete_btn_updates.append(gr.update())
|
|
|
598 |
download_btn_updates.append(gr.update())
|
|
|
|
|
599 |
else:
|
600 |
# Optionally, show a warning
|
601 |
# gr.Warning("Maximum number of speech types reached.")
|
@@ -603,13 +642,21 @@ with gr.Blocks() as app_emotional:
|
|
603 |
audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
604 |
ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
605 |
delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
|
|
606 |
download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
607 |
return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
|
|
|
|
|
|
|
608 |
|
609 |
add_speech_type_btn.click(
|
610 |
add_speech_type_fn,
|
611 |
inputs=speech_type_count,
|
|
|
612 |
outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
|
|
|
|
|
|
|
613 |
)
|
614 |
|
615 |
# Function to delete a speech type
|
@@ -770,6 +817,7 @@ with gr.Blocks() as app_emotional:
|
|
770 |
inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
|
771 |
outputs=generate_emotional_btn
|
772 |
)
|
|
|
773 |
|
774 |
with gr.Blocks() as app:
|
775 |
gr.Markdown(
|
@@ -777,6 +825,23 @@ with gr.Blocks() as app:
|
|
777 |
# TTS
|
778 |
|
779 |
This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
780 |
"""
|
781 |
)
|
782 |
gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])
|
|
|
386 |
def process_audio(ref_audio_path):
|
387 |
return ref_audio_path
|
388 |
|
389 |
+
<<<<<<< HEAD
|
390 |
+
=======
|
391 |
+
with gr.Blocks() as app_credits:
|
392 |
+
gr.Markdown("""
|
393 |
+
# Credits
|
394 |
+
|
395 |
+
* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
|
396 |
+
* [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
|
397 |
+
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
|
398 |
+
""")
|
399 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
400 |
with gr.Blocks(theme='gstaff/sketch') as app_tts:
|
401 |
gr.Markdown("# Batched TTS")
|
402 |
ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
|
|
519 |
|
520 |
return segments
|
521 |
|
522 |
+
<<<<<<< HEAD
|
523 |
def get_audio_file(audio_path):
|
524 |
return audio_path
|
525 |
|
526 |
+
=======
|
527 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
528 |
with gr.Blocks() as app_emotional:
|
529 |
# New section for emotional generation
|
530 |
gr.Markdown(
|
|
|
535 |
|
536 |
**Example Input:**
|
537 |
|
538 |
+
<<<<<<< HEAD
|
539 |
(Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, fuck you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
|
540 |
+
=======
|
541 |
+
(Regular) Hello, I'd like to order a sandwich please. (Surprised) What do you mean you're out of bread? (Sad) I really wanted a sandwich though... (Angry) You know what, darn you and your little shop, you suck! (Whisper) I'll just go back home and cry now. (Shouting) Why me?!
|
542 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
543 |
"""
|
544 |
)
|
545 |
|
|
|
550 |
regular_name = gr.Textbox(value='Regular', label='Speech Type Name', interactive=False)
|
551 |
regular_audio = gr.Audio(label='Regular Reference Audio', type='filepath')
|
552 |
regular_ref_text = gr.Textbox(label='Reference Text (Regular)', lines=2)
|
553 |
+
<<<<<<< HEAD
|
554 |
download_regular_audio = gr.File(label="Download Regular Reference Audio")
|
555 |
|
556 |
regular_audio.change(
|
|
|
558 |
inputs=regular_audio,
|
559 |
outputs=download_regular_audio
|
560 |
)
|
561 |
+
=======
|
562 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
563 |
|
564 |
# Additional speech types (up to 99 more)
|
565 |
max_speech_types = 100
|
|
|
567 |
speech_type_audios = []
|
568 |
speech_type_ref_texts = []
|
569 |
speech_type_delete_btns = []
|
570 |
+
<<<<<<< HEAD
|
571 |
download_speech_type_audios = []
|
572 |
+
=======
|
573 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
574 |
|
575 |
for i in range(max_speech_types - 1):
|
576 |
with gr.Row():
|
|
|
578 |
audio_input = gr.Audio(label='Reference Audio', type='filepath', visible=False)
|
579 |
ref_text_input = gr.Textbox(label='Reference Text', lines=2, visible=False)
|
580 |
delete_btn = gr.Button("Delete", variant="secondary", visible=False)
|
581 |
+
<<<<<<< HEAD
|
582 |
download_audio_input = gr.File(label="Download Reference Audio", visible=False)
|
583 |
+
=======
|
584 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
585 |
speech_type_names.append(name_input)
|
586 |
speech_type_audios.append(audio_input)
|
587 |
speech_type_ref_texts.append(ref_text_input)
|
588 |
speech_type_delete_btns.append(delete_btn)
|
589 |
+
<<<<<<< HEAD
|
590 |
download_speech_type_audios.append(download_audio_input)
|
591 |
|
592 |
audio_input.change(
|
|
|
594 |
inputs=audio_input,
|
595 |
outputs=download_audio_input
|
596 |
)
|
597 |
+
=======
|
598 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
599 |
|
600 |
# Button to add speech type
|
601 |
add_speech_type_btn = gr.Button("Add Speech Type")
|
|
|
612 |
audio_updates = []
|
613 |
ref_text_updates = []
|
614 |
delete_btn_updates = []
|
615 |
+
<<<<<<< HEAD
|
616 |
download_btn_updates = []
|
617 |
+
=======
|
618 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
619 |
for i in range(max_speech_types - 1):
|
620 |
if i < speech_type_count:
|
621 |
name_updates.append(gr.update(visible=True))
|
622 |
audio_updates.append(gr.update(visible=True))
|
623 |
ref_text_updates.append(gr.update(visible=True))
|
624 |
delete_btn_updates.append(gr.update(visible=True))
|
625 |
+
<<<<<<< HEAD
|
626 |
download_btn_updates.append(gr.update(visible=True))
|
627 |
+
=======
|
628 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
629 |
else:
|
630 |
name_updates.append(gr.update())
|
631 |
audio_updates.append(gr.update())
|
632 |
ref_text_updates.append(gr.update())
|
633 |
delete_btn_updates.append(gr.update())
|
634 |
+
<<<<<<< HEAD
|
635 |
download_btn_updates.append(gr.update())
|
636 |
+
=======
|
637 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
638 |
else:
|
639 |
# Optionally, show a warning
|
640 |
# gr.Warning("Maximum number of speech types reached.")
|
|
|
642 |
audio_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
643 |
ref_text_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
644 |
delete_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
645 |
+
<<<<<<< HEAD
|
646 |
download_btn_updates = [gr.update() for _ in range(max_speech_types - 1)]
|
647 |
return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates + download_btn_updates
|
648 |
+
=======
|
649 |
+
return [speech_type_count] + name_updates + audio_updates + ref_text_updates + delete_btn_updates
|
650 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
651 |
|
652 |
add_speech_type_btn.click(
|
653 |
add_speech_type_fn,
|
654 |
inputs=speech_type_count,
|
655 |
+
<<<<<<< HEAD
|
656 |
outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns + download_speech_type_audios
|
657 |
+
=======
|
658 |
+
outputs=[speech_type_count] + speech_type_names + speech_type_audios + speech_type_ref_texts + speech_type_delete_btns
|
659 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
660 |
)
|
661 |
|
662 |
# Function to delete a speech type
|
|
|
817 |
inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
|
818 |
outputs=generate_emotional_btn
|
819 |
)
|
820 |
+
<<<<<<< HEAD
|
821 |
|
822 |
with gr.Blocks() as app:
|
823 |
gr.Markdown(
|
|
|
825 |
# TTS
|
826 |
|
827 |
This is a local web UI for TTS with advanced batch processing support. This app supports the following TTS models:
|
828 |
+
=======
|
829 |
+
with gr.Blocks() as app:
|
830 |
+
gr.Markdown(
|
831 |
+
"""
|
832 |
+
# E2/F5 TTS
|
833 |
+
|
834 |
+
This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
|
835 |
+
|
836 |
+
* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
|
837 |
+
* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
|
838 |
+
|
839 |
+
The checkpoints support English and Chinese.
|
840 |
+
|
841 |
+
If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
|
842 |
+
|
843 |
+
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
|
844 |
+
>>>>>>> 92604a33e28acd02c841d163fca3c8b802cf15e5
|
845 |
"""
|
846 |
)
|
847 |
gr.TabbedInterface([app_tts, app_podcast, app_emotional], ["TTS", "Podcast", "Multi-Style"])
|