Replaced Encodec with Vocos
Browse files
app.py
CHANGED
@@ -323,7 +323,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
|
|
323 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
324 |
|
325 |
|
326 |
-
|
327 |
@torch.no_grad()
|
328 |
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
|
329 |
"""
|
@@ -331,11 +331,9 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
|
|
331 |
fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
|
332 |
sliding-window: This mode will use the last sentence as the prompt for the next sentence, but the speaker's voice may not stay consistent across sentences.
|
333 |
"""
|
334 |
-
from utils.sentence_cutter import split_text_into_sentences
|
335 |
if len(text) > 1000:
|
336 |
return "Rejected, Text too long (should be less than 1000 characters)", None
|
337 |
mode = 'fixed-prompt'
|
338 |
-
global model, audio_tokenizer, text_tokenizer, text_collater
|
339 |
if (prompt is None or prompt == "") and preset_prompt == "":
|
340 |
mode = 'sliding-window' # If no prompt is given, use sliding-window mode
|
341 |
sentences = split_text_into_sentences(text)
|
@@ -463,122 +461,113 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
|
|
463 |
else:
|
464 |
raise ValueError(f"No such mode {mode}")
|
465 |
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
with
|
470 |
-
gr.Markdown(
|
471 |
-
with gr.
|
472 |
-
gr.
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
with gr.
|
507 |
-
gr.
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
with gr.
|
532 |
-
gr.
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
with gr.
|
557 |
-
gr.
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
app.launch()
|
578 |
-
|
579 |
-
if __name__ == "__main__":
|
580 |
-
formatter = (
|
581 |
-
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
582 |
-
)
|
583 |
-
logging.basicConfig(format=formatter, level=logging.INFO)
|
584 |
-
main()
|
|
|
323 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
324 |
|
325 |
|
326 |
+
from utils.sentence_cutter import split_text_into_sentences
|
327 |
@torch.no_grad()
|
328 |
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
|
329 |
"""
|
|
|
331 |
fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
|
332 |
sliding-window: This mode will use the last sentence as the prompt for the next sentence, but the speaker's voice may not stay consistent across sentences.
|
333 |
"""
|
|
|
334 |
if len(text) > 1000:
|
335 |
return "Rejected, Text too long (should be less than 1000 characters)", None
|
336 |
mode = 'fixed-prompt'
|
|
|
337 |
if (prompt is None or prompt == "") and preset_prompt == "":
|
338 |
mode = 'sliding-window' # If no prompt is given, use sliding-window mode
|
339 |
sentences = split_text_into_sentences(text)
|
|
|
461 |
else:
|
462 |
raise ValueError(f"No such mode {mode}")
|
463 |
|
464 |
+
app = gr.Blocks()
|
465 |
+
with app:
|
466 |
+
gr.Markdown(top_md)
|
467 |
+
with gr.Tab("Infer from audio"):
|
468 |
+
gr.Markdown(infer_from_audio_md)
|
469 |
+
with gr.Row():
|
470 |
+
with gr.Column():
|
471 |
+
|
472 |
+
textbox = gr.TextArea(label="Text",
|
473 |
+
placeholder="Type your sentence here",
|
474 |
+
value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
|
475 |
+
language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect', label='language')
|
476 |
+
accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
|
477 |
+
textbox_transcript = gr.TextArea(label="Transcript",
|
478 |
+
placeholder="Write transcript here. (leave empty to use whisper)",
|
479 |
+
value="", elem_id=f"prompt-name")
|
480 |
+
upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
|
481 |
+
record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
|
482 |
+
with gr.Column():
|
483 |
+
text_output = gr.Textbox(label="Message")
|
484 |
+
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
485 |
+
btn = gr.Button("Generate!")
|
486 |
+
btn.click(infer_from_audio,
|
487 |
+
inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
488 |
+
outputs=[text_output, audio_output])
|
489 |
+
textbox_mp = gr.TextArea(label="Prompt name",
|
490 |
+
placeholder="Name your prompt here",
|
491 |
+
value="prompt_1", elem_id=f"prompt-name")
|
492 |
+
btn_mp = gr.Button("Make prompt!")
|
493 |
+
prompt_output = gr.File(interactive=False)
|
494 |
+
btn_mp.click(make_npz_prompt,
|
495 |
+
inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
496 |
+
outputs=[text_output, prompt_output])
|
497 |
+
gr.Examples(examples=infer_from_audio_examples,
|
498 |
+
inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
499 |
+
outputs=[text_output, audio_output],
|
500 |
+
fn=infer_from_audio,
|
501 |
+
cache_examples=False,)
|
502 |
+
with gr.Tab("Make prompt"):
|
503 |
+
gr.Markdown(make_prompt_md)
|
504 |
+
with gr.Row():
|
505 |
+
with gr.Column():
|
506 |
+
textbox2 = gr.TextArea(label="Prompt name",
|
507 |
+
placeholder="Name your prompt here",
|
508 |
+
value="prompt_1", elem_id=f"prompt-name")
|
509 |
+
# Add a place to select the language and enter the transcript
|
510 |
+
textbox_transcript2 = gr.TextArea(label="Transcript",
|
511 |
+
placeholder="Write transcript here. (leave empty to use whisper)",
|
512 |
+
value="", elem_id=f"prompt-name")
|
513 |
+
upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
|
514 |
+
record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
|
515 |
+
with gr.Column():
|
516 |
+
text_output_2 = gr.Textbox(label="Message")
|
517 |
+
prompt_output_2 = gr.File(interactive=False)
|
518 |
+
btn_2 = gr.Button("Make!")
|
519 |
+
btn_2.click(make_npz_prompt,
|
520 |
+
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
521 |
+
outputs=[text_output_2, prompt_output_2])
|
522 |
+
gr.Examples(examples=make_npz_prompt_examples,
|
523 |
+
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
524 |
+
outputs=[text_output_2, prompt_output_2],
|
525 |
+
fn=make_npz_prompt,
|
526 |
+
cache_examples=False,)
|
527 |
+
with gr.Tab("Infer from prompt"):
|
528 |
+
gr.Markdown(infer_from_prompt_md)
|
529 |
+
with gr.Row():
|
530 |
+
with gr.Column():
|
531 |
+
textbox_3 = gr.TextArea(label="Text",
|
532 |
+
placeholder="Type your sentence here",
|
533 |
+
value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
|
534 |
+
language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
|
535 |
+
label='language')
|
536 |
+
accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
|
537 |
+
label='accent')
|
538 |
+
preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
|
539 |
+
prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
|
540 |
+
with gr.Column():
|
541 |
+
text_output_3 = gr.Textbox(label="Message")
|
542 |
+
audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
543 |
+
btn_3 = gr.Button("Generate!")
|
544 |
+
btn_3.click(infer_from_prompt,
|
545 |
+
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
546 |
+
outputs=[text_output_3, audio_output_3])
|
547 |
+
gr.Examples(examples=infer_from_prompt_examples,
|
548 |
+
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
549 |
+
outputs=[text_output_3, audio_output_3],
|
550 |
+
fn=infer_from_prompt,
|
551 |
+
cache_examples=False,)
|
552 |
+
with gr.Tab("Infer long text"):
|
553 |
+
gr.Markdown(long_text_md)
|
554 |
+
with gr.Row():
|
555 |
+
with gr.Column():
|
556 |
+
textbox_4 = gr.TextArea(label="Text",
|
557 |
+
placeholder="Type your sentence here",
|
558 |
+
value=long_text_example, elem_id=f"tts-input")
|
559 |
+
language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
|
560 |
+
label='language')
|
561 |
+
accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
|
562 |
+
label='accent')
|
563 |
+
preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
|
564 |
+
prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
|
565 |
+
with gr.Column():
|
566 |
+
text_output_4 = gr.TextArea(label="Message")
|
567 |
+
audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
568 |
+
btn_4 = gr.Button("Generate!")
|
569 |
+
btn_4.click(infer_long_text,
|
570 |
+
inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
|
571 |
+
outputs=[text_output_4, audio_output_4])
|
572 |
+
|
573 |
+
app.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|