Spaces:
Build error
Build error
jason-on-salt-a40
committed on
Commit
•
cf33c41
1
Parent(s):
93adc07
use updated model and prompt
Browse files
- app.py +29 -42
- demo/YOU1000000115_S0000252.wav +0 -0
app.py
CHANGED
@@ -78,9 +78,14 @@ class WhisperxModel:
|
|
78 |
def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
|
79 |
global transcribe_model, align_model, voicecraft_model
|
80 |
|
81 |
-
if voicecraft_model_name == "
|
|
|
|
|
|
|
|
|
82 |
voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
|
83 |
-
|
|
|
84 |
if alignment_model_name is not None:
|
85 |
align_model = WhisperxAlignModel()
|
86 |
|
@@ -365,50 +370,32 @@ If disabled, you should write the target transcript yourself:</br>
|
|
365 |
- In Edit mode write full prompt</br>
|
366 |
"""
|
367 |
|
368 |
-
demo_original_transcript = "
|
369 |
|
370 |
demo_text = {
|
371 |
"TTS": {
|
372 |
"smart": "I cannot believe that the same model can also do text to speech synthesis too!",
|
373 |
-
"regular": "
|
374 |
},
|
375 |
"Edit": {
|
376 |
-
"smart": "
|
377 |
-
"regular": "
|
378 |
},
|
379 |
"Long TTS": {
|
380 |
"smart": "You can run the model on a big text!\n"
|
381 |
"Just write it line-by-line. Or sentence-by-sentence.\n"
|
382 |
"If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
|
383 |
-
"regular": "
|
384 |
-
"
|
385 |
-
"
|
386 |
}
|
387 |
}
|
388 |
|
389 |
all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
|
390 |
|
391 |
-
demo_words = [
|
392 |
-
|
393 |
-
|
394 |
-
'3.717 which 3.898', '3.958 the 4.058', '4.098 sense 4.359', '4.419 deceives, 4.92', '5.101 lost 5.481', '5.682 not 5.963',
|
395 |
-
'6.043 by 6.183', '6.223 distance 6.644', '6.905 any 7.065', '7.125 of 7.185', '7.245 its 7.346', '7.406 marks. 7.727'
|
396 |
-
]
|
397 |
-
|
398 |
-
demo_words_info = [
|
399 |
-
{'word': 'But', 'start': 0.029, 'end': 0.149, 'score': 0.834}, {'word': 'when', 'start': 0.189, 'end': 0.33, 'score': 0.879},
|
400 |
-
{'word': 'I', 'start': 0.43, 'end': 0.49, 'score': 0.984}, {'word': 'had', 'start': 0.53, 'end': 0.65, 'score': 0.998},
|
401 |
-
{'word': 'approached', 'start': 0.711, 'end': 1.152, 'score': 0.822}, {'word': 'so', 'start': 1.352, 'end': 1.593, 'score': 0.822},
|
402 |
-
{'word': 'near', 'start': 1.693, 'end': 1.933, 'score': 0.752}, {'word': 'to', 'start': 1.994, 'end': 2.074, 'score': 0.924},
|
403 |
-
{'word': 'them,', 'start': 2.134, 'end': 2.354, 'score': 0.914}, {'word': 'the', 'start': 2.535, 'end': 2.655, 'score': 0.818},
|
404 |
-
{'word': 'common', 'start': 2.695, 'end': 3.016, 'score': 0.971}, {'word': 'object,', 'start': 3.196, 'end': 3.577, 'score': 0.823},
|
405 |
-
{'word': 'which', 'start': 3.717, 'end': 3.898, 'score': 0.701}, {'word': 'the', 'start': 3.958, 'end': 4.058, 'score': 0.798},
|
406 |
-
{'word': 'sense', 'start': 4.098, 'end': 4.359, 'score': 0.797}, {'word': 'deceives,', 'start': 4.419, 'end': 4.92, 'score': 0.802},
|
407 |
-
{'word': 'lost', 'start': 5.101, 'end': 5.481, 'score': 0.71}, {'word': 'not', 'start': 5.682, 'end': 5.963, 'score': 0.781},
|
408 |
-
{'word': 'by', 'start': 6.043, 'end': 6.183, 'score': 0.834}, {'word': 'distance', 'start': 6.223, 'end': 6.644, 'score': 0.899},
|
409 |
-
{'word': 'any', 'start': 6.905, 'end': 7.065, 'score': 0.893}, {'word': 'of', 'start': 7.125, 'end': 7.185, 'score': 0.772},
|
410 |
-
{'word': 'its', 'start': 7.245, 'end': 7.346, 'score': 0.778}, {'word': 'marks.', 'start': 7.406, 'end': 7.727, 'score': 0.955}
|
411 |
-
]
|
412 |
|
413 |
|
414 |
def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
|
@@ -435,19 +422,19 @@ def get_app():
|
|
435 |
with gr.Column(scale=5):
|
436 |
with gr.Accordion("Select models", open=False) as models_selector:
|
437 |
with gr.Row():
|
438 |
-
voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="
|
439 |
-
choices=["
|
440 |
-
whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["
|
441 |
whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
|
442 |
choices=[None, "base.en", "small.en", "medium.en", "large"])
|
443 |
-
align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=[
|
444 |
|
445 |
with gr.Row():
|
446 |
with gr.Column(scale=2):
|
447 |
-
input_audio = gr.Audio(value=f"{DEMO_PATH}/
|
448 |
with gr.Group():
|
449 |
original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
|
450 |
-
info="Use
|
451 |
with gr.Accordion("Word start time", open=False):
|
452 |
transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
|
453 |
with gr.Accordion("Word end time", open=False):
|
@@ -472,16 +459,16 @@ def get_app():
|
|
472 |
info="What to do with first and last word", visible=False)
|
473 |
|
474 |
with gr.Group() as tts_mode_controls:
|
475 |
-
prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[
|
476 |
-
prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.
|
477 |
|
478 |
with gr.Group(visible=False) as edit_mode_controls:
|
479 |
with gr.Row():
|
480 |
-
edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[
|
481 |
edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
|
482 |
with gr.Row():
|
483 |
-
edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.
|
484 |
-
edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.
|
485 |
|
486 |
run_btn = gr.Button(value="Run")
|
487 |
|
@@ -498,7 +485,7 @@ def get_app():
|
|
498 |
|
499 |
with gr.Row():
|
500 |
with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
|
501 |
-
stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=
|
502 |
info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
|
503 |
sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
|
504 |
info="The higher the number, the faster the output will be. "
|
|
|
78 |
def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
|
79 |
global transcribe_model, align_model, voicecraft_model
|
80 |
|
81 |
+
if voicecraft_model_name == "330M":
|
82 |
+
voicecraft_model_name = "giga330M"
|
83 |
+
elif voicecraft_model_name == "830M":
|
84 |
+
voicecraft_model_name = "giga830M"
|
85 |
+
elif voicecraft_model_name == "330M_TTSEnhanced":
|
86 |
voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
|
87 |
+
elif voicecraft_model_name == "830M_TTSEnhanced":
|
88 |
+
voicecraft_model_name = "830M_TTSEnhanced"
|
89 |
if alignment_model_name is not None:
|
90 |
align_model = WhisperxAlignModel()
|
91 |
|
|
|
370 |
- In Edit mode write full prompt</br>
|
371 |
"""
|
372 |
|
373 |
+
demo_original_transcript = "And again in two thousand and eight when the United States Central Bank, the Federal Reserve, printed over two trillion dollars."
|
374 |
|
375 |
demo_text = {
|
376 |
"TTS": {
|
377 |
"smart": "I cannot believe that the same model can also do text to speech synthesis too!",
|
378 |
+
"regular": "And again in two thousand and eight when the united states central bank, I cannot believe that the same model can also do text to speech synthesis too!"
|
379 |
},
|
380 |
"Edit": {
|
381 |
+
"smart": "Central Bank of the United States, also called",
|
382 |
+
"regular": "And again in two thousand and eight when the Central Bank of the United States, also called the Federal Reserve, printed over two trillion dollars."
|
383 |
},
|
384 |
"Long TTS": {
|
385 |
"smart": "You can run the model on a big text!\n"
|
386 |
"Just write it line-by-line. Or sentence-by-sentence.\n"
|
387 |
"If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
|
388 |
+
"regular": "And again in two thousand and eight when the united states central bank, You can run the model on a big text!\n"
|
389 |
+
"And again in two thousand and eight when the united states central bank, Just write it line-by-line. Or sentence-by-sentence.\n"
|
390 |
+
"And again in two thousand and eight when the united states central bank, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
|
391 |
}
|
392 |
}
|
393 |
|
394 |
all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
|
395 |
|
396 |
+
demo_words = ['0.12 And 0.221', '0.261 again 0.561', '0.622 in 0.682', '0.742 two 0.922', '0.983 thousand 1.464', '1.504 and 1.584', '1.684 eight 1.865', '1.945 when 2.085', '2.125 the 2.206', '2.266 United 2.667', '2.707 States 2.968', '3.008 Central 3.349', '3.389 Bank, 3.649', '3.83 the 3.93', '4.01 Federal 4.451', '4.532 Reserve, 5.113', '5.314 printed 5.674', '5.835 over 6.035', '6.176 two 6.517', '6.637 trillion 7.098', '7.118 dollars. 7.479']
|
397 |
+
|
398 |
+
demo_words_info = [{'word': 'And', 'start': 0.12, 'end': 0.221, 'score': 0.792}, {'word': 'again', 'start': 0.261, 'end': 0.561, 'score': 0.795}, {'word': 'in', 'start': 0.622, 'end': 0.682, 'score': 0.75}, {'word': 'two', 'start': 0.742, 'end': 0.922, 'score': 0.755}, {'word': 'thousand', 'start': 0.983, 'end': 1.464, 'score': 0.82}, {'word': 'and', 'start': 1.504, 'end': 1.584, 'score': 0.715}, {'word': 'eight', 'start': 1.684, 'end': 1.865, 'score': 0.885}, {'word': 'when', 'start': 1.945, 'end': 2.085, 'score': 0.987}, {'word': 'the', 'start': 2.125, 'end': 2.206, 'score': 0.833}, {'word': 'United', 'start': 2.266, 'end': 2.667, 'score': 0.818}, {'word': 'States', 'start': 2.707, 'end': 2.968, 'score': 0.842}, {'word': 'Central', 'start': 3.008, 'end': 3.349, 'score': 0.852}, {'word': 'Bank,', 'start': 3.389, 'end': 3.649, 'score': 0.98}, {'word': 'the', 'start': 3.83, 'end': 3.93, 'score': 0.996}, {'word': 'Federal', 'start': 4.01, 'end': 4.451, 'score': 0.795}, {'word': 'Reserve,', 'start': 4.532, 'end': 5.113, 'score': 0.852}, {'word': 'printed', 'start': 5.314, 'end': 5.674, 'score': 0.785}, {'word': 'over', 'start': 5.835, 'end': 6.035, 'score': 0.84}, {'word': 'two', 'start': 6.176, 'end': 6.517, 'score': 0.757}, {'word': 'trillion', 'start': 6.637, 'end': 7.098, 'score': 0.796}, {'word': 'dollars.', 'start': 7.118, 'end': 7.479, 'score': 0.939}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
|
401 |
def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
|
|
|
422 |
with gr.Column(scale=5):
|
423 |
with gr.Accordion("Select models", open=False) as models_selector:
|
424 |
with gr.Row():
|
425 |
+
voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="830M_TTSEnhanced",
|
426 |
+
choices=["330M", "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
|
427 |
+
whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
|
428 |
whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
|
429 |
choices=[None, "base.en", "small.en", "medium.en", "large"])
|
430 |
+
align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
|
431 |
|
432 |
with gr.Row():
|
433 |
with gr.Column(scale=2):
|
434 |
+
input_audio = gr.Audio(value=f"{DEMO_PATH}/YOU1000000115_S0000252.wav", label="Input Audio", type="filepath", interactive=True)
|
435 |
with gr.Group():
|
436 |
original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
|
437 |
+
info="Use whisperx model to get the transcript. Fix and align it if necessary.")
|
438 |
with gr.Accordion("Word start time", open=False):
|
439 |
transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
|
440 |
with gr.Accordion("Word end time", open=False):
|
|
|
459 |
info="What to do with first and last word", visible=False)
|
460 |
|
461 |
with gr.Group() as tts_mode_controls:
|
462 |
+
prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
|
463 |
+
prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.479, step=0.001, value=3.700)
|
464 |
|
465 |
with gr.Group(visible=False) as edit_mode_controls:
|
466 |
with gr.Row():
|
467 |
+
edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[9], interactive=True)
|
468 |
edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
|
469 |
with gr.Row():
|
470 |
+
edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.479, step=0.001, value=2.266)
|
471 |
+
edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.479, step=0.001, value=3.649)
|
472 |
|
473 |
run_btn = gr.Button(value="Run")
|
474 |
|
|
|
485 |
|
486 |
with gr.Row():
|
487 |
with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
|
488 |
+
stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
|
489 |
info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
|
490 |
sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
|
491 |
info="The higher the number, the faster the output will be. "
|
demo/YOU1000000115_S0000252.wav
ADDED
Binary file (252 kB). View file
|
|