yavorr committed on
Commit
6b64247
·
1 Parent(s): 9397c10

Expose AudioToken/Syllable ratio as param to allow manual speed and audio length control

Browse files
Files changed (1) hide show
  1. app.py +51 -13
app.py CHANGED
@@ -27,11 +27,50 @@ def run_asr(audio):
27
 
28
  return transcript.text, transcript.text, word_times
29
 
30
- def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk):
31
- return inpainter.inpaint(InpaintInput(input_text=input_text, output_text=output_text, input_word_times=word_times, audio=audio, num_steps=num_steps, init_temp=init_temp, init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- def run_inpainter_tts(input_text, voice_audio):
34
- return inpainter.tts(TTSInput(output_text=input_text, voice=voice_audio))
35
 
36
  if __name__ == '__main__':
37
  with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
@@ -43,13 +82,7 @@ if __name__ == '__main__':
43
  gr.Markdown("### Run the inpainter to generate the modified audio.")
44
  gr.Markdown("### Note: The model and demo are currently targeted for English.")
45
 
46
- with gr.Accordion("Advanced options", open=False):
47
- num_steps_slider = gr.Slider(minimum=1, maximum=100, step=1, label="number of sampling steps codebook", value=30)
48
- init_temp_slider = gr.Slider(minimum=0.5, maximum=10, step=0.1, label="Initial temperature", value=1)
49
- init_diversity_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="Initial diversity", value=1)
50
- guidance_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="guidance", value=0.5)
51
- rescale_slider = gr.Slider(minimum=0, maximum=1, step=0.1, label="guidance rescale factor", value=0.7)
52
- topk_slider = gr.Slider(minimum=1, maximum=10000, step=1, label="sampling from top-k logits", value=25)
53
 
54
  with gr.Row():
55
  audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
@@ -71,10 +104,15 @@ if __name__ == '__main__':
71
  audio_output = gr.Audio(label="Output audio")
72
 
73
  asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
74
- inpainter_submit.click(run_inpainter, inputs=[text_input, text_output, word_times, audio_input, num_steps_slider, init_temp_slider, init_diversity_slider, guidance_slider, rescale_slider, topk_slider], outputs=[audio_output])
 
 
 
75
 
76
  with gr.Tab("Text to Speech"):
77
  gr.Markdown("### Text to Speech")
 
 
78
  tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
79
  tts_voice = gr.Audio(label="Voice to use for TTS",
80
  sources=["upload", "microphone"], type="filepath",
@@ -84,7 +122,7 @@ if __name__ == '__main__':
84
 
85
  tts_submit.click(
86
  run_inpainter_tts,
87
- inputs=[tts_text, tts_voice],
88
  outputs=[tts_output]
89
  )
90
 
 
27
 
28
  return transcript.text, transcript.text, word_times
29
 
30
+ def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
31
+ if not use_manual_ratio:
32
+ audio_token_syllable_ratio = None
33
+ return inpainter.inpaint(InpaintInput(input_text=input_text, output_text=output_text, input_word_times=word_times, audio=audio, num_steps=num_steps,
34
+ init_temp=init_temp, init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk,
35
+ audio_token_syllable_ratio=audio_token_syllable_ratio))
36
+
37
+ def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
38
+ if not use_manual_ratio:
39
+ audio_token_syllable_ratio = None
40
+ return inpainter.tts(TTSInput(output_text=input_text, voice=voice_audio, num_steps=num_steps, init_temp=init_temp,
41
+ init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk,
42
+ audio_token_syllable_ratio=audio_token_syllable_ratio))
43
+
44
+ def toggle_ratio_input(use_manual):
45
+ return gr.update(visible=use_manual, interactive=use_manual)
46
+
47
+ def create_advanced_options_accordion():
48
+ with gr.Accordion("Advanced options", open=False):
49
+ num_steps_slider = gr.Slider(1, 100, 30, step=1, label="number of sampling steps codebook")
50
+ init_temp_slider = gr.Slider(0.5, 10, 1, step=0.1, label="Initial temperature")
51
+ init_diversity_slider = gr.Slider(0, 10, 1, step=0.1, label="Initial diversity")
52
+ guidance_slider = gr.Slider(0, 10, 0.5, step=0.1, label="guidance")
53
+ rescale_slider = gr.Slider(0, 1, 0.7, step=0.1, label="guidance rescale factor")
54
+ topk_slider = gr.Slider(1, 10000, 25, step=1, label="sampling from top-k logits")
55
+
56
+ gr.Markdown("#### Audio Token Syllable Ratio")
57
+ gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
58
+ use_manual_ratio = gr.Checkbox(label="Use manual audio token syllable ratio", value=False)
59
+ audio_token_syllable_ratio = gr.Number(
60
+ label="Audio token syllable ratio (manual)",
61
+ value=12.5, precision=2, minimum=5.0, maximum=25.0,
62
+ visible=False, interactive=False
63
+ )
64
+ use_manual_ratio.change(
65
+ toggle_ratio_input,
66
+ inputs=[use_manual_ratio],
67
+ outputs=[audio_token_syllable_ratio]
68
+ )
69
+
70
+ return (num_steps_slider, init_temp_slider, init_diversity_slider,
71
+ guidance_slider, rescale_slider, topk_slider,
72
+ use_manual_ratio, audio_token_syllable_ratio)
73
 
 
 
74
 
75
  if __name__ == '__main__':
76
  with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
 
82
  gr.Markdown("### Run the inpainter to generate the modified audio.")
83
  gr.Markdown("### Note: The model and demo are currently targeted for English.")
84
 
85
+ inpaint_advanced_options = create_advanced_options_accordion()
 
 
 
 
 
 
86
 
87
  with gr.Row():
88
  audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
 
104
  audio_output = gr.Audio(label="Output audio")
105
 
106
  asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
107
+ inpainter_submit.click(
108
+ run_inpainter,
109
+ inputs=[text_input, text_output, word_times, audio_input] + list(inpaint_advanced_options),
110
+ outputs=[audio_output])
111
 
112
  with gr.Tab("Text to Speech"):
113
  gr.Markdown("### Text to Speech")
114
+ tts_advanced_options = create_advanced_options_accordion()
115
+
116
  tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
117
  tts_voice = gr.Audio(label="Voice to use for TTS",
118
  sources=["upload", "microphone"], type="filepath",
 
122
 
123
  tts_submit.click(
124
  run_inpainter_tts,
125
+ inputs=[tts_text, tts_voice] + list(tts_advanced_options),
126
  outputs=[tts_output]
127
  )
128