Expose AudioToken/Syllable ratio as param to allow manual speed and audio length control
app.py CHANGED
@@ -27,11 +27,50 @@ def run_asr(audio):
 
     return transcript.text, transcript.text, word_times
 
-def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk):
-    return inpainter.inpaint(InpaintInput(input_text=input_text, output_text=output_text, input_word_times=word_times, audio=audio, num_steps=num_steps, init_temp=init_temp, init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk))
+def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
+    if not use_manual_ratio:
+        audio_token_syllable_ratio = None
+    return inpainter.inpaint(InpaintInput(input_text=input_text, output_text=output_text, input_word_times=word_times, audio=audio, num_steps=num_steps,
+                                          init_temp=init_temp, init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk,
+                                          audio_token_syllable_ratio=audio_token_syllable_ratio))
+
+def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
+    if not use_manual_ratio:
+        audio_token_syllable_ratio = None
+    return inpainter.tts(TTSInput(output_text=input_text, voice=voice_audio, num_steps=num_steps, init_temp=init_temp,
+                                  init_diversity=init_diversity, guidance=guidance, rescale=rescale, topk=topk,
+                                  audio_token_syllable_ratio=audio_token_syllable_ratio))
+
+def toggle_ratio_input(use_manual):
+    return gr.update(visible=use_manual, interactive=use_manual)
+
+def create_advanced_options_accordion():
+    with gr.Accordion("Advanced options", open=False):
+        num_steps_slider = gr.Slider(1, 100, 30, step=1, label="number of sampling steps codebook")
+        init_temp_slider = gr.Slider(0.5, 10, 1, step=0.1, label="Initial temperature")
+        init_diversity_slider = gr.Slider(0, 10, 1, step=0.1, label="Initial diversity")
+        guidance_slider = gr.Slider(0, 10, 0.5, step=0.1, label="guidance")
+        rescale_slider = gr.Slider(0, 1, 0.7, step=0.1, label="guidance rescale factor")
+        topk_slider = gr.Slider(1, 10000, 25, step=1, label="sampling from top-k logits")
+
+        gr.Markdown("#### Audio Token Syllable Ratio")
+        gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
+        use_manual_ratio = gr.Checkbox(label="Use manual audio token syllable ratio", value=False)
+        audio_token_syllable_ratio = gr.Number(
+            label="Audio token syllable ratio (manual)",
+            value=12.5, precision=2, minimum=5.0, maximum=25.0,
+            visible=False, interactive=False
+        )
+        use_manual_ratio.change(
+            toggle_ratio_input,
+            inputs=[use_manual_ratio],
+            outputs=[audio_token_syllable_ratio]
+        )
+
+    return (num_steps_slider, init_temp_slider, init_diversity_slider,
+            guidance_slider, rescale_slider, topk_slider,
+            use_manual_ratio, audio_token_syllable_ratio)
 
-def run_inpainter_tts(input_text, voice_audio):
-    return inpainter.tts(TTSInput(output_text=input_text, voice=voice_audio))
 
 if __name__ == '__main__':
     with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
@@ -43,13 +82,7 @@ if __name__ == '__main__':
             gr.Markdown("### Run the inpainter to generate the modified audio.")
             gr.Markdown("### Note: The model and demo are currently targeted for English.")
 
-
-            num_steps_slider = gr.Slider(minimum=1, maximum=100, step=1, label="number of sampling steps codebook", value=30)
-            init_temp_slider = gr.Slider(minimum=0.5, maximum=10, step=0.1, label="Initial temperature", value=1)
-            init_diversity_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="Initial diversity", value=1)
-            guidance_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="guidance", value=0.5)
-            rescale_slider = gr.Slider(minimum=0, maximum=1, step=0.1, label="guidance rescale factor", value=0.7)
-            topk_slider = gr.Slider(minimum=1, maximum=10000, step=1, label="sampling from top-k logits", value=25)
+            inpaint_advanced_options = create_advanced_options_accordion()
 
             with gr.Row():
                 audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
@@ -71,10 +104,15 @@ if __name__ == '__main__':
                 audio_output = gr.Audio(label="Output audio")
 
             asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
-            inpainter_submit.click(run_inpainter, inputs=[text_input, text_output, word_times, audio_input, num_steps_slider, init_temp_slider, init_diversity_slider, guidance_slider, rescale_slider, topk_slider], outputs=[audio_output])
+            inpainter_submit.click(
+                run_inpainter,
+                inputs=[text_input, text_output, word_times, audio_input] + list(inpaint_advanced_options),
+                outputs=[audio_output])
 
         with gr.Tab("Text to Speech"):
             gr.Markdown("### Text to Speech")
+            tts_advanced_options = create_advanced_options_accordion()
+
            tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
            tts_voice = gr.Audio(label="Voice to use for TTS",
                                 sources=["upload", "microphone"], type="filepath",
@@ -84,7 +122,7 @@ if __name__ == '__main__':
 
            tts_submit.click(
                run_inpainter_tts,
-                inputs=[tts_text, tts_voice],
+                inputs=[tts_text, tts_voice] + list(tts_advanced_options),
                outputs=[tts_output]
            )
 
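Usage note: the following is a minimal sketch, not part of the commit, of how the updated handlers could be driven outside the Gradio UI using only the functions defined in app.py above. The audio path and edited transcript are placeholder values, and the numeric arguments are the defaults exposed by create_advanced_options_accordion(). run_inpainter_tts accepts the same trailing parameters, so a manual ratio can be applied to the TTS path in the same way.

# Sketch only: programmatic use of the new manual-ratio parameters (placeholder inputs).
transcript, _, word_times = run_asr("speech.wav")   # run_asr returns the transcript twice plus word timings

audio_out = run_inpainter(
    transcript,                # input_text: transcript of the original audio
    "the edited transcript",   # output_text: what the modified audio should say
    word_times,                # word timings returned by run_asr
    "speech.wav",              # audio file to modify
    30, 1, 1, 0.5, 0.7, 25,    # num_steps, init_temp, init_diversity, guidance, rescale, topk (slider defaults)
    True,                      # use_manual_ratio: bypass the automatic calculation
    12.5,                      # audio_token_syllable_ratio: UI default, limited to 5.0-25.0 in the demo
)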