Spaces: Running on L40S
Set examples
app.py CHANGED
@@ -114,19 +114,31 @@ print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")



-def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: str) -> str:
+def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path_spk1: str, cond_audio_path_spk2: str) -> str:
     """
     Create a temporary JSON file with the user-provided prompt, image, and audio paths.
     Returns the path to the temporary JSON file.
     """
     # Structure based on your original JSON format
-    data = {
-        "prompt": prompt,
-        "cond_image": cond_image_path,
-        "cond_audio": {
-            "person1": cond_audio_path
+    if cond_audio_path_spk2 is None:
+        data = {
+            "prompt": prompt,
+            "cond_image": cond_image_path,
+            "cond_audio": {
+                "person1": cond_audio_path_spk1
+            }
+        }
+
+    else:
+        data = {
+            "prompt": prompt,
+            "cond_image": cond_image_path,
+            "audio_type": "para",
+            "cond_audio": {
+                "person1": cond_audio_path_spk1,
+                "person2": cond_audio_path_spk2
+            }
         }
-    }

     # Create a temp file
     temp_json = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w', encoding='utf-8')
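Note: for reference, the dual-speaker payload this function now builds is sketched below. The keys ("prompt", "cond_image", "audio_type", "cond_audio") come straight from the hunk above, but the actual json.dump call falls outside the visible diff, so the write step here is an assumption.

import json
import tempfile

# Dual-speaker payload, as built in the else branch above; "audio_type": "para"
# appears to mark the two audio tracks as parallel (simultaneous) speakers.
data = {
    "prompt": "A man and a woman sing together in a studio.",
    "cond_image": "examples/multi/3/multi3.png",
    "audio_type": "para",
    "cond_audio": {
        "person1": "examples/multi/3/1-man.WAV",
        "person2": "examples/multi/3/1-woman.WAV",
    },
}

# Assumed continuation of the hunk: serialize to the temp file, keep its path.
with tempfile.NamedTemporaryFile(delete=False, suffix=".json",
                                 mode="w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
    temp_json_path = f.name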
@@ -138,14 +150,19 @@ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: str) -> str:
     return temp_json_path


-def infer(prompt, cond_image_path, cond_audio_path, sample_steps):
+def infer(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2, sample_steps):

     if is_shared_ui:
-        trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
-        cond_audio_path = trimmed_audio_path
+
+        trimmed_audio_path_spk1 = trim_audio_to_5s_temp(cond_audio_path_spk1)
+        cond_audio_path_spk1 = trimmed_audio_path_spk1
+
+        if cond_audio_path_spk2 is not None:
+            trimmed_audio_path_spk2 = trim_audio_to_5s_temp(cond_audio_path_spk2)
+            cond_audio_path_spk2 = trimmed_audio_path_spk2

     # Prepare input JSON
-    input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path)
+    input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2)

     # Base args
     common_args = [
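Note: trim_audio_to_5s_temp is called on the shared UI to cap clip length, but its definition lies outside the visible hunks. A minimal sketch of what such a helper could look like, assuming the soundfile package; the Space may implement it differently:

import tempfile
import soundfile as sf

def trim_audio_to_5s_temp(audio_path: str) -> str:
    """Write the first 5 seconds of audio_path to a temp .wav and return its path."""
    audio, sr = sf.read(audio_path)   # audio: (n_samples,) or (n_samples, n_channels)
    trimmed = audio[: 5 * sr]         # slicing past the end is safe for short clips
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp.name, trimmed, sr)
    return tmp.name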
@@ -229,13 +246,19 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
                 label="Conditioning Image"
             )

-            audio_input = gr.Audio(
+            audio_input_spk1 = gr.Audio(
+                type="filepath",
+                label="Conditioning Audio for speaker 1 (.wav)"
+            )
+
+            audio_input_spk2 = gr.Audio(
                 type="filepath",
-                label="Conditioning Audio (.wav)"
+                label="Conditioning Audio for speaker 2 (.wav)"
             )

             with gr.Accordion("Advanced settings", open=False):
                 sample_steps = gr.Slider(
+                    label="sample steps",
                     value=6,
                     minimum=2,
                     maximum=25,
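Note: the None checks in infer and create_temp_input_json work because gr.Audio(type="filepath") passes the handler a filesystem path as a str, or None when the field is left empty. A minimal illustration with a hypothetical handler, not code from the Space:

import gradio as gr

def describe(audio_path_spk1, audio_path_spk2):
    # Each value is a str path to the uploaded file, or None if empty.
    if audio_path_spk2 is None:
        return f"single speaker: {audio_path_spk1}"
    return f"two speakers: {audio_path_spk1} + {audio_path_spk2}"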
@@ -247,9 +270,14 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

             gr.Examples(
                 examples=[
-                    ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav"]
+                    ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav", None, 6],
+                    ["In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "examples/multi/3/multi3.png", "examples/multi/3/1-man.WAV", "examples/multi/3/1-woman.WAV", 6],
                 ],
-                inputs=[prompt_input, image_input, audio_input],
+                fn=infer,
+                inputs=[prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
+                outputs=output_video,
+                cache_examples=True,
+                cache_mode="lazy"
             )

         with gr.Column(scale=3):
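Note: cache_examples=True together with cache_mode="lazy" (available in recent Gradio releases) defers running infer on an example until a user first clicks it, instead of precomputing every output at startup; this mode needs fn and outputs set on gr.Examples. The None in the single-speaker row fills the audio_input_spk2 slot. A self-contained toy example of the same pattern:

import gradio as gr

def shout(text):
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    gr.Examples(
        examples=[["hello"]],
        fn=shout,
        inputs=[inp],
        outputs=[out],
        cache_examples=True,  # reuse computed outputs instead of re-running fn
        cache_mode="lazy",    # compute on first click rather than at startup
    )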
@@ -257,8 +285,8 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

     submit_btn.click(
         fn=infer,
-        inputs=[prompt_input, image_input, audio_input, sample_steps],
+        inputs=[prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
         outputs=output_video
     )

-demo.launch()
+demo.launch(ssr_mode=False, show_error=True, show_api=False)
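Note: the new launch flags are standard Blocks.launch() parameters: ssr_mode=False turns off Gradio 5's server-side rendering (a common workaround on Spaces), show_error=True surfaces handler exceptions in the UI, and show_api=False hides the "Use via API" docs link. Annotated:

demo.launch(
    ssr_mode=False,   # disable server-side rendering (Gradio 5+)
    show_error=True,  # show Python exceptions as error modals in the browser
    show_api=False,   # hide the auto-generated API docs link in the footer
)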