Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -243,7 +243,6 @@ def get_video_duration(video_path):
|
|
243 |
@torch.inference_mode()
|
244 |
@torch.no_grad()
|
245 |
def synthesize_video_with_audio(video_file, caption, cot):
|
246 |
-
audio_path = get_audio(video_file, caption, cot)
|
247 |
video_path = video_file
|
248 |
if caption is None:
|
249 |
caption = ''
|
@@ -291,6 +290,7 @@ def synthesize_video_with_audio(video_file, caption, cot):
|
|
291 |
conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
|
292 |
|
293 |
yield "⏳ Inferring…", None
|
|
|
294 |
cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
|
295 |
noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
|
296 |
with torch.amp.autocast(device):
|
@@ -308,7 +308,6 @@ def synthesize_video_with_audio(video_file, caption, cot):
|
|
308 |
fakes = training_wrapper.diffusion.pretransform.decode(fakes)
|
309 |
|
310 |
audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
|
311 |
-
|
312 |
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
|
313 |
torchaudio.save(tmp_audio.name, audios[0], 44100)
|
314 |
audio_path = tmp_audio.name
|
@@ -326,36 +325,32 @@ def synthesize_video_with_audio(video_file, caption, cot):
|
|
326 |
# return output_video_path
|
327 |
yield "✅ Generation completed!", output_video_path
|
328 |
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
""
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
""
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
],
|
356 |
-
inputs=[video_input, caption_input, cot_input, output_video],
|
357 |
-
)
|
358 |
-
|
359 |
demo.launch(share=True)
|
360 |
|
361 |
|
|
|
243 |
@torch.inference_mode()
|
244 |
@torch.no_grad()
|
245 |
def synthesize_video_with_audio(video_file, caption, cot):
|
|
|
246 |
video_path = video_file
|
247 |
if caption is None:
|
248 |
caption = ''
|
|
|
290 |
conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
|
291 |
|
292 |
yield "⏳ Inferring…", None
|
293 |
+
|
294 |
cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
|
295 |
noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
|
296 |
with torch.amp.autocast(device):
|
|
|
308 |
fakes = training_wrapper.diffusion.pretransform.decode(fakes)
|
309 |
|
310 |
audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
|
|
|
311 |
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
|
312 |
torchaudio.save(tmp_audio.name, audios[0], 44100)
|
313 |
audio_path = tmp_audio.name
|
|
|
325 |
# return output_video_path
|
326 |
yield "✅ Generation completed!", output_video_path
|
327 |
|
328 |
+
# Gradio front end for synthesize_video_with_audio. The handler receives the
# three inputs below (video, caption, CoT text) and yields (status, video)
# pairs, which fill the "Status" and "Result" outputs in that order.
demo = gr.Interface(
    title="ThinkSound Demo",
    description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
    fn=synthesize_video_with_audio,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Textbox(label="Caption (optional)", placeholder="can be empty"),
        gr.Textbox(
            label="CoT Description (optional)",
            lines=6,
            placeholder="can be empty",
        ),
    ],
    outputs=[
        gr.Text(label="Status"),
        gr.Video(label="Result"),
    ],
    # Each example row is [video path, caption, CoT description], matching
    # the order of `inputs`.
    examples=[
        [
            "examples/1_mute.mp4",
            "Playing Trumpet",
            "Generate a continuous trumpet sound with melodic variations, mimicking the sound of a person playing the trumpet ideally in a musical setting, ensuring clarity and realistic tone. Avoid extraneous noise or background sounds to reflect the focus on trumpet playing. The audio should resemble a skilled player producing expressive, melodious trumpet notes. Pay attention to pitch changes caused by hand movements.",
        ],
        [
            "examples/2_mute.mp4",
            "Printer Printing",
            "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction.",
        ],
        [
            "examples/3_mute.mp4",
            "Gentle Sucking Sounds From the Pacifier",
            "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing.",
        ],
        [
            "examples/4_mute.mp4",
            "Plastic Debris Handling",
            "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere.",
        ],
        [
            "examples/5_mute.mp4",
            "Lighting Firecrackers",
            "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present.",
        ],
    ],
    cache_examples=True,
)
|
350 |
+
|
351 |
+
if __name__ == "__main__":
    # Start the server exactly once, and only when this file is executed as a
    # script. The queue is enabled explicitly because the handler is a
    # generator that streams status updates before the final video.
    #
    # A second, unguarded `demo.launch(share=True)` used to follow this block.
    # It launched the app on import and, when run as a script, attempted a
    # second launch after this call returned. It has been removed.
    demo.queue().launch(share=True)
|
355 |
|
356 |
|