Spaces: Running on Zero
visualize states
app.py CHANGED
@@ -242,7 +242,9 @@ def get_video_duration(video_path):
 @spaces.GPU(duration=60)
 @torch.inference_mode()
 @torch.no_grad()
-def get_audio(video_path, caption, cot):
+def synthesize_video_with_audio(video_file, caption, cot):
+    audio_path = get_audio(video_file, caption, cot)
+    video_path = video_file
     if caption is None:
         caption = ''
     if cot is None:
@@ -254,7 +256,7 @@ def get_audio(video_path, caption, cot):
     preprocesser = VGGSound(duration_sec=duration_sec)
     data = preprocesser.sample(video_path, caption, cot)
 
-
+    yield "⏳ Extracting Features…", None
 
     preprocessed_data = {}
     metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
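The added `yield` statements (this one and the one in the next hunk) turn the handler into a Python generator, which is how Gradio streams interim status to the UI: each yielded value updates the outputs before the final result arrives. A minimal sketch of that pattern, with hypothetical component and function names rather than the ones in app.py:

```python
import time
import gradio as gr

def slow_task(text):
    # Each yield is sent to the outputs immediately, so the UI can show
    # progress while the long-running work (here just a sleep) happens.
    yield "⏳ working…", None
    time.sleep(2)
    yield "✅ done!", text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="input")
    status = gr.Markdown()              # receives the status strings
    out = gr.Textbox(label="result")    # receives the final value
    gr.Button("run").click(fn=slow_task, inputs=inp, outputs=[status, out])

demo.launch()
```

Because each yield produces a (status, result) pair, the click handler needs two output components for both values to show up.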
@@ -288,6 +290,7 @@ def get_audio(video_path, caption, cot):
     conditioning['metaclip_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_clip_feat
     conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
 
+    yield "⏳ Inferring…", None
     cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
     noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
     with torch.amp.autocast(device):
@@ -305,14 +308,11 @@ def get_audio(video_path, caption, cot):
     fakes = training_wrapper.diffusion.pretransform.decode(fakes)
 
     audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
         torchaudio.save(tmp_audio.name, audios[0], 44100)
         audio_path = tmp_audio.name
 
-    return audio_path
-
-def synthesize_video_with_audio(video_file, caption, cot):
-    audio_path = get_audio(video_file, caption, cot)
     with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
         output_video_path = tmp_video.name
 
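The `audios = …` context line above packs peak normalization and 16-bit conversion into one chain. A standalone, commented breakdown of the same conversion, using a random tensor as a stand-in for the decoded model output:

```python
import torch
import torchaudio

fakes = torch.randn(1, 2, 44100 * 2)           # stand-in for the decoded audio batch

audio = fakes.to(torch.float32)                 # work in float32
audio = audio / torch.max(torch.abs(audio))     # peak-normalize the batch to [-1, 1]
audio = audio.clamp(-1, 1)                      # guard against any overshoot
audio = audio.mul(32767).to(torch.int16).cpu()  # scale to 16-bit PCM and move to CPU

torchaudio.save("out.wav", audio[0], 44100)     # save the first clip at 44.1 kHz
```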
@@ -323,29 +323,40 @@ def synthesize_video_with_audio(video_file, caption, cot):
     ]
     subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
-    return output_video_path
-
-    [… deleted lines 328–346 not captured in this view …]
-    )
-
-
-
+    # return output_video_path
+    yield "✅ Generation completed!", output_video_path
+
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # ThinkSound\n
+        ThinkSound is a unified Any2Audio generation framework with flow matching guided by Chain-of-Thought (CoT) reasoning.
+
+        Upload video and caption (optional), and get video with audio!
+
+        """
+    )
+    with gr.Row():
+        video_input = gr.Video(label="upload video")
+        with gr.Column():
+            caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)
+            cot_input = gr.Textbox(label="CoT Description (optional)", placeholder="can be empty", lines=6)
+    output_video = gr.Video(label="output video")
+    btn = gr.Button("start synthesize")
+    btn.click(fn=synthesize_video_with_audio, inputs=[video_input, caption_input, cot_input], outputs=output_video)
+
+    gr.Examples(
+        examples=[
+            ["./examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing.", "./examples/3.mp4"],
+            ["./examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction.", "./examples/2.mp4"],
+            ["./examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present.", "./examples/5.mp4"],
+            ["./examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere.", "./examples/4.mp4"]
+        ],
+        inputs=[video_input, caption_input, cot_input, output_video],
+    )
+
+demo.launch(share=True)
+
+
 
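The `cmd` list passed to `subprocess.run` in the hunk above is only partially visible (just its closing `]`); it muxes the generated WAV back onto the uploaded video. The exact flags app.py uses are not shown in this diff, but a typical ffmpeg mux call looks roughly like the following sketch (assumes ffmpeg is on PATH; the helper name is hypothetical):

```python
import subprocess

def mux_audio(video_path: str, audio_path: str, output_path: str) -> None:
    """Replace the video's audio track with the generated WAV, copying the video stream."""
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,              # input 0: the (muted) video
        "-i", audio_path,              # input 1: the generated audio
        "-map", "0:v", "-map", "1:a",  # take video from input 0, audio from input 1
        "-c:v", "copy",                # do not re-encode the video stream
        "-c:a", "aac",                 # encode audio for the mp4 container
        "-shortest",                   # stop at the shorter of the two streams
        output_path,
    ]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
```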