Commit 22a2689 · verified · Parent(s): 8e3f699
liuhuadai committed

visualize states

Files changed (1): app.py (+42 −31)

app.py CHANGED
@@ -242,7 +242,9 @@ def get_video_duration(video_path):
 @spaces.GPU(duration=60)
 @torch.inference_mode()
 @torch.no_grad()
-def get_audio(video_path, caption, cot):
+def synthesize_video_with_audio(video_file, caption, cot):
+    audio_path = get_audio(video_file, caption, cot)
+    video_path = video_file
     if caption is None:
         caption = ''
     if cot is None:
@@ -254,7 +256,7 @@ def get_audio(video_path, caption, cot):
     preprocesser = VGGSound(duration_sec=duration_sec)
     data = preprocesser.sample(video_path, caption, cot)
 
-
+    yield "⏳ Extracting Features…", None
 
     preprocessed_data = {}
     metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
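
The `yield` introduced here (and in the hunks below) turns the handler into a Python generator. Gradio treats generator event handlers as streaming functions: each yielded value is pushed to the output components as soon as it is produced, which is what surfaces the intermediate states in the UI. A minimal, self-contained sketch of the pattern, with hypothetical component names (note that a 2-tuple yield assumes two wired outputs, e.g. a status textbox plus the result video):

```python
import time
import gradio as gr

def fake_synthesize(video):
    # Stand-ins for the real stages; each yield streams an update to the UI.
    yield "⏳ Extracting Features…", None
    time.sleep(1)  # pretend feature extraction
    yield "⏳ Inferring…", None
    time.sleep(1)  # pretend diffusion sampling
    yield "✅ Generation completed!", video

with gr.Blocks() as demo:
    video_in = gr.Video(label="input video")
    status = gr.Textbox(label="status")  # hypothetical status component
    video_out = gr.Video(label="output video")
    btn = gr.Button("run")
    btn.click(fake_synthesize, inputs=video_in, outputs=[status, video_out])

demo.launch()
```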
@@ -288,6 +290,7 @@ def get_audio(video_path, caption, cot):
     conditioning['metaclip_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_clip_feat
     conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
 
+    yield "⏳ Inferring…", None
     cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
     noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
     with torch.amp.autocast(device):
@@ -305,14 +308,11 @@ def get_audio(video_path, caption, cot):
     fakes = training_wrapper.diffusion.pretransform.decode(fakes)
 
     audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
         torchaudio.save(tmp_audio.name, audios[0], 44100)
         audio_path = tmp_audio.name
 
-    return audio_path
-
-def synthesize_video_with_audio(video_file, caption, cot):
-    audio_path = get_audio(video_file, caption, cot)
     with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
         output_video_path = tmp_video.name
 
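The dense `audios = …` context line above peak-normalizes the decoded waveform and converts it to 16-bit PCM before the WAV is written. An equivalent step-by-step helper (an illustrative unpacking, not the app's actual code):

```python
import torch
import torchaudio

def save_int16_wav(fakes: torch.Tensor, path: str, sample_rate: int = 44100) -> None:
    audio = fakes.to(torch.float32)
    audio = audio.div(torch.max(torch.abs(audio)))  # peak-normalize (assumes a non-silent signal)
    audio = audio.clamp(-1, 1)                      # guard against overshoot
    audio = audio.mul(32767).to(torch.int16).cpu()  # scale to full-scale 16-bit PCM
    torchaudio.save(path, audio[0], sample_rate)    # save the first item of the batch
```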
 
@@ -323,29 +323,40 @@ def synthesize_video_with_audio(video_file, caption, cot):
     ]
     subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
-    return output_video_path
-
-demo = gr.Interface(
-    fn=synthesize_video_with_audio,
-    inputs=[
-        gr.Video(label="Upload Video"),
-        gr.Textbox(label="Caption (optional)", placeholder="can be empty",),
-        gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty",),
-    ],
-    outputs=[
-        gr.Video(label="Result"),
-    ],
-    title="ThinkSound Demo",
-    description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
-    examples=[
-        ["examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing."],
-        ["examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."],
-        ["examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere."],
-        ["examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."]
-    ],
-    cache_examples=True
-)
-
-if __name__ == "__main__":
-    demo.launch(share=True)
+    # return output_video_path
+    yield "✅ Generation completed!", output_video_path
+
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # ThinkSound\n
+        ThinkSound is a unified Any2Audio generation framework with flow matching guided by Chain-of-Thought (CoT) reasoning.
+
+        Upload video and caption (optional), and get video with audio!
+
+        """
+    )
+    with gr.Row():
+        video_input = gr.Video(label="upload video")
+        with gr.Column():
+            caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)
+            cot_input = gr.Textbox(label="CoT Description (optional)", placeholder="can be empty", lines=6)
+    output_video = gr.Video(label="output video")
+    btn = gr.Button("start synthesize")
+    btn.click(fn=synthesize_video_with_audio, inputs=[video_input, caption_input, cot_input], outputs=output_video)
+
+    gr.Examples(
+        examples=[
+            ["./examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing.", "./examples/3.mp4"],
            ["./examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction.", "./examples/2.mp4"],
            ["./examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present.", "./examples/5.mp4"],
            ["./examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere.", "./examples/4.mp4"]
+        ],
+        inputs=[video_input, caption_input, cot_input, output_video],
+    )
+
+demo.launch(share=True)
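
The contents of `cmd` fall outside the hunk context, so the exact ffmpeg invocation is not visible in this diff. For orientation, muxing a generated WAV onto a silent MP4 is typically built along these lines; every flag and name below is an assumption, not the app's actual command:

```python
import subprocess

def mux_audio_into_video(video_path: str, audio_path: str, output_video_path: str) -> None:
    # Hypothetical reconstruction of a typical mux step.
    cmd = [
        "ffmpeg", "-y",        # overwrite the pre-created temp output file
        "-i", video_path,      # silent input video
        "-i", audio_path,      # generated 44.1 kHz WAV
        "-c:v", "copy",        # keep the video stream untouched
        "-c:a", "aac",         # re-encode audio into an MP4-friendly codec
        "-shortest",           # stop at the end of the shorter stream
        output_video_path,
    ]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
```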
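One more detail of the new UI: each example row gains a fourth entry (a pre-rendered result such as `./examples/3.mp4`), `output_video` is listed among the `gr.Examples` inputs, and the old `cache_examples=True` is dropped. Since no `fn` is attached, clicking an example simply populates all four components, output video included, without running the model. A minimal sketch of that mapping (file paths are placeholders):

```python
import gradio as gr

with gr.Blocks() as demo:
    video_input = gr.Video(label="upload video")
    caption_input = gr.Textbox(label="caption")
    cot_input = gr.Textbox(label="CoT")
    output_video = gr.Video(label="output video")
    gr.Examples(
        # Each row fills every component listed in `inputs`, including the output.
        examples=[["in_mute.mp4", "a caption", "a CoT description", "in_with_audio.mp4"]],
        inputs=[video_input, caption_input, cot_input, output_video],
    )

demo.launch()
```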