liuhuadai committed on
Commit
eac65ef
·
verified ·
1 Parent(s): 22a2689

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -32
app.py CHANGED
@@ -243,7 +243,6 @@ def get_video_duration(video_path):
243
  @torch.inference_mode()
244
  @torch.no_grad()
245
  def synthesize_video_with_audio(video_file, caption, cot):
246
- audio_path = get_audio(video_file, caption, cot)
247
  video_path = video_file
248
  if caption is None:
249
  caption = ''
@@ -291,6 +290,7 @@ def synthesize_video_with_audio(video_file, caption, cot):
291
  conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
292
 
293
  yield "⏳ Inferring…", None
 
294
  cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
295
  noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
296
  with torch.amp.autocast(device):
@@ -308,7 +308,6 @@ def synthesize_video_with_audio(video_file, caption, cot):
308
  fakes = training_wrapper.diffusion.pretransform.decode(fakes)
309
 
310
  audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
311
-
312
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
313
  torchaudio.save(tmp_audio.name, audios[0], 44100)
314
  audio_path = tmp_audio.name
@@ -326,36 +325,32 @@ def synthesize_video_with_audio(video_file, caption, cot):
326
  # return output_video_path
327
  yield "✅ Generation completed!", output_video_path
328
 
329
- # Gradio界面
330
- with gr.Blocks() as demo:
331
- gr.Markdown(
332
- """
333
- # ThinkSound\n
334
- ThinkSound is a unified Any2Audio generation framework with flow matching guided by Chain-of-Thought (CoT) reasoning.
335
-
336
- Upload video and caption (optional), and get video with audio!
337
-
338
- """
339
- )
340
- with gr.Row():
341
- video_input = gr.Video(label="upload video")
342
- with gr.Column():
343
- caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)
344
- cot_input = gr.Textbox(label="CoT Description (optional)", placeholder="can be empty", lines=6)
345
- output_video = gr.Video(label="output video")
346
- btn = gr.Button("start synthesize")
347
- btn.click(fn=synthesize_video_with_audio, inputs=[video_input, caption_input, cot_input], outputs=output_video)
348
-
349
- gr.Examples(
350
- examples=[
351
- ["./examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing.","./examples/3.mp4"],
352
- ["./examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction.", "./examples/2.mp4"],
353
- ["./examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present.","./examples/5.mp4"],
354
- ["./examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere.","./examples/4.mp4"]
355
- ],
356
- inputs=[video_input, caption_input, cot_input, output_video],
357
- )
358
-
359
  demo.launch(share=True)
360
 
361
 
 
243
  @torch.inference_mode()
244
  @torch.no_grad()
245
  def synthesize_video_with_audio(video_file, caption, cot):
 
246
  video_path = video_file
247
  if caption is None:
248
  caption = ''
 
290
  conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat
291
 
292
  yield "⏳ Inferring…", None
293
+
294
  cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
295
  noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
296
  with torch.amp.autocast(device):
 
308
  fakes = training_wrapper.diffusion.pretransform.decode(fakes)
309
 
310
  audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
 
311
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
312
  torchaudio.save(tmp_audio.name, audios[0], 44100)
313
  audio_path = tmp_audio.name
 
325
  # return output_video_path
326
  yield "✅ Generation completed!", output_video_path
327
 
328
+ demo = gr.Interface(
329
+ fn=synthesize_video_with_audio,
330
+ inputs=[
331
+ gr.Video(label="Upload Video"),
332
+ gr.Textbox(label="Caption (optional)", placeholder="can be empty",),
333
+ gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty",),
334
+ ],
335
+ outputs=[
336
+ gr.Text(label="Status"),
337
+ gr.Video(label="Result"),
338
+ ],
339
+ title="ThinkSound Demo",
340
+ description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
341
+ examples=[
342
+ ["examples/1_mute.mp4", "Playing Trumpet","Generate a continuous trumpet sound with melodic variations, mimicking the sound of a person playing the trumpet ideally in a musical setting, ensuring clarity and realistic tone. Avoid extraneous noise or background sounds to reflect the focus on trumpet playing. The audio should resemble a skilled player producing expressive, melodious trumpet notes. Pay attention to pitch changes caused by hand movements."],
343
+ ["examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."],
344
+ ["examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing."],
345
+ ["examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere."],
346
+ ["examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."]
347
+ ],
348
+ cache_examples=True
349
+ )
350
+
351
+ if __name__ == "__main__":
352
+ demo.queue().launch(share=True)
353
+
 
 
 
 
354
  demo.launch(share=True)
355
 
356