myniu committed
Commit bfb52d0 · 1 Parent(s): bf1ebc4
Files changed (2)
  1. app.py +68 -76
  2. oldapp.py +838 -0
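
This commit moves model initialization and inference out of the Drag class and into module-level functions defined inside the gr.Blocks context, decorates init_models with @spaces.GPU(duration=200), and keeps the previous class-based app as oldapp.py. A minimal sketch of the ZeroGPU decoration pattern the new layout relies on (illustrative only; the model and duration below are placeholders, not the repository's code):

    import spaces
    import torch

    model = torch.nn.Linear(4, 4)   # placeholder model, loaded on CPU at import time

    @spaces.GPU(duration=60)        # assumed duration; a GPU is requested only for this call
    def generate(x):
        m = model.to('cuda')        # move to the GPU allocated for the decorated call
        return m(x.to('cuda')).cpu()
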
app.py CHANGED
@@ -89,6 +89,7 @@ def get_sparseflow_and_mask_forward(
89
  return s_flow, mask
90
 
91
 
 
92
  def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
93
 
94
  from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
@@ -214,24 +215,45 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
214
  return trajectory_maps, transparent_layer
215
 
216
 
217
- class Drag:
218
- @spaces.GPU(duration=200)
219
- def __init__(self, height, width):
220
 
221
- svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
222
- mofa_ckpt = "ckpts/controlnet"
223
 
224
- self.pipeline, self.cmp = init_models(
225
- svd_ckpt,
226
- mofa_ckpt,
227
- weight_dtype=torch.float16,
228
- device='cuda'
229
- )
230
 
231
- self.height = height
232
- self.width = width
233
 
234
- def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
235
 
236
  '''
237
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
@@ -244,7 +266,7 @@ class Drag:
244
  frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
245
  sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
246
  mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
247
- cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
248
 
249
  if brush_mask is not None:
250
  brush_mask = torch.from_numpy(brush_mask) / 255.
@@ -256,19 +278,19 @@ class Drag:
256
  return cmp_flow
257
 
258
 
259
- def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
260
 
261
  fb, fl, fc, _, _ = pixel_values_384.shape
262
 
263
- controlnet_flow = self.get_cmp_flow(
264
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
265
  sparse_optical_flow_384,
266
  mask_384, motion_brush_mask
267
  )
268
 
269
- if self.height != 384 or self.width != 384:
270
- scales = [self.height / 384, self.width / 384]
271
- controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
272
  controlnet_flow[:, :, 0] *= scales[1]
273
  controlnet_flow[:, :, 1] *= scales[0]
274
 
@@ -276,7 +298,7 @@ class Drag:
276
 
277
 
278
  @torch.no_grad()
279
- def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
280
  '''
281
  input_drag: [1, 13, 320, 576, 2]
282
  input_drag_384: [1, 13, 384, 384, 2]
@@ -308,29 +330,29 @@ class Drag:
308
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
309
 
310
  if in_mask_flag:
311
- flow_inmask = self.get_flow(
312
  input_first_frame_384,
313
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
314
  )
315
  else:
316
  fb, fl = mask_384_inmask.shape[:2]
317
- flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
318
 
319
  if out_mask_flag:
320
- flow_outmask = self.get_flow(
321
  input_first_frame_384,
322
  input_drag_384_outmask, mask_384_outmask
323
  )
324
  else:
325
  fb, fl = mask_384_outmask.shape[:2]
326
- flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
327
 
328
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
329
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
330
 
331
  controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
332
 
333
- val_output = self.pipeline(
334
  input_first_frame_pil,
335
  input_first_frame_pil,
336
  controlnet_flow,
@@ -369,16 +391,16 @@ class Drag:
369
 
370
  @spaces.GPU
371
  @torch.no_grad()
372
- def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
373
 
374
- original_width, original_height = self.width, self.height
375
 
376
  input_all_points = tracking_points.constructor_args['value']
377
 
378
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
379
  return np.uint8(np.ones((original_width, original_height, 3))*255)
380
 
381
- resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
382
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
383
 
384
  new_resized_all_points = []
@@ -456,22 +478,22 @@ class Drag:
456
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
457
 
458
  if in_mask_flag:
459
- flow_inmask = self.get_flow(
460
  input_first_frame_384,
461
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
462
  )
463
  else:
464
  fb, fl = mask_384_inmask.shape[:2]
465
- flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
466
 
467
  if out_mask_flag:
468
- flow_outmask = self.get_flow(
469
  input_first_frame_384,
470
  input_drag_384_outmask, mask_384_outmask
471
  )
472
  else:
473
  fb, fl = mask_384_outmask.shape[:2]
474
- flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
475
 
476
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
477
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -484,12 +506,12 @@ class Drag:
484
  return viz_esti_flows
485
 
486
  @spaces.GPU(duration=200)
487
- def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
488
 
489
- original_width, original_height = self.width, self.height
490
 
491
  input_all_points = tracking_points.constructor_args['value']
492
- resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
493
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
494
 
495
  new_resized_all_points = []
@@ -542,9 +564,9 @@ class Drag:
542
  id = base.split('_')[0]
543
 
544
  image_pil = image2pil(first_frame_path)
545
- image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
546
 
547
- visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
548
 
549
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
550
  visualized_drag = visualized_drag[0].convert('RGBA')
@@ -567,7 +589,7 @@ class Drag:
567
  first_frames = outputs['logits_imgs'][:, -1]
568
 
569
 
570
- outputs = self.forward_sample(
571
  input_drag_384_inmask.to('cuda'),
572
  input_drag_384_outmask.to('cuda'),
573
  first_frames.to('cuda'),
@@ -630,43 +652,13 @@ class Drag:
630
 
631
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
632
 
633
-
634
- with gr.Blocks() as demo:
635
- gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
636
-
637
- gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
638
-
639
- gr.Markdown(
640
- """
641
- During the inference, kindly follow these instructions:
642
- <br>
643
- 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
644
- 2. Proceed to draw trajectories: <br>
645
- 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
646
- 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
647
- 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
648
- 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
649
- 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
650
- 3. Click the "Run" button to animate the image according to the path. <br>
651
- """
652
- )
653
-
654
- target_size = 512
655
- DragNUWA_net = Drag(target_size, target_size)
656
- first_frame_path = gr.State()
657
- tracking_points = gr.State([])
658
- motion_brush_points = gr.State([])
659
- motion_brush_mask = gr.State()
660
- motion_brush_viz = gr.State()
661
- inference_batch_size = gr.State(1)
662
-
663
  def preprocess_image(image):
664
 
665
  image_pil = image2pil(image.name)
666
  raw_w, raw_h = image_pil.size
667
 
668
  max_edge = min(raw_w, raw_h)
669
- resize_ratio = target_size / max_edge
670
 
671
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
672
 
@@ -676,8 +668,8 @@ with gr.Blocks() as demo:
676
 
677
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
678
 
679
- DragNUWA_net.width = crop_w
680
- DragNUWA_net.height = crop_h
681
 
682
  id = str(time.time()).split('.')[0]
683
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
@@ -722,7 +714,7 @@ with gr.Blocks() as demo:
722
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
723
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
724
 
725
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
726
 
727
  return tracking_points, trajectory_map, viz_flow
728
 
@@ -742,7 +734,7 @@ with gr.Blocks() as demo:
742
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
743
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
744
 
745
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
746
 
747
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
748
 
@@ -778,7 +770,7 @@ with gr.Blocks() as demo:
778
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
779
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
780
 
781
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
782
 
783
  return tracking_points, trajectory_map, viz_flow
784
 
@@ -833,6 +825,6 @@ with gr.Blocks() as demo:
833
 
834
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
835
 
836
- run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
837
 
838
  demo.launch()
 
89
  return s_flow, mask
90
 
91
 
92
+ @spaces.GPU(duration=200)
93
  def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
94
 
95
  from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
 
215
  return trajectory_maps, transparent_layer
216
 
217
 
218
+ with gr.Blocks() as demo:
219
+ gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
 
220
 
221
+ gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
 
222
 
223
+ gr.Markdown(
224
+ """
225
+ During the inference, kindly follow these instructions:
226
+ <br>
227
+ 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
228
+ 2. Proceed to draw trajectories: <br>
229
+ 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
230
+ 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
231
+ 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
232
+ 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
233
+ 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
234
+ 3. Click the "Run" button to animate the image according to the path. <br>
235
+ """
236
+ )
237
+
238
+ height, width = 512, 512
239
+ svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
240
+ mofa_ckpt = "ckpts/controlnet"
241
+
242
+ pipeline, cmp = init_models(
243
+ svd_ckpt,
244
+ mofa_ckpt,
245
+ weight_dtype=torch.float16,
246
+ device='cuda'
247
+ )
248
 
249
+ first_frame_path = gr.State()
250
+ tracking_points = gr.State([])
251
+ motion_brush_points = gr.State([])
252
+ motion_brush_mask = gr.State()
253
+ motion_brush_viz = gr.State()
254
+ inference_batch_size = gr.State(1)
255
 
256
+ def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
257
 
258
  '''
259
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
 
266
  frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
267
  sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
268
  mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
269
+ cmp_flow = cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
270
 
271
  if brush_mask is not None:
272
  brush_mask = torch.from_numpy(brush_mask) / 255.
 
278
  return cmp_flow
279
 
280
 
281
+ def get_flow(pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
282
 
283
  fb, fl, fc, _, _ = pixel_values_384.shape
284
 
285
+ controlnet_flow = get_cmp_flow(
286
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
287
  sparse_optical_flow_384,
288
  mask_384, motion_brush_mask
289
  )
290
 
291
+ if height != 384 or width != 384:
292
+ scales = [height / 384, width / 384]
293
+ controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (height, width), mode='nearest').reshape(fb, fl, 2, height, width)
294
  controlnet_flow[:, :, 0] *= scales[1]
295
  controlnet_flow[:, :, 1] *= scales[0]
296
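
For reference, a standalone sketch of the rescaling step above (assumed shapes; not the committed code): spatially interpolating a flow field changes its resolution but not the magnitude of its displacement vectors, so the x component is multiplied by width/384 and the y component by height/384.

    import torch
    import torch.nn.functional as F

    def rescale_flow(flow, out_h, out_w):
        # flow: [b, t, 2, 384, 384]; channel 0 holds dx, channel 1 holds dy
        b, t, c, h, w = flow.shape
        flow = F.interpolate(flow.flatten(0, 1), (out_h, out_w), mode='nearest')
        flow = flow.reshape(b, t, c, out_h, out_w)
        flow[:, :, 0] *= out_w / w   # horizontal displacements scale with width
        flow[:, :, 1] *= out_h / h   # vertical displacements scale with height
        return flow
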
 
 
298
 
299
 
300
  @torch.no_grad()
301
+ def forward_sample(input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
302
  '''
303
  input_drag: [1, 13, 320, 576, 2]
304
  input_drag_384: [1, 13, 384, 384, 2]
 
330
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
331
 
332
  if in_mask_flag:
333
+ flow_inmask = get_flow(
334
  input_first_frame_384,
335
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
336
  )
337
  else:
338
  fb, fl = mask_384_inmask.shape[:2]
339
+ flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
340
 
341
  if out_mask_flag:
342
+ flow_outmask = get_flow(
343
  input_first_frame_384,
344
  input_drag_384_outmask, mask_384_outmask
345
  )
346
  else:
347
  fb, fl = mask_384_outmask.shape[:2]
348
+ flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
349
 
350
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
351
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
352
 
353
  controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
354
 
355
+ val_output = pipeline(
356
  input_first_frame_pil,
357
  input_first_frame_pil,
358
  controlnet_flow,
 
391
 
392
  @spaces.GPU
393
  @torch.no_grad()
394
+ def get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path):
395
 
396
+ original_width, original_height = width, height
397
 
398
  input_all_points = tracking_points.constructor_args['value']
399
 
400
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
401
  return np.uint8(np.ones((original_width, original_height, 3))*255)
402
 
403
+ resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
404
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
405
 
406
  new_resized_all_points = []
 
478
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
479
 
480
  if in_mask_flag:
481
+ flow_inmask = get_flow(
482
  input_first_frame_384,
483
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
484
  )
485
  else:
486
  fb, fl = mask_384_inmask.shape[:2]
487
+ flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
488
 
489
  if out_mask_flag:
490
+ flow_outmask = get_flow(
491
  input_first_frame_384,
492
  input_drag_384_outmask, mask_384_outmask
493
  )
494
  else:
495
  fb, fl = mask_384_outmask.shape[:2]
496
+ flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
497
 
498
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
499
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
 
506
  return viz_esti_flows
507
 
508
  @spaces.GPU(duration=200)
509
+ def run(first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
510
 
511
+ original_width, original_height = width, height
512
 
513
  input_all_points = tracking_points.constructor_args['value']
514
+ resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
515
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
516
 
517
  new_resized_all_points = []
 
564
  id = base.split('_')[0]
565
 
566
  image_pil = image2pil(first_frame_path)
567
+ image_pil = image_pil.resize((width, height), Image.BILINEAR).convert('RGB')
568
 
569
+ visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, width, height)
570
 
571
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
572
  visualized_drag = visualized_drag[0].convert('RGBA')
 
589
  first_frames = outputs['logits_imgs'][:, -1]
590
 
591
 
592
+ outputs = forward_sample(
593
  input_drag_384_inmask.to('cuda'),
594
  input_drag_384_outmask.to('cuda'),
595
  first_frames.to('cuda'),
 
652
 
653
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  def preprocess_image(image):
656
 
657
  image_pil = image2pil(image.name)
658
  raw_w, raw_h = image_pil.size
659
 
660
  max_edge = min(raw_w, raw_h)
661
+ resize_ratio = width / max_edge
662
 
663
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
664
 
 
668
 
669
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
670
 
671
+ width = crop_w
672
+ height = crop_h
673
 
674
  id = str(time.time()).split('.')[0]
675
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
 
714
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
715
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
716
 
717
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
718
 
719
  return tracking_points, trajectory_map, viz_flow
720
 
 
734
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
735
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
736
 
737
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
738
 
739
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
740
 
 
770
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
771
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
772
 
773
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
774
 
775
  return tracking_points, trajectory_map, viz_flow
776
 
 
825
 
826
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
827
 
828
+ run_button.click(run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
829
 
830
  demo.launch()
oldapp.py ADDED
@@ -0,0 +1,838 @@
1
+ import gradio as gr
2
+ import spaces
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+ from PIL import Image, ImageFilter
7
+ import uuid
8
+ from scipy.interpolate import interp1d, PchipInterpolator
9
+ import torchvision
10
+ # from utils import *
11
+ import time
12
+ from tqdm import tqdm
13
+ import imageio
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchvision
18
+ import torchvision.transforms as transforms
19
+ from einops import rearrange, repeat
20
+
21
+ from packaging import version
22
+
23
+ from accelerate.utils import set_seed
24
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
25
+
26
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler
27
+ from diffusers.utils import check_min_version
28
+ from diffusers.utils.import_utils import is_xformers_available
29
+
30
+ from utils.flow_viz import flow_to_image
31
+ from utils.utils import split_filename, image2arr, image2pil, ensure_dirname
32
+
33
+
34
+ output_dir_video = "./outputs/videos"
35
+ output_dir_frame = "./outputs/frames"
36
+
37
+
38
+ ensure_dirname(output_dir_video)
39
+ ensure_dirname(output_dir_frame)
40
+
41
+ # os.system('nvcc -V')
42
+
43
+
44
+ def divide_points_afterinterpolate(resized_all_points, motion_brush_mask):
45
+ k = resized_all_points.shape[0]
46
+ starts = resized_all_points[:, 0] # [K, 2]
47
+
48
+ in_masks = []
49
+ out_masks = []
50
+
51
+ for i in range(k):
52
+ x, y = int(starts[i][1]), int(starts[i][0])
53
+ if motion_brush_mask[x][y] == 255:
54
+ in_masks.append(resized_all_points[i])
55
+ else:
56
+ out_masks.append(resized_all_points[i])
57
+
58
+ in_masks = np.array(in_masks)
59
+ out_masks = np.array(out_masks)
60
+
61
+ return in_masks, out_masks
62
+
63
+
64
+ def get_sparseflow_and_mask_forward(
65
+ resized_all_points,
66
+ n_steps, H, W,
67
+ is_backward_flow=False
68
+ ):
69
+
70
+ K = resized_all_points.shape[0]
71
+
72
+ starts = resized_all_points[:, 0] # [K, 2]
73
+
74
+ interpolated_ends = resized_all_points[:, 1:]
75
+
76
+ s_flow = np.zeros((K, n_steps, H, W, 2))
77
+ mask = np.zeros((K, n_steps, H, W))
78
+
79
+ for k in range(K):
80
+ for i in range(n_steps):
81
+ start, end = starts[k], interpolated_ends[k][i]
82
+ flow = np.int64(end - start) * (-1 if is_backward_flow is True else 1)
83
+ s_flow[k][i][int(start[1]), int(start[0])] = flow
84
+ mask[k][i][int(start[1]), int(start[0])] = 1
85
+
86
+ s_flow = np.sum(s_flow, axis=0)
87
+ mask = np.sum(mask, axis=0)
88
+
89
+ return s_flow, mask
90
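
A hypothetical call to the function above, using one short trajectory on the 384x384 grid the demo works at: each time step gets a sparse flow map that is zero everywhere except at the trajectory's start pixel, where it stores the displacement to that step's target.

    import numpy as np

    traj = np.array([[[100, 120], [110, 125], [120, 130], [130, 135]]])  # [K=1, 1+3 points, 2] as (x, y)
    s_flow, mask = get_sparseflow_and_mask_forward(traj, n_steps=3, H=384, W=384)
    print(s_flow.shape, mask.shape)   # (3, 384, 384, 2) (3, 384, 384)
    print(s_flow[2, 120, 100])        # displacement from (100, 120) to the last target: [30. 15.]
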
+
91
+
92
+ def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
93
+
94
+ from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
95
+ from pipeline.pipeline import FlowControlNetPipeline
96
+ from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
97
+
98
+ print('start loading models...')
99
+ # Load scheduler, tokenizer and models.
100
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
101
+ pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
102
+ )
103
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
104
+ pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
105
+ unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
106
+ pretrained_model_name_or_path,
107
+ subfolder="unet",
108
+ low_cpu_mem_usage=True,
109
+ variant="fp16",
110
+ )
111
+
112
+ controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
113
+
114
+ cmp = CMP_demo(
115
+ './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
116
+ 42000
117
+ ).to(device)
118
+ cmp.requires_grad_(False)
119
+
120
+ # Freeze vae and image_encoder
121
+ vae.requires_grad_(False)
122
+ image_encoder.requires_grad_(False)
123
+ unet.requires_grad_(False)
124
+ controlnet.requires_grad_(False)
125
+
126
+ # Move image_encoder and vae to gpu and cast to weight_dtype
127
+ image_encoder.to(device, dtype=weight_dtype)
128
+ vae.to(device, dtype=weight_dtype)
129
+ unet.to(device, dtype=weight_dtype)
130
+ controlnet.to(device, dtype=weight_dtype)
131
+
132
+ if enable_xformers_memory_efficient_attention:
133
+ if is_xformers_available():
134
+ import xformers
135
+
136
+ xformers_version = version.parse(xformers.__version__)
137
+ if xformers_version == version.parse("0.0.16"):
138
+ print(
139
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
140
+ )
141
+ unet.enable_xformers_memory_efficient_attention()
142
+ else:
143
+ raise ValueError(
144
+ "xformers is not available. Make sure it is installed correctly")
145
+
146
+ if allow_tf32:
147
+ torch.backends.cuda.matmul.allow_tf32 = True
148
+
149
+ pipeline = FlowControlNetPipeline.from_pretrained(
150
+ pretrained_model_name_or_path,
151
+ unet=unet,
152
+ controlnet=controlnet,
153
+ image_encoder=image_encoder,
154
+ vae=vae,
155
+ torch_dtype=weight_dtype,
156
+ )
157
+ pipeline = pipeline.to(device)
158
+
159
+ print('models loaded.')
160
+
161
+ return pipeline, cmp
162
+
163
+
164
+ def interpolate_trajectory(points, n_points):
165
+ x = [point[0] for point in points]
166
+ y = [point[1] for point in points]
167
+
168
+ t = np.linspace(0, 1, len(points))
169
+
170
+ fx = PchipInterpolator(t, x)
171
+ fy = PchipInterpolator(t, y)
172
+
173
+ new_t = np.linspace(0, 1, n_points)
174
+
175
+ new_x = fx(new_t)
176
+ new_y = fy(new_t)
177
+ new_points = list(zip(new_x, new_y))
178
+
179
+ return new_points
180
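
A hypothetical usage of interpolate_trajectory above: PCHIP interpolation passes through every clicked point without overshoot, so a handful of clicks can be densified into the 25 samples the demo uses per trajectory.

    clicks = [(50, 60), (90, 80), (140, 75)]      # three user clicks, (x, y)
    dense = interpolate_trajectory(clicks, 25)    # 25 evenly spaced samples along the curve
    print(len(dense), dense[0], dense[-1])        # 25 points; endpoints match the first and last clicks
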
+
181
+
182
+ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
183
+ trajectory_maps = []
184
+
185
+ background_image = Image.open(background_image_path).convert('RGBA')
186
+ background_image = background_image.resize((width, height))
187
+ w, h = background_image.size
188
+ transparent_background = np.array(background_image)
189
+ transparent_background[:, :, -1] = 128
190
+ transparent_background = Image.fromarray(transparent_background)
191
+
192
+ # Create a transparent layer with the same size as the background image
193
+ transparent_layer = np.zeros((h, w, 4))
194
+ for splited_track in splited_tracks:
195
+ if len(splited_track) > 1:
196
+ splited_track = interpolate_trajectory(splited_track, 16)
197
+ splited_track = splited_track[:16]
198
+ for i in range(len(splited_track)-1):
199
+ start_point = (int(splited_track[i][0]), int(splited_track[i][1]))
200
+ end_point = (int(splited_track[i+1][0]), int(splited_track[i+1][1]))
201
+ vx = end_point[0] - start_point[0]
202
+ vy = end_point[1] - start_point[1]
203
+ arrow_length = np.sqrt(vx**2 + vy**2)
204
+ if i == len(splited_track)-2:
205
+ cv2.arrowedLine(transparent_layer, start_point, end_point, (255, 0, 0, 192), 2, tipLength=8 / arrow_length)
206
+ else:
207
+ cv2.line(transparent_layer, start_point, end_point, (255, 0, 0, 192), 2)
208
+ else:
209
+ cv2.circle(transparent_layer, (int(splited_track[0][0]), int(splited_track[0][1])), 2, (255, 0, 0, 192), -1)
210
+
211
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
212
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
213
+ trajectory_maps.append(trajectory_map)
214
+ return trajectory_maps, transparent_layer
215
+
216
+
217
+ class Drag:
218
+ @spaces.GPU(duration=200)
219
+ def __init__(self, height, width):
220
+
221
+ svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
222
+ mofa_ckpt = "ckpts/controlnet"
223
+
224
+ self.pipeline, self.cmp = init_models(
225
+ svd_ckpt,
226
+ mofa_ckpt,
227
+ weight_dtype=torch.float16,
228
+ device='cuda'
229
+ )
230
+
231
+ self.height = height
232
+ self.width = width
233
+
234
+ def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
235
+
236
+ '''
237
+ frames: [b, 13, 3, 384, 384] (0, 1) tensor
238
+ sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
239
+ mask: [b, 13, 2, 384, 384] {0, 1} tensor
240
+ '''
241
+
242
+ b, t, c, h, w = frames.shape
243
+ assert h == 384 and w == 384
244
+ frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
245
+ sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
246
+ mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
247
+ cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
248
+
249
+ if brush_mask is not None:
250
+ brush_mask = torch.from_numpy(brush_mask) / 255.
251
+ brush_mask = brush_mask.to(cmp_flow.device, dtype=cmp_flow.dtype)
252
+ brush_mask = brush_mask.unsqueeze(0).unsqueeze(0)
253
+ cmp_flow = cmp_flow * brush_mask
254
+
255
+ cmp_flow = cmp_flow.reshape(b, t, 2, h, w)
256
+ return cmp_flow
257
+
258
+
259
+ def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
260
+
261
+ fb, fl, fc, _, _ = pixel_values_384.shape
262
+
263
+ controlnet_flow = self.get_cmp_flow(
264
+ pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
265
+ sparse_optical_flow_384,
266
+ mask_384, motion_brush_mask
267
+ )
268
+
269
+ if self.height != 384 or self.width != 384:
270
+ scales = [self.height / 384, self.width / 384]
271
+ controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
272
+ controlnet_flow[:, :, 0] *= scales[1]
273
+ controlnet_flow[:, :, 1] *= scales[0]
274
+
275
+ return controlnet_flow
276
+
277
+
278
+ @torch.no_grad()
279
+ def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
280
+ '''
281
+ input_drag: [1, 13, 320, 576, 2]
282
+ input_drag_384: [1, 13, 384, 384, 2]
283
+ input_first_frame: [1, 3, 320, 576]
284
+ '''
285
+
286
+ seed = 42
287
+ num_frames = 25
288
+
289
+ set_seed(seed)
290
+
291
+ input_first_frame_384 = F.interpolate(input_first_frame, (384, 384))
292
+ input_first_frame_384 = input_first_frame_384.repeat(num_frames - 1, 1, 1, 1).unsqueeze(0)
293
+ input_first_frame_pil = Image.fromarray(np.uint8(input_first_frame[0].cpu().permute(1, 2, 0)*255))
294
+ height, width = input_first_frame.shape[-2:]
295
+
296
+ input_drag_384_inmask = input_drag_384_inmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
297
+ mask_384_inmask = input_mask_384_inmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
298
+ input_drag_384_outmask = input_drag_384_outmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
299
+ mask_384_outmask = input_mask_384_outmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
300
+
301
+ print('start diffusion process...')
302
+
303
+ input_drag_384_inmask = input_drag_384_inmask.to('cuda', dtype=torch.float16)
304
+ mask_384_inmask = mask_384_inmask.to('cuda', dtype=torch.float16)
305
+ input_drag_384_outmask = input_drag_384_outmask.to('cuda', dtype=torch.float16)
306
+ mask_384_outmask = mask_384_outmask.to('cuda', dtype=torch.float16)
307
+
308
+ input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
309
+
310
+ if in_mask_flag:
311
+ flow_inmask = self.get_flow(
312
+ input_first_frame_384,
313
+ input_drag_384_inmask, mask_384_inmask, motion_brush_mask
314
+ )
315
+ else:
316
+ fb, fl = mask_384_inmask.shape[:2]
317
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
318
+
319
+ if out_mask_flag:
320
+ flow_outmask = self.get_flow(
321
+ input_first_frame_384,
322
+ input_drag_384_outmask, mask_384_outmask
323
+ )
324
+ else:
325
+ fb, fl = mask_384_outmask.shape[:2]
326
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
327
+
328
+ inmask_no_zero = (flow_inmask != 0).all(dim=2)
329
+ inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
330
+
331
+ controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
332
+
333
+ val_output = self.pipeline(
334
+ input_first_frame_pil,
335
+ input_first_frame_pil,
336
+ controlnet_flow,
337
+ height=height,
338
+ width=width,
339
+ num_frames=num_frames,
340
+ decode_chunk_size=8,
341
+ motion_bucket_id=127,
342
+ fps=7,
343
+ noise_aug_strength=0.02,
344
+ controlnet_cond_scale=ctrl_scale,
345
+ )
346
+
347
+ video_frames, estimated_flow = val_output.frames[0], val_output.controlnet_flow
348
+
349
+ for i in range(num_frames):
350
+ img = video_frames[i]
351
+ video_frames[i] = np.array(img)
352
+ video_frames = torch.from_numpy(np.array(video_frames)).cuda().permute(0, 3, 1, 2).unsqueeze(0) / 255.
353
+
354
+ print(video_frames.shape)
355
+
356
+ viz_esti_flows = []
357
+ for i in range(estimated_flow.shape[1]):
358
+ temp_flow = estimated_flow[0][i].permute(1, 2, 0)
359
+ viz_esti_flows.append(flow_to_image(temp_flow))
360
+ viz_esti_flows = [np.uint8(np.ones_like(viz_esti_flows[-1]) * 255)] + viz_esti_flows
361
+ viz_esti_flows = np.stack(viz_esti_flows) # [t-1, h, w, c]
362
+
363
+ total_nps = viz_esti_flows
364
+
365
+ outputs['logits_imgs'] = video_frames
366
+ outputs['flows'] = torch.from_numpy(total_nps).cuda().permute(0, 3, 1, 2).unsqueeze(0) / 255.
367
+
368
+ return outputs
369
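
The in-mask / out-of-mask merge used in forward_sample can be illustrated with tiny tensors (shapes reduced for clarity; keepdim=True is equivalent to the unsqueeze/expand_as form in the code):

    import torch

    flow_in = torch.zeros(1, 13, 2, 4, 4)
    flow_out = torch.ones(1, 13, 2, 4, 4)
    flow_in[0, :, :, 1, 1] = 5.0                                 # one pixel carries an in-mask flow

    keep_in = (flow_in != 0).all(dim=2, keepdim=True).expand_as(flow_in)
    merged = torch.where(keep_in, flow_in, flow_out)
    print(merged[0, 0, :, 1, 1], merged[0, 0, :, 0, 0])          # tensor([5., 5.]) tensor([1., 1.])
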
+
370
+ @spaces.GPU
371
+ @torch.no_grad()
372
+ def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
373
+
374
+ original_width, original_height = self.width, self.height
375
+
376
+ input_all_points = tracking_points.constructor_args['value']
377
+
378
+ if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
379
+ return np.uint8(np.ones((original_width, original_height, 3))*255)
380
+
381
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
382
+ resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
383
+
384
+ new_resized_all_points = []
385
+ new_resized_all_points_384 = []
386
+ for tnum in range(len(resized_all_points)):
387
+ new_resized_all_points.append(interpolate_trajectory(input_all_points[tnum], 25))
388
+ new_resized_all_points_384.append(interpolate_trajectory(resized_all_points_384[tnum], 25))
389
+
390
+ resized_all_points = np.array(new_resized_all_points)
391
+ resized_all_points_384 = np.array(new_resized_all_points_384)
392
+
393
+ motion_brush_mask_384 = cv2.resize(motion_brush_mask, (384, 384), cv2.INTER_NEAREST)
394
+
395
+ resized_all_points_384_inmask, resized_all_points_384_outmask = \
396
+ divide_points_afterinterpolate(resized_all_points_384, motion_brush_mask_384)
397
+
398
+ in_mask_flag = False
399
+ out_mask_flag = False
400
+
401
+ if resized_all_points_384_inmask.shape[0] != 0:
402
+ in_mask_flag = True
403
+ input_drag_384_inmask, input_mask_384_inmask = \
404
+ get_sparseflow_and_mask_forward(
405
+ resized_all_points_384_inmask,
406
+ 25 - 1, 384, 384
407
+ )
408
+ else:
409
+ input_drag_384_inmask, input_mask_384_inmask = \
410
+ np.zeros((25 - 1, 384, 384, 2)), \
411
+ np.zeros((25 - 1, 384, 384))
412
+
413
+ if resized_all_points_384_outmask.shape[0] != 0:
414
+ out_mask_flag = True
415
+ input_drag_384_outmask, input_mask_384_outmask = \
416
+ get_sparseflow_and_mask_forward(
417
+ resized_all_points_384_outmask,
418
+ 25 - 1, 384, 384
419
+ )
420
+ else:
421
+ input_drag_384_outmask, input_mask_384_outmask = \
422
+ np.zeros((25 - 1, 384, 384, 2)), \
423
+ np.zeros((25 - 1, 384, 384))
424
+
425
+ input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0).to('cuda') # [1, 13, h, w, 2]
426
+ input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0).to('cuda') # [1, 13, h, w]
427
+ input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0).to('cuda') # [1, 13, h, w, 2]
428
+ input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0).to('cuda') # [1, 13, h, w]
429
+
430
+ first_frames_transform = transforms.Compose([
431
+ lambda x: Image.fromarray(x),
432
+ transforms.ToTensor(),
433
+ ])
434
+
435
+ input_first_frame = image2arr(first_frame_path)
436
+ input_first_frame = repeat(first_frames_transform(input_first_frame), 'c h w -> b c h w', b=1).to('cuda')
437
+
438
+ seed = 42
439
+ num_frames = 25
440
+
441
+ set_seed(seed)
442
+
443
+ input_first_frame_384 = F.interpolate(input_first_frame, (384, 384))
444
+ input_first_frame_384 = input_first_frame_384.repeat(num_frames - 1, 1, 1, 1).unsqueeze(0)
445
+
446
+ input_drag_384_inmask = input_drag_384_inmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
447
+ mask_384_inmask = input_mask_384_inmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
448
+ input_drag_384_outmask = input_drag_384_outmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
449
+ mask_384_outmask = input_mask_384_outmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
450
+
451
+ input_drag_384_inmask = input_drag_384_inmask.to('cuda', dtype=torch.float16)
452
+ mask_384_inmask = mask_384_inmask.to('cuda', dtype=torch.float16)
453
+ input_drag_384_outmask = input_drag_384_outmask.to('cuda', dtype=torch.float16)
454
+ mask_384_outmask = mask_384_outmask.to('cuda', dtype=torch.float16)
455
+
456
+ input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
457
+
458
+ if in_mask_flag:
459
+ flow_inmask = self.get_flow(
460
+ input_first_frame_384,
461
+ input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
462
+ )
463
+ else:
464
+ fb, fl = mask_384_inmask.shape[:2]
465
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
466
+
467
+ if out_mask_flag:
468
+ flow_outmask = self.get_flow(
469
+ input_first_frame_384,
470
+ input_drag_384_outmask, mask_384_outmask
471
+ )
472
+ else:
473
+ fb, fl = mask_384_outmask.shape[:2]
474
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
475
+
476
+ inmask_no_zero = (flow_inmask != 0).all(dim=2)
477
+ inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
478
+
479
+ controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
480
+
481
+ controlnet_flow = controlnet_flow[0, -1].permute(1, 2, 0)
482
+ viz_esti_flows = flow_to_image(controlnet_flow) # [h, w, c]
483
+
484
+ return viz_esti_flows
485
+
486
+ @spaces.GPU(duration=200)
487
+ def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
488
+
489
+ original_width, original_height = self.width, self.height
490
+
491
+ input_all_points = tracking_points.constructor_args['value']
492
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
493
+ resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
494
+
495
+ new_resized_all_points = []
496
+ new_resized_all_points_384 = []
497
+ for tnum in range(len(resized_all_points)):
498
+ new_resized_all_points.append(interpolate_trajectory(input_all_points[tnum], 25))
499
+ new_resized_all_points_384.append(interpolate_trajectory(resized_all_points_384[tnum], 25))
500
+
501
+ resized_all_points = np.array(new_resized_all_points)
502
+ resized_all_points_384 = np.array(new_resized_all_points_384)
503
+
504
+ motion_brush_mask_384 = cv2.resize(motion_brush_mask, (384, 384), cv2.INTER_NEAREST)
505
+
506
+ resized_all_points_384_inmask, resized_all_points_384_outmask = \
507
+ divide_points_afterinterpolate(resized_all_points_384, motion_brush_mask_384)
508
+
509
+ in_mask_flag = False
510
+ out_mask_flag = False
511
+
512
+ if resized_all_points_384_inmask.shape[0] != 0:
513
+ in_mask_flag = True
514
+ input_drag_384_inmask, input_mask_384_inmask = \
515
+ get_sparseflow_and_mask_forward(
516
+ resized_all_points_384_inmask,
517
+ 25 - 1, 384, 384
518
+ )
519
+ else:
520
+ input_drag_384_inmask, input_mask_384_inmask = \
521
+ np.zeros((25 - 1, 384, 384, 2)), \
522
+ np.zeros((25 - 1, 384, 384))
523
+
524
+ if resized_all_points_384_outmask.shape[0] != 0:
525
+ out_mask_flag = True
526
+ input_drag_384_outmask, input_mask_384_outmask = \
527
+ get_sparseflow_and_mask_forward(
528
+ resized_all_points_384_outmask,
529
+ 25 - 1, 384, 384
530
+ )
531
+ else:
532
+ input_drag_384_outmask, input_mask_384_outmask = \
533
+ np.zeros((25 - 1, 384, 384, 2)), \
534
+ np.zeros((25 - 1, 384, 384))
535
+
536
+ input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0) # [1, 13, h, w, 2]
537
+ input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0) # [1, 13, h, w]
538
+ input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0) # [1, 13, h, w, 2]
539
+ input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0) # [1, 13, h, w]
540
+
541
+ dir, base, ext = split_filename(first_frame_path)
542
+ id = base.split('_')[0]
543
+
544
+ image_pil = image2pil(first_frame_path)
545
+ image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
546
+
547
+ visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
548
+
549
+ motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
550
+ visualized_drag = visualized_drag[0].convert('RGBA')
551
+ visualized_drag_brush = Image.alpha_composite(motion_brush_viz_pil, visualized_drag)
552
+
553
+ first_frames_transform = transforms.Compose([
554
+ lambda x: Image.fromarray(x),
555
+ transforms.ToTensor(),
556
+ ])
557
+
558
+ outputs = None
559
+ ouput_video_list = []
560
+ ouput_flow_list = []
561
+ num_inference = 1
562
+ for i in tqdm(range(num_inference)):
563
+ if not outputs:
564
+ first_frames = image2arr(first_frame_path)
565
+ first_frames = repeat(first_frames_transform(first_frames), 'c h w -> b c h w', b=inference_batch_size).to('cuda')
566
+ else:
567
+ first_frames = outputs['logits_imgs'][:, -1]
568
+
569
+
570
+ outputs = self.forward_sample(
571
+ input_drag_384_inmask.to('cuda'),
572
+ input_drag_384_outmask.to('cuda'),
573
+ first_frames.to('cuda'),
574
+ input_mask_384_inmask.to('cuda'),
575
+ input_mask_384_outmask.to('cuda'),
576
+ in_mask_flag,
577
+ out_mask_flag,
578
+ motion_brush_mask_384,
579
+ ctrl_scale)
580
+
581
+ ouput_video_list.append(outputs['logits_imgs'])
582
+ ouput_flow_list.append(outputs['flows'])
583
+
584
+ hint_path = os.path.join(output_dir_video, str(id), f'{id}_hint.png')
585
+ visualized_drag_brush.save(hint_path)
586
+
587
+ for i in range(inference_batch_size):
588
+ output_tensor = [ouput_video_list[0][i]]
589
+ flow_tensor = [ouput_flow_list[0][i]]
590
+ output_tensor = torch.cat(output_tensor, dim=0)
591
+ flow_tensor = torch.cat(flow_tensor, dim=0)
592
+
593
+ outputs_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_output.gif')
594
+ flows_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_flow.gif')
595
+
596
+ outputs_mp4_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_output.mp4')
597
+ flows_mp4_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_flow.mp4')
598
+
599
+ outputs_frames_path = os.path.join(output_dir_frame, str(id), f's{ctrl_scale}', f'{id}_output')
600
+ flows_frames_path = os.path.join(output_dir_frame, str(id), f's{ctrl_scale}', f'{id}_flow')
601
+
602
+ os.makedirs(os.path.join(output_dir_video, str(id), f's{ctrl_scale}'), exist_ok=True)
603
+ os.makedirs(os.path.join(outputs_frames_path), exist_ok=True)
604
+ os.makedirs(os.path.join(flows_frames_path), exist_ok=True)
605
+
606
+ print(output_tensor.shape)
607
+
608
+ output_RGB = output_tensor.permute(0, 2, 3, 1).mul(255).cpu().numpy()
609
+ flow_RGB = flow_tensor.permute(0, 2, 3, 1).mul(255).cpu().numpy()
610
+
611
+ torchvision.io.write_video(
612
+ outputs_mp4_path,
613
+ output_RGB,
614
+ fps=20, video_codec='h264', options={'crf': '10'}
615
+ )
616
+
617
+ torchvision.io.write_video(
618
+ flows_mp4_path,
619
+ flow_RGB,
620
+ fps=20, video_codec='h264', options={'crf': '10'}
621
+ )
622
+
623
+ imageio.mimsave(outputs_path, np.uint8(output_RGB), fps=20, loop=0)
624
+
625
+ imageio.mimsave(flows_path, np.uint8(flow_RGB), fps=20, loop=0)
626
+
627
+ for f in range(output_RGB.shape[0]):
628
+ Image.fromarray(np.uint8(output_RGB[f])).save(os.path.join(outputs_frames_path, f'{str(f).zfill(3)}.png'))
629
+ Image.fromarray(np.uint8(flow_RGB[f])).save(os.path.join(flows_frames_path, f'{str(f).zfill(3)}.png'))
630
+
631
+ return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
632
+
633
+
634
+ with gr.Blocks() as demo:
635
+ gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
636
+
637
+ gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
638
+
639
+ gr.Markdown(
640
+ """
641
+ During the inference, kindly follow these instructions:
642
+ <br>
643
+ 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
644
+ 2. Proceed to draw trajectories: <br>
645
+ 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
646
+ 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
647
+ 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
648
+ 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
649
+ 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
650
+ 3. Click the "Run" button to animate the image according to the path. <br>
651
+ """
652
+ )
653
+
654
+ target_size = 512
655
+ DragNUWA_net = Drag(target_size, target_size)
656
+ first_frame_path = gr.State()
657
+ tracking_points = gr.State([])
658
+ motion_brush_points = gr.State([])
659
+ motion_brush_mask = gr.State()
660
+ motion_brush_viz = gr.State()
661
+ inference_batch_size = gr.State(1)
662
+
663
+ def preprocess_image(image):
664
+
665
+ image_pil = image2pil(image.name)
666
+ raw_w, raw_h = image_pil.size
667
+
668
+ max_edge = min(raw_w, raw_h)
669
+ resize_ratio = target_size / max_edge
670
+
671
+ image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
672
+
673
+ new_w, new_h = image_pil.size
674
+ crop_w = new_w - (new_w % 64)
675
+ crop_h = new_h - (new_h % 64)
676
+
677
+ image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
678
+
679
+ DragNUWA_net.width = crop_w
680
+ DragNUWA_net.height = crop_h
681
+
682
+ id = str(time.time()).split('.')[0]
683
+ os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
684
+ os.makedirs(os.path.join(output_dir_frame, str(id)), exist_ok=True)
685
+
686
+ first_frame_path = os.path.join(output_dir_video, str(id), f"{id}_input.png")
687
+ image_pil.save(first_frame_path)
688
+
689
+ return first_frame_path, first_frame_path, first_frame_path, gr.State([]), gr.State([]), np.zeros((crop_h, crop_w)), np.zeros((crop_h, crop_w, 4))
690
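
The resize-and-crop arithmetic in preprocess_image, worked through on an assumed 1280x720 upload: the short edge is scaled to target_size, then both edges are center-cropped down to multiples of 64.

    raw_w, raw_h = 1280, 720                                      # assumed upload size
    target_size = 512
    ratio = target_size / min(raw_w, raw_h)                       # 512 / 720
    new_w, new_h = round(raw_w * ratio), round(raw_h * ratio)     # 910, 512
    crop_w, crop_h = new_w - new_w % 64, new_h - new_h % 64       # 896, 512
    print(new_w, new_h, crop_w, crop_h)
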
+
691
+ def add_drag(tracking_points):
692
+ if len(tracking_points.constructor_args['value']) != 0 and tracking_points.constructor_args['value'][-1] == []:
693
+ return tracking_points
694
+ tracking_points.constructor_args['value'].append([])
695
+ return tracking_points
696
+
697
+ def add_mask(motion_brush_points):
698
+ motion_brush_points.constructor_args['value'].append([])
699
+ return motion_brush_points
700
+
701
+ def delete_last_drag(tracking_points, first_frame_path, motion_brush_mask):
702
+ if len(tracking_points.constructor_args['value']) > 0:
703
+ tracking_points.constructor_args['value'].pop()
704
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
705
+ w, h = transparent_background.size
706
+ transparent_layer = np.zeros((h, w, 4))
707
+ for track in tracking_points.constructor_args['value']:
708
+ if len(track) > 1:
709
+ for i in range(len(track)-1):
710
+ start_point = track[i]
711
+ end_point = track[i+1]
712
+ vx = end_point[0] - start_point[0]
713
+ vy = end_point[1] - start_point[1]
714
+ arrow_length = np.sqrt(vx**2 + vy**2)
715
+ if i == len(track)-2:
716
+ cv2.arrowedLine(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2, tipLength=8 / arrow_length)
717
+ else:
718
+ cv2.line(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2,)
719
+ else:
720
+ cv2.circle(transparent_layer, tuple(track[0]), 5, (255, 0, 0, 255), -1)
721
+
722
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
723
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
724
+
725
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
726
+
727
+ return tracking_points, trajectory_map, viz_flow
728
+
729
+ def add_motion_brushes(motion_brush_points, motion_brush_mask, transparent_layer, first_frame_path, radius, tracking_points, evt: gr.SelectData):
730
+
731
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
732
+ w, h = transparent_background.size
733
+
734
+ motion_points = motion_brush_points.constructor_args['value']
735
+ motion_points.append(evt.index)
736
+
737
+ x, y = evt.index
738
+
739
+ cv2.circle(motion_brush_mask, (x, y), radius, 255, -1)
740
+ cv2.circle(transparent_layer, (x, y), radius, (0, 0, 255, 255), -1)
741
+
742
+ transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
743
+ motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
744
+
745
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
746
+
747
+ return motion_brush_mask, transparent_layer, motion_map, viz_flow
748
+
749
+ def add_tracking_points(tracking_points, first_frame_path, motion_brush_mask, evt: gr.SelectData):
750
+
751
+ print(f"You selected {evt.value} at {evt.index} from {evt.target}")
752
+
753
+ if len(tracking_points.constructor_args['value']) == 0:
754
+ tracking_points.constructor_args['value'].append([])
755
+
756
+ tracking_points.constructor_args['value'][-1].append(evt.index)
757
+
758
+ # print(tracking_points.constructor_args['value'])
759
+
760
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
761
+ w, h = transparent_background.size
762
+ transparent_layer = np.zeros((h, w, 4))
763
+ for track in tracking_points.constructor_args['value']:
764
+ if len(track) > 1:
765
+ for i in range(len(track)-1):
766
+ start_point = track[i]
767
+ end_point = track[i+1]
768
+ vx = end_point[0] - start_point[0]
769
+ vy = end_point[1] - start_point[1]
770
+ arrow_length = np.sqrt(vx**2 + vy**2)
771
+ if i == len(track)-2:
772
+ cv2.arrowedLine(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2, tipLength=8 / arrow_length)
773
+ else:
774
+ cv2.line(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2,)
775
+ else:
776
+ cv2.circle(transparent_layer, tuple(track[0]), 3, (255, 0, 0, 255), -1)
777
+
778
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
779
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
780
+
781
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
782
+
783
+ return tracking_points, trajectory_map, viz_flow
784
+
785
+ with gr.Row():
786
+ with gr.Column(scale=2):
787
+ image_upload_button = gr.UploadButton(label="Upload Image",file_types=["image"])
788
+ add_drag_button = gr.Button(value="Add Trajectory")
789
+ run_button = gr.Button(value="Run")
790
+ delete_last_drag_button = gr.Button(value="Delete Last Trajectory")
791
+ brush_radius = gr.Slider(label='Motion Brush Radius',
792
+ minimum=1,
793
+ maximum=100,
794
+ step=1,
795
+ value=10)
796
+ ctrl_scale = gr.Slider(label='Control Scale',
797
+ minimum=0,
798
+ maximum=1.,
799
+ step=0.01,
800
+ value=0.6)
801
+
802
+ with gr.Column(scale=5):
803
+ input_image = gr.Image(label="Add Trajectory Here",
804
+ interactive=True)
805
+ with gr.Column(scale=5):
806
+ input_image_mask = gr.Image(label="Add Motion Brush Here",
807
+ interactive=True)
808
+
809
+ with gr.Row():
810
+ with gr.Column(scale=6):
811
+ viz_flow = gr.Image(label="Visualized Flow")
812
+ with gr.Column(scale=6):
813
+ hint_image = gr.Image(label="Visualized Hint Image")
814
+ with gr.Row():
815
+ with gr.Column(scale=6):
816
+ output_video = gr.Image(label="Output Video")
817
+ with gr.Column(scale=6):
818
+ output_flow = gr.Image(label="Output Flow")
819
+
820
+ with gr.Row():
821
+ with gr.Column(scale=6):
822
+ output_video_mp4 = gr.Video(label="Output Video mp4")
823
+ with gr.Column(scale=6):
824
+ output_flow_mp4 = gr.Video(label="Output Flow mp4")
825
+
826
+ image_upload_button.upload(preprocess_image, image_upload_button, [input_image, input_image_mask, first_frame_path, tracking_points, motion_brush_points, motion_brush_mask, motion_brush_viz])
827
+
828
+ add_drag_button.click(add_drag, tracking_points, tracking_points)
829
+
830
+ delete_last_drag_button.click(delete_last_drag, [tracking_points, first_frame_path, motion_brush_mask], [tracking_points, input_image, viz_flow])
831
+
832
+ input_image.select(add_tracking_points, [tracking_points, first_frame_path, motion_brush_mask], [tracking_points, input_image, viz_flow])
833
+
834
+ input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
835
+
836
+ run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
837
+
838
+ demo.launch()