vinesmsuic commited on
Commit
15186bb
·
1 Parent(s): 3350655
Files changed (2) hide show
  1. app.py +83 -57
  2. gradio_demo.py +83 -57
app.py CHANGED
@@ -44,7 +44,7 @@ class ImageEditor:
44
  self.image_edit_model = InstructPix2Pix()
45
 
46
  @torch.no_grad()
47
- @spaces.GPU(duration=60)
48
  def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
49
  edited_image_path = infer_video(self.image_edit_model,
50
  video_path,
@@ -240,6 +240,14 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
240
  else:
241
  return False
242
 
 
 
 
 
 
 
 
 
243
  if check_video(video_path) == False:
244
  processed_video_path = crop_and_resize_video(input_video_path=video_path,
245
  output_folder=TEMP_DIR,
@@ -252,9 +260,11 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
252
  x_offset=x_offset,
253
  y_offset=y_offset,
254
  longest_to_width=longest_to_width)
255
- return processed_video_path
 
256
  else:
257
- return video_path
 
258
 
259
  def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
260
  """
@@ -312,61 +322,77 @@ def btn_infer_fn(video_path,
312
  with gr.Blocks() as demo:
313
  gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
314
  gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
315
- with gr.Row():
316
- with gr.Column():
317
- gr.Markdown("# Preprocessing Video Stage")
318
- gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
319
- video_raw = gr.Video(label="Raw Video Input")
320
- btn_pv = gr.Button("Preprocess Video")
321
- video_input = gr.Video(label="Preprocessed Video Input")
322
- advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
323
- with advanced_settings_pv:
324
- with gr.Column():
325
- pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
326
- pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
327
- pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
328
- pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
329
- pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
330
- pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
331
- pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
332
- pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
333
-
334
- with gr.Column():
335
- gr.Markdown("# Image Editing Stage")
336
- gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
337
- image_input_output = gr.Image(label="Edited Frame", type="filepath")
338
- image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
339
- btn_image_edit = gr.Button("Edit the first frame")
340
- advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
341
- with advanced_settings_image_edit:
342
- with gr.Column():
343
- ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
344
- ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
345
- ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
346
-
347
- with gr.Column():
348
- gr.Markdown("# Video Editing Stage")
349
- gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
350
- video_output = gr.Video(label="Video Output")
351
- video_prompt = gr.Textbox(label="Video description prompt")
352
- btn_infer = gr.Button("Run Video Editing")
353
- settings_anyv2v = gr.Accordion("Settings for AnyV2V")
354
- with settings_anyv2v:
355
- with gr.Column():
356
- av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
357
- av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
358
- av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
359
- advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
360
- with advanced_settings_anyv2v:
361
- with gr.Column():
362
- av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
363
- av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
364
- av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
365
- av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
366
- av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
367
- av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
368
 
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  examples = gr.Examples(examples=demo_examples,
371
  label="Examples (Just click on Video Editing button after loading them into the UI)",
372
  inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
@@ -374,7 +400,7 @@ with gr.Blocks() as demo:
374
  btn_pv.click(
375
  btn_preprocess_video_fn,
376
  inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
377
- outputs=video_input
378
  )
379
 
380
  btn_image_edit.click(
 
44
  self.image_edit_model = InstructPix2Pix()
45
 
46
  @torch.no_grad()
47
+ @spaces.GPU(duration=30)
48
  def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
49
  edited_image_path = infer_video(self.image_edit_model,
50
  video_path,
 
240
  else:
241
  return False
242
 
243
+ def get_first_frame_as_pil(video_path):
244
+ with VideoFileClip(video_path) as clip:
245
+ # Extract the first frame (at t=0) as an array
246
+ first_frame_array = clip.get_frame(0)
247
+ # Convert the numpy array to a PIL Image
248
+ first_frame_image = Image.fromarray(first_frame_array)
249
+ return first_frame_image
250
+
251
  if check_video(video_path) == False:
252
  processed_video_path = crop_and_resize_video(input_video_path=video_path,
253
  output_folder=TEMP_DIR,
 
260
  x_offset=x_offset,
261
  y_offset=y_offset,
262
  longest_to_width=longest_to_width)
263
+ frame = get_first_frame_as_pil(processed_video_path)
264
+ return processed_video_path, frame
265
  else:
266
+ frame = get_first_frame_as_pil(video_path)
267
+ return video_path, frame
268
 
269
  def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
270
  """
 
322
  with gr.Blocks() as demo:
323
  gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
324
  gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
 
327
+ with gr.Tabs():
328
+ with gr.TabItem('AnyV2V + InstructPix2Pix'):
329
+ with gr.Group():
330
+ gr.Markdown("# Preprocessing Video Stage")
331
+ gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
332
+ with gr.Row():
333
+ with gr.Column():
334
+ video_raw = gr.Video(label="Raw Video Input")
335
+ btn_pv = gr.Button("Preprocess Video")
336
+
337
+ with gr.Column():
338
+ video_input = gr.Video(label="Preprocessed Video Input", interactive=False)
339
+ with gr.Column():
340
+ advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
341
+ with advanced_settings_pv:
342
+ with gr.Column():
343
+ pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
344
+ pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
345
+ pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
346
+ pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
347
+ pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
348
+ pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
349
+ pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
350
+ pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
351
+
352
+ with gr.Group():
353
+ gr.Markdown("# Image Editing Stage")
354
+ gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
355
+ with gr.Row():
356
+ with gr.Column():
357
+ src_first_frame = gr.Image(label="First Frame", type="filepath", interactive=False)
358
+ image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
359
+ btn_image_edit = gr.Button("Edit the first frame")
360
+ with gr.Column():
361
+ image_input_output = gr.Image(label="Edited Frame", type="filepath")
362
+ with gr.Column():
363
+ advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
364
+ with advanced_settings_image_edit:
365
+ with gr.Column():
366
+ ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
367
+ ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
368
+ ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
369
+
370
+ with gr.Group():
371
+ gr.Markdown("# Video Editing Stage")
372
+ gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
373
+ with gr.Row():
374
+ with gr.Column():
375
+ video_prompt = gr.Textbox(label="Video description prompt")
376
+ settings_anyv2v = gr.Accordion("Settings for AnyV2V")
377
+ with settings_anyv2v:
378
+ with gr.Column():
379
+ av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
380
+ av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
381
+ av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
382
+ btn_infer = gr.Button("Run Video Editing")
383
+ with gr.Column():
384
+ video_output = gr.Video(label="Video Output")
385
+ with gr.Column():
386
+ advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
387
+ with advanced_settings_anyv2v:
388
+ with gr.Column():
389
+ av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
390
+ av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
391
+ av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
392
+ av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
393
+ av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
394
+ av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
395
+
396
  examples = gr.Examples(examples=demo_examples,
397
  label="Examples (Just click on Video Editing button after loading them into the UI)",
398
  inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
 
400
  btn_pv.click(
401
  btn_preprocess_video_fn,
402
  inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
403
+ outputs=[video_input, src_first_frame]
404
  )
405
 
406
  btn_image_edit.click(
gradio_demo.py CHANGED
@@ -44,7 +44,7 @@ class ImageEditor:
44
  self.image_edit_model = InstructPix2Pix()
45
 
46
  @torch.no_grad()
47
- @spaces.GPU(duration=60)
48
  def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
49
  edited_image_path = infer_video(self.image_edit_model,
50
  video_path,
@@ -240,6 +240,14 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
240
  else:
241
  return False
242
 
 
 
 
 
 
 
 
 
243
  if check_video(video_path) == False:
244
  processed_video_path = crop_and_resize_video(input_video_path=video_path,
245
  output_folder=TEMP_DIR,
@@ -252,9 +260,11 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
252
  x_offset=x_offset,
253
  y_offset=y_offset,
254
  longest_to_width=longest_to_width)
255
- return processed_video_path
 
256
  else:
257
- return video_path
 
258
 
259
  def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
260
  """
@@ -312,61 +322,77 @@ def btn_infer_fn(video_path,
312
  with gr.Blocks() as demo:
313
  gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
314
  gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
315
- with gr.Row():
316
- with gr.Column():
317
- gr.Markdown("# Preprocessing Video Stage")
318
- gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
319
- video_raw = gr.Video(label="Raw Video Input")
320
- btn_pv = gr.Button("Preprocess Video")
321
- video_input = gr.Video(label="Preprocessed Video Input")
322
- advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
323
- with advanced_settings_pv:
324
- with gr.Column():
325
- pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
326
- pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
327
- pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
328
- pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
329
- pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
330
- pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
331
- pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
332
- pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
333
-
334
- with gr.Column():
335
- gr.Markdown("# Image Editing Stage")
336
- gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
337
- image_input_output = gr.Image(label="Edited Frame", type="filepath")
338
- image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
339
- btn_image_edit = gr.Button("Edit the first frame")
340
- advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
341
- with advanced_settings_image_edit:
342
- with gr.Column():
343
- ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
344
- ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
345
- ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
346
-
347
- with gr.Column():
348
- gr.Markdown("# Video Editing Stage")
349
- gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
350
- video_output = gr.Video(label="Video Output")
351
- video_prompt = gr.Textbox(label="Video description prompt")
352
- btn_infer = gr.Button("Run Video Editing")
353
- settings_anyv2v = gr.Accordion("Settings for AnyV2V")
354
- with settings_anyv2v:
355
- with gr.Column():
356
- av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
357
- av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
358
- av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
359
- advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
360
- with advanced_settings_anyv2v:
361
- with gr.Column():
362
- av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
363
- av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
364
- av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
365
- av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
366
- av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
367
- av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
368
 
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  examples = gr.Examples(examples=demo_examples,
371
  label="Examples (Just click on Video Editing button after loading them into the UI)",
372
  inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
@@ -374,7 +400,7 @@ with gr.Blocks() as demo:
374
  btn_pv.click(
375
  btn_preprocess_video_fn,
376
  inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
377
- outputs=video_input
378
  )
379
 
380
  btn_image_edit.click(
 
44
  self.image_edit_model = InstructPix2Pix()
45
 
46
  @torch.no_grad()
47
+ @spaces.GPU(duration=30)
48
  def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
49
  edited_image_path = infer_video(self.image_edit_model,
50
  video_path,
 
240
  else:
241
  return False
242
 
243
+ def get_first_frame_as_pil(video_path):
244
+ with VideoFileClip(video_path) as clip:
245
+ # Extract the first frame (at t=0) as an array
246
+ first_frame_array = clip.get_frame(0)
247
+ # Convert the numpy array to a PIL Image
248
+ first_frame_image = Image.fromarray(first_frame_array)
249
+ return first_frame_image
250
+
251
  if check_video(video_path) == False:
252
  processed_video_path = crop_and_resize_video(input_video_path=video_path,
253
  output_folder=TEMP_DIR,
 
260
  x_offset=x_offset,
261
  y_offset=y_offset,
262
  longest_to_width=longest_to_width)
263
+ frame = get_first_frame_as_pil(processed_video_path)
264
+ return processed_video_path, frame
265
  else:
266
+ frame = get_first_frame_as_pil(video_path)
267
+ return video_path, frame
268
 
269
  def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
270
  """
 
322
  with gr.Blocks() as demo:
323
  gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
324
  gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
 
327
+ with gr.Tabs():
328
+ with gr.TabItem('AnyV2V + InstructPix2Pix'):
329
+ with gr.Group():
330
+ gr.Markdown("# Preprocessing Video Stage")
331
+ gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
332
+ with gr.Row():
333
+ with gr.Column():
334
+ video_raw = gr.Video(label="Raw Video Input")
335
+ btn_pv = gr.Button("Preprocess Video")
336
+
337
+ with gr.Column():
338
+ video_input = gr.Video(label="Preprocessed Video Input", interactive=False)
339
+ with gr.Column():
340
+ advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
341
+ with advanced_settings_pv:
342
+ with gr.Column():
343
+ pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
344
+ pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
345
+ pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
346
+ pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
347
+ pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
348
+ pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
349
+ pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
350
+ pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
351
+
352
+ with gr.Group():
353
+ gr.Markdown("# Image Editing Stage")
354
+ gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
355
+ with gr.Row():
356
+ with gr.Column():
357
+ src_first_frame = gr.Image(label="First Frame", type="filepath", interactive=False)
358
+ image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
359
+ btn_image_edit = gr.Button("Edit the first frame")
360
+ with gr.Column():
361
+ image_input_output = gr.Image(label="Edited Frame", type="filepath")
362
+ with gr.Column():
363
+ advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
364
+ with advanced_settings_image_edit:
365
+ with gr.Column():
366
+ ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
367
+ ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
368
+ ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
369
+
370
+ with gr.Group():
371
+ gr.Markdown("# Video Editing Stage")
372
+ gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
373
+ with gr.Row():
374
+ with gr.Column():
375
+ video_prompt = gr.Textbox(label="Video description prompt")
376
+ settings_anyv2v = gr.Accordion("Settings for AnyV2V")
377
+ with settings_anyv2v:
378
+ with gr.Column():
379
+ av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
380
+ av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
381
+ av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
382
+ btn_infer = gr.Button("Run Video Editing")
383
+ with gr.Column():
384
+ video_output = gr.Video(label="Video Output")
385
+ with gr.Column():
386
+ advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
387
+ with advanced_settings_anyv2v:
388
+ with gr.Column():
389
+ av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
390
+ av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
391
+ av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
392
+ av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
393
+ av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
394
+ av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
395
+
396
  examples = gr.Examples(examples=demo_examples,
397
  label="Examples (Just click on Video Editing button after loading them into the UI)",
398
  inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
 
400
  btn_pv.click(
401
  btn_preprocess_video_fn,
402
  inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
403
+ outputs=[video_input, src_first_frame]
404
  )
405
 
406
  btn_image_edit.click(