vinesmsuic
committed on
Commit
·
15186bb
1
Parent(s):
3350655
update
Browse files
- app.py +83 -57
- gradio_demo.py +83 -57
app.py
CHANGED
@@ -44,7 +44,7 @@ class ImageEditor:
|
|
44 |
self.image_edit_model = InstructPix2Pix()
|
45 |
|
46 |
@torch.no_grad()
|
47 |
-
@spaces.GPU(duration=
|
48 |
def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
|
49 |
edited_image_path = infer_video(self.image_edit_model,
|
50 |
video_path,
|
@@ -240,6 +240,14 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
|
|
240 |
else:
|
241 |
return False
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
if check_video(video_path) == False:
|
244 |
processed_video_path = crop_and_resize_video(input_video_path=video_path,
|
245 |
output_folder=TEMP_DIR,
|
@@ -252,9 +260,11 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
|
|
252 |
x_offset=x_offset,
|
253 |
y_offset=y_offset,
|
254 |
longest_to_width=longest_to_width)
|
255 |
-
|
|
|
256 |
else:
|
257 |
-
|
|
|
258 |
|
259 |
def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
|
260 |
"""
|
@@ -312,61 +322,77 @@ def btn_infer_fn(video_path,
|
|
312 |
with gr.Blocks() as demo:
|
313 |
gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
|
314 |
gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
|
315 |
-
with gr.Row():
|
316 |
-
with gr.Column():
|
317 |
-
gr.Markdown("# Preprocessing Video Stage")
|
318 |
-
gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
|
319 |
-
video_raw = gr.Video(label="Raw Video Input")
|
320 |
-
btn_pv = gr.Button("Preprocess Video")
|
321 |
-
video_input = gr.Video(label="Preprocessed Video Input")
|
322 |
-
advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
|
323 |
-
with advanced_settings_pv:
|
324 |
-
with gr.Column():
|
325 |
-
pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
|
326 |
-
pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
|
327 |
-
pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
|
328 |
-
pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
|
329 |
-
pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
|
330 |
-
pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
331 |
-
pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
332 |
-
pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
|
333 |
-
|
334 |
-
with gr.Column():
|
335 |
-
gr.Markdown("# Image Editing Stage")
|
336 |
-
gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
|
337 |
-
image_input_output = gr.Image(label="Edited Frame", type="filepath")
|
338 |
-
image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
|
339 |
-
btn_image_edit = gr.Button("Edit the first frame")
|
340 |
-
advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
|
341 |
-
with advanced_settings_image_edit:
|
342 |
-
with gr.Column():
|
343 |
-
ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
|
344 |
-
ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
|
345 |
-
ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
|
346 |
-
|
347 |
-
with gr.Column():
|
348 |
-
gr.Markdown("# Video Editing Stage")
|
349 |
-
gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
|
350 |
-
video_output = gr.Video(label="Video Output")
|
351 |
-
video_prompt = gr.Textbox(label="Video description prompt")
|
352 |
-
btn_infer = gr.Button("Run Video Editing")
|
353 |
-
settings_anyv2v = gr.Accordion("Settings for AnyV2V")
|
354 |
-
with settings_anyv2v:
|
355 |
-
with gr.Column():
|
356 |
-
av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
|
357 |
-
av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
|
358 |
-
av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
|
359 |
-
advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
|
360 |
-
with advanced_settings_anyv2v:
|
361 |
-
with gr.Column():
|
362 |
-
av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
|
363 |
-
av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
|
364 |
-
av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
|
365 |
-
av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
|
366 |
-
av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
|
367 |
-
av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
|
368 |
|
369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
examples = gr.Examples(examples=demo_examples,
|
371 |
label="Examples (Just click on Video Editing button after loading them into the UI)",
|
372 |
inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
|
@@ -374,7 +400,7 @@ with gr.Blocks() as demo:
|
|
374 |
btn_pv.click(
|
375 |
btn_preprocess_video_fn,
|
376 |
inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
|
377 |
-
outputs=video_input
|
378 |
)
|
379 |
|
380 |
btn_image_edit.click(
|
|
|
44 |
self.image_edit_model = InstructPix2Pix()
|
45 |
|
46 |
@torch.no_grad()
|
47 |
+
@spaces.GPU(duration=30)
|
48 |
def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
|
49 |
edited_image_path = infer_video(self.image_edit_model,
|
50 |
video_path,
|
|
|
240 |
else:
|
241 |
return False
|
242 |
|
243 |
+
def get_first_frame_as_pil(video_path):
|
244 |
+
with VideoFileClip(video_path) as clip:
|
245 |
+
# Extract the first frame (at t=0) as an array
|
246 |
+
first_frame_array = clip.get_frame(0)
|
247 |
+
# Convert the numpy array to a PIL Image
|
248 |
+
first_frame_image = Image.fromarray(first_frame_array)
|
249 |
+
return first_frame_image
|
250 |
+
|
251 |
if check_video(video_path) == False:
|
252 |
processed_video_path = crop_and_resize_video(input_video_path=video_path,
|
253 |
output_folder=TEMP_DIR,
|
|
|
260 |
x_offset=x_offset,
|
261 |
y_offset=y_offset,
|
262 |
longest_to_width=longest_to_width)
|
263 |
+
frame = get_first_frame_as_pil(processed_video_path)
|
264 |
+
return processed_video_path, frame
|
265 |
else:
|
266 |
+
frame = get_first_frame_as_pil(video_path)
|
267 |
+
return video_path, frame
|
268 |
|
269 |
def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
|
270 |
"""
|
|
|
322 |
with gr.Blocks() as demo:
|
323 |
gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
|
324 |
gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
|
326 |
|
327 |
+
with gr.Tabs():
|
328 |
+
with gr.TabItem('AnyV2V + InstructPix2Pix'):
|
329 |
+
with gr.Group():
|
330 |
+
gr.Markdown("# Preprocessing Video Stage")
|
331 |
+
gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
|
332 |
+
with gr.Row():
|
333 |
+
with gr.Column():
|
334 |
+
video_raw = gr.Video(label="Raw Video Input")
|
335 |
+
btn_pv = gr.Button("Preprocess Video")
|
336 |
+
|
337 |
+
with gr.Column():
|
338 |
+
video_input = gr.Video(label="Preprocessed Video Input", interactive=False)
|
339 |
+
with gr.Column():
|
340 |
+
advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
|
341 |
+
with advanced_settings_pv:
|
342 |
+
with gr.Column():
|
343 |
+
pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
|
344 |
+
pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
|
345 |
+
pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
|
346 |
+
pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
|
347 |
+
pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
|
348 |
+
pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
349 |
+
pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
350 |
+
pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
|
351 |
+
|
352 |
+
with gr.Group():
|
353 |
+
gr.Markdown("# Image Editing Stage")
|
354 |
+
gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
|
355 |
+
with gr.Row():
|
356 |
+
with gr.Column():
|
357 |
+
src_first_frame = gr.Image(label="First Frame", type="filepath", interactive=False)
|
358 |
+
image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
|
359 |
+
btn_image_edit = gr.Button("Edit the first frame")
|
360 |
+
with gr.Column():
|
361 |
+
image_input_output = gr.Image(label="Edited Frame", type="filepath")
|
362 |
+
with gr.Column():
|
363 |
+
advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
|
364 |
+
with advanced_settings_image_edit:
|
365 |
+
with gr.Column():
|
366 |
+
ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
|
367 |
+
ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
|
368 |
+
ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
|
369 |
+
|
370 |
+
with gr.Group():
|
371 |
+
gr.Markdown("# Video Editing Stage")
|
372 |
+
gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
|
373 |
+
with gr.Row():
|
374 |
+
with gr.Column():
|
375 |
+
video_prompt = gr.Textbox(label="Video description prompt")
|
376 |
+
settings_anyv2v = gr.Accordion("Settings for AnyV2V")
|
377 |
+
with settings_anyv2v:
|
378 |
+
with gr.Column():
|
379 |
+
av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
|
380 |
+
av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
|
381 |
+
av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
|
382 |
+
btn_infer = gr.Button("Run Video Editing")
|
383 |
+
with gr.Column():
|
384 |
+
video_output = gr.Video(label="Video Output")
|
385 |
+
with gr.Column():
|
386 |
+
advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
|
387 |
+
with advanced_settings_anyv2v:
|
388 |
+
with gr.Column():
|
389 |
+
av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
|
390 |
+
av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
|
391 |
+
av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
|
392 |
+
av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
|
393 |
+
av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
|
394 |
+
av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
|
395 |
+
|
396 |
examples = gr.Examples(examples=demo_examples,
|
397 |
label="Examples (Just click on Video Editing button after loading them into the UI)",
|
398 |
inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
|
|
|
400 |
btn_pv.click(
|
401 |
btn_preprocess_video_fn,
|
402 |
inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
|
403 |
+
outputs=[video_input, src_first_frame]
|
404 |
)
|
405 |
|
406 |
btn_image_edit.click(
|
gradio_demo.py
CHANGED
@@ -44,7 +44,7 @@ class ImageEditor:
|
|
44 |
self.image_edit_model = InstructPix2Pix()
|
45 |
|
46 |
@torch.no_grad()
|
47 |
-
@spaces.GPU(duration=
|
48 |
def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
|
49 |
edited_image_path = infer_video(self.image_edit_model,
|
50 |
video_path,
|
@@ -240,6 +240,14 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
|
|
240 |
else:
|
241 |
return False
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
if check_video(video_path) == False:
|
244 |
processed_video_path = crop_and_resize_video(input_video_path=video_path,
|
245 |
output_folder=TEMP_DIR,
|
@@ -252,9 +260,11 @@ def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, cen
|
|
252 |
x_offset=x_offset,
|
253 |
y_offset=y_offset,
|
254 |
longest_to_width=longest_to_width)
|
255 |
-
|
|
|
256 |
else:
|
257 |
-
|
|
|
258 |
|
259 |
def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
|
260 |
"""
|
@@ -312,61 +322,77 @@ def btn_infer_fn(video_path,
|
|
312 |
with gr.Blocks() as demo:
|
313 |
gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
|
314 |
gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
|
315 |
-
with gr.Row():
|
316 |
-
with gr.Column():
|
317 |
-
gr.Markdown("# Preprocessing Video Stage")
|
318 |
-
gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
|
319 |
-
video_raw = gr.Video(label="Raw Video Input")
|
320 |
-
btn_pv = gr.Button("Preprocess Video")
|
321 |
-
video_input = gr.Video(label="Preprocessed Video Input")
|
322 |
-
advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
|
323 |
-
with advanced_settings_pv:
|
324 |
-
with gr.Column():
|
325 |
-
pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
|
326 |
-
pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
|
327 |
-
pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
|
328 |
-
pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
|
329 |
-
pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
|
330 |
-
pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
331 |
-
pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
332 |
-
pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
|
333 |
-
|
334 |
-
with gr.Column():
|
335 |
-
gr.Markdown("# Image Editing Stage")
|
336 |
-
gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
|
337 |
-
image_input_output = gr.Image(label="Edited Frame", type="filepath")
|
338 |
-
image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
|
339 |
-
btn_image_edit = gr.Button("Edit the first frame")
|
340 |
-
advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
|
341 |
-
with advanced_settings_image_edit:
|
342 |
-
with gr.Column():
|
343 |
-
ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
|
344 |
-
ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
|
345 |
-
ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
|
346 |
-
|
347 |
-
with gr.Column():
|
348 |
-
gr.Markdown("# Video Editing Stage")
|
349 |
-
gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
|
350 |
-
video_output = gr.Video(label="Video Output")
|
351 |
-
video_prompt = gr.Textbox(label="Video description prompt")
|
352 |
-
btn_infer = gr.Button("Run Video Editing")
|
353 |
-
settings_anyv2v = gr.Accordion("Settings for AnyV2V")
|
354 |
-
with settings_anyv2v:
|
355 |
-
with gr.Column():
|
356 |
-
av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
|
357 |
-
av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
|
358 |
-
av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
|
359 |
-
advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
|
360 |
-
with advanced_settings_anyv2v:
|
361 |
-
with gr.Column():
|
362 |
-
av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
|
363 |
-
av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
|
364 |
-
av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
|
365 |
-
av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
|
366 |
-
av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
|
367 |
-
av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
|
368 |
|
369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
examples = gr.Examples(examples=demo_examples,
|
371 |
label="Examples (Just click on Video Editing button after loading them into the UI)",
|
372 |
inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
|
@@ -374,7 +400,7 @@ with gr.Blocks() as demo:
|
|
374 |
btn_pv.click(
|
375 |
btn_preprocess_video_fn,
|
376 |
inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
|
377 |
-
outputs=video_input
|
378 |
)
|
379 |
|
380 |
btn_image_edit.click(
|
|
|
44 |
self.image_edit_model = InstructPix2Pix()
|
45 |
|
46 |
@torch.no_grad()
|
47 |
+
@spaces.GPU(duration=30)
|
48 |
def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
|
49 |
edited_image_path = infer_video(self.image_edit_model,
|
50 |
video_path,
|
|
|
240 |
else:
|
241 |
return False
|
242 |
|
243 |
+
def get_first_frame_as_pil(video_path):
|
244 |
+
with VideoFileClip(video_path) as clip:
|
245 |
+
# Extract the first frame (at t=0) as an array
|
246 |
+
first_frame_array = clip.get_frame(0)
|
247 |
+
# Convert the numpy array to a PIL Image
|
248 |
+
first_frame_image = Image.fromarray(first_frame_array)
|
249 |
+
return first_frame_image
|
250 |
+
|
251 |
if check_video(video_path) == False:
|
252 |
processed_video_path = crop_and_resize_video(input_video_path=video_path,
|
253 |
output_folder=TEMP_DIR,
|
|
|
260 |
x_offset=x_offset,
|
261 |
y_offset=y_offset,
|
262 |
longest_to_width=longest_to_width)
|
263 |
+
frame = get_first_frame_as_pil(processed_video_path)
|
264 |
+
return processed_video_path, frame
|
265 |
else:
|
266 |
+
frame = get_first_frame_as_pil(video_path)
|
267 |
+
return video_path, frame
|
268 |
|
269 |
def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
|
270 |
"""
|
|
|
322 |
with gr.Blocks() as demo:
|
323 |
gr.Markdown("# <img src='https://tiger-ai-lab.github.io/AnyV2V/static/images/icon.png' width='30'/> AnyV2V")
|
324 |
gr.Markdown("Official 🤗 Gradio demo for [AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks](https://tiger-ai-lab.github.io/AnyV2V/)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
|
326 |
|
327 |
+
with gr.Tabs():
|
328 |
+
with gr.TabItem('AnyV2V + InstructPix2Pix'):
|
329 |
+
with gr.Group():
|
330 |
+
gr.Markdown("# Preprocessing Video Stage")
|
331 |
+
gr.Markdown("AnyV2V only support video with 2 seconds duration and 8 fps. If your video is not in this format, we will preprocess it for you. Click on the Preprocess video button!")
|
332 |
+
with gr.Row():
|
333 |
+
with gr.Column():
|
334 |
+
video_raw = gr.Video(label="Raw Video Input")
|
335 |
+
btn_pv = gr.Button("Preprocess Video")
|
336 |
+
|
337 |
+
with gr.Column():
|
338 |
+
video_input = gr.Video(label="Preprocessed Video Input", interactive=False)
|
339 |
+
with gr.Column():
|
340 |
+
advanced_settings_pv = gr.Accordion("Advanced Settings for Video Preprocessing", open=False)
|
341 |
+
with advanced_settings_pv:
|
342 |
+
with gr.Column():
|
343 |
+
pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
|
344 |
+
pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
|
345 |
+
pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
|
346 |
+
pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
|
347 |
+
pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
|
348 |
+
pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
349 |
+
pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
|
350 |
+
pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")
|
351 |
+
|
352 |
+
with gr.Group():
|
353 |
+
gr.Markdown("# Image Editing Stage")
|
354 |
+
gr.Markdown("Edit the first frame of the video to your liking! Click on the Edit the first frame button after inputting the editing instruction prompt.")
|
355 |
+
with gr.Row():
|
356 |
+
with gr.Column():
|
357 |
+
src_first_frame = gr.Image(label="First Frame", type="filepath", interactive=False)
|
358 |
+
image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
|
359 |
+
btn_image_edit = gr.Button("Edit the first frame")
|
360 |
+
with gr.Column():
|
361 |
+
image_input_output = gr.Image(label="Edited Frame", type="filepath")
|
362 |
+
with gr.Column():
|
363 |
+
advanced_settings_image_edit = gr.Accordion("Advanced Settings for Image Editing", open=True)
|
364 |
+
with advanced_settings_image_edit:
|
365 |
+
with gr.Column():
|
366 |
+
ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
|
367 |
+
ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
|
368 |
+
ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
|
369 |
+
|
370 |
+
with gr.Group():
|
371 |
+
gr.Markdown("# Video Editing Stage")
|
372 |
+
gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
|
373 |
+
with gr.Row():
|
374 |
+
with gr.Column():
|
375 |
+
video_prompt = gr.Textbox(label="Video description prompt")
|
376 |
+
settings_anyv2v = gr.Accordion("Settings for AnyV2V")
|
377 |
+
with settings_anyv2v:
|
378 |
+
with gr.Column():
|
379 |
+
av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
|
380 |
+
av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
|
381 |
+
av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
|
382 |
+
btn_infer = gr.Button("Run Video Editing")
|
383 |
+
with gr.Column():
|
384 |
+
video_output = gr.Video(label="Video Output")
|
385 |
+
with gr.Column():
|
386 |
+
advanced_settings_anyv2v = gr.Accordion("Advanced Settings for AnyV2V", open=False)
|
387 |
+
with advanced_settings_anyv2v:
|
388 |
+
with gr.Column():
|
389 |
+
av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
|
390 |
+
av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
|
391 |
+
av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
|
392 |
+
av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
|
393 |
+
av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
|
394 |
+
av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")
|
395 |
+
|
396 |
examples = gr.Examples(examples=demo_examples,
|
397 |
label="Examples (Just click on Video Editing button after loading them into the UI)",
|
398 |
inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
|
|
|
400 |
btn_pv.click(
|
401 |
btn_preprocess_video_fn,
|
402 |
inputs=[video_raw, pv_width, pv_height, pv_start_time, pv_end_time, pv_center_crop, pv_x_offset, pv_y_offset, pv_longest_to_width],
|
403 |
+
outputs=[video_input, src_first_frame]
|
404 |
)
|
405 |
|
406 |
btn_image_edit.click(
|