|
import gradio as gr |
|
import spaces |
|
|
|
import os |
|
import sys |
|
import time |
|
import subprocess |
|
import shutil |
|
|
|
import random |
|
from omegaconf import OmegaConf |
|
from moviepy.editor import VideoFileClip |
|
from PIL import Image |
|
import torch |
|
import numpy as np |
|
|
|
from black_box_image_edit.instructpix2pix import InstructPix2Pix |
|
from prepare_video import crop_and_resize_video |
|
from edit_image import infer_video |
|
|
|
sys.path.insert(0, "i2vgen-xl") |
|
from utils import load_ddim_latents_at_t |
|
from pipelines.pipeline_i2vgen_xl import I2VGenXLPipeline |
|
from run_group_ddim_inversion import ddim_inversion |
|
from run_group_pnp_edit import init_pnp |
|
from diffusers import DDIMInverseScheduler, DDIMScheduler |
|
from diffusers.utils import load_image |
|
import imageio |
|
from transformers import pipeline |
|
|
|
|
|
# Korean -> English translation pipeline. The UI callbacks run user prompts
# through it before handing them to the editing models.
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-ko-en",
)

# Debug switch; nothing in the visible part of this file reads it.
DEBUG_MODE = False
|
|
|
# Rows fed to gr.Examples. Column order matches the `inputs` list given there:
#   [preprocessed video, edited first frame, video prompt,
#    pnp_f_t, pnp_spatial_attn_t, pnp_temp_attn_t]
demo_examples = [
    [
        "./demo/Man Walking.mp4",
        "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png",
        "man walking",
        0.1,
        0.1,
        1.0,
    ],
    [
        "./demo/A kitten turning its head on a wooden floor.mp4",
        "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png",
        "A dog turning its head on a wooden floor",
        0.2,
        0.2,
        0.5,
    ],
    [
        "./demo/An Old Man Doing Exercises For The Body And Mind.mp4",
        "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png",
        "a man doing exercises for the body and mind",
        0.8,
        0.8,
        1.0,
    ],
    [
        "./demo/Ballet.mp4",
        "./demo/Ballet/edited_first_frame/van gogh style.png",
        "girl dancing ballet, in the style of van gogh",
        1.0,
        1.0,
        1.0,
    ],
    [
        "./demo/A Couple In A Public Display Of Affection.mp4",
        "./demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png",
        "A couple in a public display of affection, snowing",
        0.3,
        0.3,
        1.0,
    ],
]
|
|
|
# Scratch directory that receives preprocessed videos, edited frames and the
# per-request "AnyV2V" working folder.
TEMP_DIR = "_demo_temp"

# Image editing backend, instantiated once at import time and shared by every
# call to perform_edit.
image_edit_model = InstructPix2Pix()
|
|
|
@torch.no_grad()
@spaces.GPU(duration=30)
def perform_edit(video_path, prompt, force_512=False, seed=42, negative_prompt=""):
    """Run the shared image-edit model on ``video_path`` via ``infer_video``.

    Args:
        video_path: Path of the source video.
        prompt: Editing instruction; forwarded with ``prompt_type="instruct"``.
        force_512: Forwarded unchanged to ``infer_video``.
        seed: Forwarded unchanged to ``infer_video``.
        negative_prompt: Forwarded unchanged to ``infer_video``.

    Returns:
        Whatever ``infer_video`` returns; callers use it as the path of the
        edited image.
    """
    edit_options = dict(
        output_dir=TEMP_DIR,
        prompt=prompt,
        prompt_type="instruct",
        force_512=force_512,
        seed=seed,
        negative_prompt=negative_prompt,
        overwrite=True,  # always regenerate instead of reusing a previous result
    )
    return infer_video(image_edit_model, video_path, **edit_options)
|
|
|
|
|
|
|
# Shared run configuration. perform_anyv2v overwrites several of these fields
# (image_size, n_frames, n_steps, output_dir, pnp_* thresholds) on every call.
config = OmegaConf.create(
    {
        "inverse_config": {
            "image_size": [512, 512],
            "n_frames": 16,
            "cfg": 1.0,
            "target_fps": 8,
            "ddim_inv_prompt": "",
            "prompt": "",
            "negative_prompt": "",
        },
        "pnp_config": {
            "random_ratio": 0.0,
            "target_fps": 8,
        },
    }
)

# I2VGen-XL pipeline in fp16, loaded once and moved to the first CUDA device.
pipe = I2VGenXLPipeline.from_pretrained(
    "ali-vilab/i2vgen-xl",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = pipe.to("cuda:0")

# Scheduler handed to ddim_inversion.
inverse_scheduler = DDIMInverseScheduler.from_pretrained(
    "ali-vilab/i2vgen-xl",
    subfolder="scheduler",
)

# Scheduler used for sampling; registered on the pipeline inside perform_anyv2v.
ddim_scheduler = DDIMScheduler.from_pretrained(
    "ali-vilab/i2vgen-xl",
    subfolder="scheduler",
)
|
|
|
def _read_video_frames(video_path):
    """Decode every frame of ``video_path`` into a list of PIL images."""
    with imageio.get_reader(video_path) as reader:
        return [Image.fromarray(frame) for frame in reader]


def _write_video(images, output_path, fps=24):
    """Encode ``images`` (PIL images) into a video file at ``output_path``."""
    # The context manager closes the writer even if a frame fails to encode,
    # so the output file handle is never leaked.
    with imageio.get_writer(output_path, fps=fps) as writer:
        for image in images:
            writer.append_data(np.array(image))


@torch.no_grad()
@spaces.GPU(duration=150)
def perform_anyv2v(
    video_path,
    video_prompt,
    video_negative_prompt,
    edited_first_frame_path,
    conv_inj,
    spatial_inj,
    temp_inj,
    num_inference_steps,
    guidance_scale,
    ddim_init_latents_t_idx,
    ddim_inversion_steps,
    seed,
):
    """Edit a video so that it follows an edited first frame.

    Runs DDIM inversion on the source frames, blends the inverted latents with
    random noise according to ``config.pnp_config.random_ratio``, then samples
    with ``pipe.sample_with_pnp`` and writes the result as an mp4.

    Args:
        video_path: Path of the source video.
        video_prompt: Text prompt passed to the sampler.
        video_negative_prompt: Negative prompt passed to the sampler.
        edited_first_frame_path: Path of the edited first frame; it is resized
            to the source video's frame size.
        conv_inj: Stored as ``pnp_config.pnp_f_t``.
        spatial_inj: Stored as ``pnp_config.pnp_spatial_attn_t``.
        temp_inj: Stored as ``pnp_config.pnp_temp_attn_t``.
        num_inference_steps: Number of sampling steps.
        guidance_scale: Guidance scale passed to the sampler.
        ddim_init_latents_t_idx: Index into the sampling timesteps selecting
            the inverted latents to start from; clamped to the last step.
        ddim_inversion_steps: Number of DDIM inversion steps.
        seed: Seed for the CUDA random generator.

    Returns:
        Path of the written ``edited_video.mp4`` inside ``TEMP_DIR/AnyV2V``.

    Raises:
        ValueError: If no frame can be read from ``video_path``.
    """
    # UI number fields can deliver non-integer numerics; tensor indexing and
    # torch.Generator.manual_seed need real ints.
    num_inference_steps = int(num_inference_steps)
    ddim_inversion_steps = int(ddim_inversion_steps)
    ddim_init_latents_t_idx = int(ddim_init_latents_t_idx)
    seed = int(seed)

    # Start every run from an empty working directory.
    tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")

    frame_list = _read_video_frames(str(video_path))
    if not frame_list:
        raise ValueError(f"No frames could be read from video: {video_path}")
    first_frame = frame_list[0]
    width, height = first_frame.size  # PIL reports (width, height)
    n_frames = len(frame_list)

    config.inverse_config.image_size = [width, height]
    config.inverse_config.n_steps = ddim_inversion_steps
    config.inverse_config.n_frames = n_frames
    config.inverse_config.output_dir = ddim_latents_path
    ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)

    generator = torch.Generator(device="cuda:0")
    generator = generator.manual_seed(seed)

    # Return value is not needed: the latents are loaded back from
    # ddim_latents_path below (presumably written there by ddim_inversion via
    # inverse_config.output_dir -- confirm against run_group_ddim_inversion).
    ddim_inversion(
        config.inverse_config,
        first_frame,
        frame_list,
        pipe,
        inverse_scheduler,
        generator,
    )

    edited_1st_frame = load_image(edited_first_frame_path).resize(
        (width, height), resample=Image.Resampling.LANCZOS
    )

    ddim_scheduler.set_timesteps(num_inference_steps)
    print(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}")
    ddim_latents_at_t = load_ddim_latents_at_t(
        ddim_scheduler.timesteps[ddim_init_latents_t_idx],
        ddim_latents_path=ddim_latents_path,
    )
    print(
        f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
    )
    print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")

    # Blend inverted latents with fresh noise; a ratio of 0 keeps the inverted
    # latents untouched, 1 replaces them with pure noise.
    random_ratio = config.pnp_config.random_ratio
    random_latents = torch.randn_like(ddim_latents_at_t)
    print(
        f"Blending random_ratio (1 means random latent): {config.pnp_config.random_ratio}"
    )
    mixed_latents = (
        random_latents * random_ratio + ddim_latents_at_t * (1 - random_ratio)
    )

    config.pnp_config.n_steps = num_inference_steps
    config.pnp_config.pnp_f_t = conv_inj
    config.pnp_config.pnp_spatial_attn_t = spatial_inj
    config.pnp_config.pnp_temp_attn_t = temp_inj
    config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
    init_pnp(pipe, ddim_scheduler, config.pnp_config)

    pipe.register_modules(scheduler=ddim_scheduler)

    edited_video = pipe.sample_with_pnp(
        prompt=video_prompt,
        image=edited_1st_frame,
        height=height,
        width=width,
        num_frames=n_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        negative_prompt=video_negative_prompt,
        target_fps=config.pnp_config.target_fps,
        latents=mixed_latents,
        generator=generator,
        return_dict=True,
        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
        ddim_inv_latents_path=ddim_latents_path,
        ddim_inv_prompt=config.inverse_config.ddim_inv_prompt,
        ddim_inv_1st_frame=first_frame,
    ).frames[0]

    # Bring the generated frames back to the source frame size, using the same
    # resampling enum as for the edited first frame.
    edited_video = [
        frame.resize((width, height), resample=Image.Resampling.LANCZOS)
        for frame in edited_video
    ]

    output_path = os.path.join(tmp_dir, "edited_video.mp4")
    _write_video(edited_video, output_path, fps=config.pnp_config.target_fps)
    return output_path
|
|
|
|
|
|
|
def get_first_frame_as_pil(video_path):
    """Return the frame at t=0 of ``video_path`` as a PIL image.

    This is bound to ``video_input.change``, so it can be invoked with an
    empty value (for example when the video component is reset). In that case
    ``None`` is returned instead of failing inside ``VideoFileClip``.

    Args:
        video_path: Path of the video file, or an empty value.

    Returns:
        The first frame as a ``PIL.Image.Image``, or ``None`` when no video
        path was given.
    """
    if not video_path:
        return None

    with VideoFileClip(video_path) as clip:
        # get_frame(0) yields the frame at time 0 as an array.
        return Image.fromarray(clip.get_frame(0))
|
|
|
def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
    """Return a video that is 2 seconds long at 8 fps.

    A video that already has exactly that duration and frame rate is returned
    unchanged; anything else is passed to ``crop_and_resize_video`` with the
    given crop/resize settings and the processed file's path is returned.
    """
    with VideoFileClip(video_path) as clip:
        already_conforming = clip.duration == 2 and clip.fps == 8

    if already_conforming:
        return video_path

    return crop_and_resize_video(
        input_video_path=video_path,
        output_folder=TEMP_DIR,
        clip_duration=2,
        width=width,
        height=height,
        start_time=start_time,
        end_time=end_time,
        center_crop=center_crop,
        x_offset=x_offset,
        y_offset=y_offset,
        longest_to_width=longest_to_width,
    )
|
|
|
def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg_prompt):
    """Generate an edited image from the video and a text instruction.

    The instruction is first run through the Korean -> English ``translator``;
    the translated text is what ``perform_edit`` receives.

    Args:
        video_path: Path of the preprocessed video.
        instruct_prompt: Editing instruction typed by the user.
        ie_force_512: Forwarded to ``perform_edit`` as ``force_512``.
        ie_seed: Seed from the UI. A negative or missing value means
            "pick a random seed".
        ie_neg_prompt: Negative prompt forwarded to ``perform_edit``.

    Returns:
        The value returned by ``perform_edit`` (path of the edited image).
    """
    translated_prompt = translator(instruct_prompt, src_lang="ko", tgt_lang="en")[0]['translation_text']

    # An emptied number field arrives as None; comparing None with 0 would
    # raise TypeError, so treat it like the "-1 means random" case.
    if ie_seed is None or ie_seed < 0:
        ie_seed = int.from_bytes(os.urandom(2), "big")
    else:
        # Seeds must be integers even if the UI delivered a float.
        ie_seed = int(ie_seed)
    print(f"Using seed: {ie_seed}")

    return perform_edit(
        video_path=video_path,
        prompt=translated_prompt,
        force_512=ie_force_512,
        seed=ie_seed,
        negative_prompt=ie_neg_prompt,
    )
|
|
|
|
|
def btn_infer_fn(video_path,
                 video_prompt,
                 video_negative_prompt,
                 edited_first_frame_path,
                 conv_inj,
                 spatial_inj,
                 temp_inj,
                 num_inference_steps,
                 guidance_scale,
                 ddim_init_latents_t_idx,
                 ddim_inversion_steps,
                 seed,
                 ):
    """Translate the video prompt, resolve the seed and run ``perform_anyv2v``.

    The video prompt is run through the Korean -> English ``translator``; the
    negative prompt is forwarded untranslated. All remaining arguments are
    passed through to ``perform_anyv2v`` unchanged.

    Args:
        seed: Seed from the UI. A negative or missing value means
            "pick a random seed".

    Returns:
        The value returned by ``perform_anyv2v`` (path of the edited video).
    """
    translated_video_prompt = translator(video_prompt, src_lang="ko", tgt_lang="en")[0]['translation_text']

    # An emptied number field arrives as None; comparing None with 0 would
    # raise TypeError, so treat it like the "-1 means random" case.
    if seed is None or seed < 0:
        seed = int.from_bytes(os.urandom(2), "big")
    else:
        # torch.Generator.manual_seed only accepts integers.
        seed = int(seed)
    print(f"Using seed: {seed}")

    return perform_anyv2v(
        video_path=video_path,
        video_prompt=translated_video_prompt,
        video_negative_prompt=video_negative_prompt,
        edited_first_frame_path=edited_first_frame_path,
        conv_inj=conv_inj,
        spatial_inj=spatial_inj,
        temp_inj=temp_inj,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
        ddim_inversion_steps=ddim_inversion_steps,
        seed=seed,
    )
|
|
|
|
|
|
|
# NOTE(review): the original text of this section carried no indentation, so
# the nesting of the layout contexts below is reconstructed from the usual
# Row > Column > Accordion pattern -- confirm against the rendered UI.
with gr.Blocks() as demo:
    gr.Markdown("# text + video + image")

    with gr.Tabs():
        with gr.TabItem('AnyV2V(I2VGenXL) + InstructPix2Pix'):
            # --- Stage 1: video preprocessing ---------------------------------
            with gr.Row():
                with gr.Column():
                    video_raw = gr.Video(label="Raw Video Input")
                    btn_pv = gr.Button("Preprocess Video")

                with gr.Column():
                    video_input = gr.Video(label="Preprocessed Video Input", interactive=False)

                with gr.Column():
                    with gr.Accordion("Advanced Settings for Video Preprocessing", open=False) as advanced_settings_pv:
                        with gr.Column():
                            pv_width = gr.Number(label="Width", value=512, minimum=1, maximum=4096)
                            pv_height = gr.Number(label="Height", value=512, minimum=1, maximum=4096)
                            pv_start_time = gr.Number(label="Start Time (End time - Start time must be = 2)", value=0, minimum=0)
                            pv_end_time = gr.Number(label="End Time (End time - Start time must be = 2)", value=2, minimum=0)
                            pv_center_crop = gr.Checkbox(label="Center Crop", value=True)
                            pv_x_offset = gr.Number(label="Horizontal Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
                            pv_y_offset = gr.Number(label="Vertical Offset (-1 to 1)", value=0, minimum=-1, maximum=1)
                            pv_longest_to_width = gr.Checkbox(label="Resize Longest Dimension to Width")

            # --- Stage 2: first-frame image editing ---------------------------
            gr.Markdown("# Image Editing Stage")
            with gr.Row():
                with gr.Column():
                    src_first_frame = gr.Image(label="First Frame", type="filepath", interactive=False)
                    image_instruct_prompt = gr.Textbox(label="Editing instruction prompt")
                    btn_image_edit = gr.Button("Edit the first frame")

                with gr.Column():
                    image_input_output = gr.Image(label="Edited Frame", type="filepath")

                with gr.Column():
                    with gr.Accordion("Advanced Settings for Image Editing", open=True) as advanced_settings_image_edit:
                        with gr.Column():
                            ie_neg_prompt = gr.Textbox(label="Negative Prompt", value="low res, blurry, watermark, jpeg artifacts")
                            ie_seed = gr.Number(label="Seed (-1 means random)", value=-1, minimum=-1, maximum=sys.maxsize)
                            ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")

            # --- Stage 3: video editing ---------------------------------------
            gr.Markdown("# Video Editing Stage")
            with gr.Row():
                with gr.Column():
                    video_prompt = gr.Textbox(label="Video description prompt")
                    with gr.Accordion("Settings for AnyV2V") as settings_anyv2v:
                        with gr.Column():
                            av_pnp_f_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Convolutional injection (pnp_f_t)")
                            av_pnp_spatial_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label="Spatial Attention injection (pnp_spatial_attn_t)")
                            av_pnp_temp_attn_t = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Temporal Attention injection (pnp_temp_attn_t)")
                    btn_infer = gr.Button("Run Video Editing")

                with gr.Column():
                    video_output = gr.Video(label="Video Output")

                with gr.Column():
                    with gr.Accordion("Advanced Settings for AnyV2V", open=False) as advanced_settings_anyv2v:
                        with gr.Column():
                            av_ddim_init_latents_t_idx = gr.Number(label="DDIM Initial Latents t Index", value=0, minimum=0)
                            av_ddim_inversion_steps = gr.Number(label="DDIM Inversion Steps", value=100, minimum=1)
                            av_num_inference_steps = gr.Number(label="Number of Inference Steps", value=50, minimum=1)
                            av_guidance_scale = gr.Number(label="Guidance Scale", value=9, minimum=0)
                            av_seed = gr.Number(label="Seed (-1 means random)", value=42, minimum=-1, maximum=sys.maxsize)
                            av_neg_prompt = gr.Textbox(label="Negative Prompt", value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms")

            # Example rows fill the preprocessed video, the edited frame, the
            # prompt and the three injection sliders.
            examples = gr.Examples(
                examples=demo_examples,
                label="Examples (Just click on Video Editing button after loading them into the UI)",
                inputs=[
                    video_input,
                    image_input_output,
                    video_prompt,
                    av_pnp_f_t,
                    av_pnp_spatial_attn_t,
                    av_pnp_temp_attn_t,
                ],
            )
            gr.Markdown('Reference: You can find good source videos from https://www.pexels.com/videos/')

    # --- Event wiring ---------------------------------------------------------
    # Raw video -> preprocessed video.
    btn_pv.click(
        fn=btn_preprocess_video_fn,
        inputs=[
            video_raw,
            pv_width,
            pv_height,
            pv_start_time,
            pv_end_time,
            pv_center_crop,
            pv_x_offset,
            pv_y_offset,
            pv_longest_to_width,
        ],
        outputs=video_input,
    )

    # Preprocessed video + instruction -> edited frame.
    btn_image_edit.click(
        fn=btn_image_edit_fn,
        inputs=[
            video_input,
            image_instruct_prompt,
            ie_force_512,
            ie_seed,
            ie_neg_prompt,
        ],
        outputs=image_input_output,
    )

    # Preprocessed video + edited frame + settings -> edited video. The order
    # of this list must match the positional parameters of btn_infer_fn.
    btn_infer.click(
        fn=btn_infer_fn,
        inputs=[
            video_input,
            video_prompt,
            av_neg_prompt,
            image_input_output,
            av_pnp_f_t,
            av_pnp_spatial_attn_t,
            av_pnp_temp_attn_t,
            av_num_inference_steps,
            av_guidance_scale,
            av_ddim_init_latents_t_idx,
            av_ddim_inversion_steps,
            av_seed,
        ],
        outputs=video_output,
    )

    # Refresh the "First Frame" preview whenever the preprocessed video changes.
    video_input.change(fn=get_first_frame_as_pil, inputs=video_input, outputs=src_first_frame)

# Inference only: switch autograd off globally before serving requests.
torch.set_grad_enabled(False)

demo.launch()
|
|