import argparse
import os
import tempfile
import gradio as gr
import numpy as np
import torch
from glob import glob
from torchvision.transforms import CenterCrop, Compose, Resize
from gradio_utils.camera_utils import CAMERA_MOTION_MODE, process_camera, create_relative
from gradio_utils.utils import vis_camera
from gradio_utils.motionctrl_cmcm_gradio import build_model, motionctrl_sample
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort — confirm.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# Value of the SPACE_ID environment variable, or '' when it is not set.
SPACE_ID = os.environ.get('SPACE_ID', '')
#### Description ####
# Markdown snippets rendered by `main`: `title`, `subtitle` and `description`
# are shown at the top of the page, `article` at the bottom.
title = r"""
MotionCtrl: A Unified and Flexible Motion Controller for Video Generation
"""
subtitle = r"""Deployed on SVD Generation
"""
description = r"""
Official Gradio demo for MotionCtrl: A Unified and Flexible Motion Controller for Video Generation.
🔥 MotionCtrl is capable of independently and flexibly controling the camera motion and object motion of a generated video, with only a unified model.
🤗 Try to control the motion of the generated videos yourself!
❗❗❗ Please note **ONLY** Camera Motion Control in the current version of **MotionCtrl** deployed on **SVD** is avaliable.
❗❗❗ Showcases and
Tutorial can be found
here
.
"""
# Footer markdown: star badge, BibTeX citation and contact address.
article = r"""
If MotionCtrl is helpful, please help to ⭐ the Github Repo. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/TencentARC%2FMotionCtrl
)](https://github.com/TencentARC/MotionCtrl)
---
📝 **Citation**
If our work is useful for your research, please consider citing:
```bibtex
@inproceedings{wang2023motionctrl,
title={MotionCtrl: A Unified and Flexible Motion Controller for Video Generation},
author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Yin},
booktitle={arXiv preprint arXiv:2312.03641},
year={2023}
}
```
📧 **Contact**
If you have any questions, please feel free to reach me out at wzhoux@connect.hku.hk.
"""
# Custom stylesheet for the demo widgets.
# NOTE(review): `main` builds `gr.Blocks()` without passing this string, so it
# looks unused in the visible code — confirm whether that is intentional.
css = """
.gradio-container {width: 85% !important}
.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important;}
span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; color: #d30f2f !important;}
button {border-radius: 8px !important;}
.add_button {background-color: #4CAF50 !important;}
.remove_button {background-color: #f44336 !important;}
.clear_button {background-color: gray !important;}
.mask_button_group {gap: 10px !important;}
.video {height: 300px !important;}
.image {height: 300px !important;}
.video .wrap.svelte-lcpz3o {display: flex !important; align-items: center !important; justify-content: center !important;}
.video .wrap.svelte-lcpz3o > :first-child {height: 100% !important;}
.margin_center {width: 50% !important; margin: auto !important;}
.jc_center {justify-content: center !important;}
"""
# Unit translation directions in the world-to-camera (W2C) frame.
# The per-row notes are translated from the original comments.
T_base = [
    [1., 0., 0.],    # +x in W2C: camera towards the left (left)
    [-1., 0., 0.],   # -x in W2C: camera towards the right (right)
    [0., 1., 0.],    # +y in W2C: camera towards the top (up)
    [0., -1., 0.],   # -y in W2C: camera towards the bottom (down)
    [0., 0., 1.],    # +z in W2C: "camera moves forward", zoom out
    [0., 0., -1.],   # -z in W2C: "camera moves forward", zoom in
]

radius = 1
n = 16  # number of poses in the placeholder trajectory below
look_at = np.array([0, 0, 0.8]).reshape(3, 1)
T_list = []
base_R = np.eye(3)
res = []
res_forsave = []
T_range = 1.8

# Placeholder trajectory: identity rotation with the translation sliding along
# +z from 0 towards 2 in n equal steps. Each pose is a 3x4 [R | T] matrix.
for i in range(n):
    R = base_R[:, :3]
    T = np.array([[0.], [0.], [1.]]) * (i / n) * 2
    RT = np.hstack([R, T])
    res.append(RT)

# Figure used as the initial value of the camera-pose plot in `main`.
fig = vis_camera(res)
# MODE = ["camera motion control", "object motion control", "camera + object motion control"]
# Control-mode labels. NOTE(review): not referenced anywhere in the visible code.
MODE = ["control camera poses", "control object trajectory", "control both camera and object motion"]
# Button labels for the two image resize strategies; `process_input_image`
# receives the clicked label and compares it against RESIZE_MODE[0].
RESIZE_MODE = ['Center Crop To 576x1024', 'Keep original spatial ratio']
# Button labels for the three customized camera-pose modes.
DIY_MODE = ['Customized Mode 1: First A then B',
            'Customized Mode 2: Both A and B',
            'Customized Mode 3: RAW Camera Poses']
## load default model
num_frames = 14  # frames per generated clip
num_steps = 25   # passed to build_model together with num_frames
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")
config = "configs/inference/config_motionctrl_cmcm.yaml"
ckpt = 'checkpoints/motionctrl_svd.ckpt'
if not os.path.exists(ckpt):
    # Fetch the checkpoint on first run. This avoids shelling out to
    # wget/mkdir/mv, which needed those tools on PATH, left the '?' in the
    # URL and filename unquoted, and ignored every exit status.
    # download_url_to_file raises on failure and only moves the file into
    # place once the transfer has completed, so an interrupted download does
    # not leave a truncated checkpoint behind.
    os.makedirs(os.path.dirname(ckpt), exist_ok=True)
    torch.hub.download_url_to_file(
        'https://huggingface.co/TencentARC/MotionCtrl/resolve/main/motionctrl_svd.ckpt?download=true',
        ckpt,
        progress=True,
    )
model = build_model(config, ckpt, device, num_frames, num_steps)
# Output resolution; overwritten by process_input_image for each processed image.
width, height = 1024, 576
# NOTE(review): traj_list is not referenced anywhere in the visible code.
traj_list = []
# Module-level camera selection shared by all UI callbacks:
#   motion  - list of selected basic pose labels (callbacks keep at most two)
#   mode    - label of the active customized mode
#   speed   - value of the "Motion Speed" slider
#   complex - label of the selected provided complex pose, or None
camera_dict = {
    "motion":[],
    "mode": "Customized Mode 1: First A then B", # "First A then B", "Both A and B", "Custom"
    "speed": 1.0,
    "complex": None
}
def fn_vis_camera(camera_args):
    """Plot the currently selected camera poses and reveal the generation step.

    Args:
        camera_args: Content of the camera textbox, forwarded to
            ``process_camera`` together with the module-level ``camera_dict``.

    Returns:
        A 9-tuple: the camera figure followed by eight ``gr.update`` objects
        that make the step-3 widgets visible. The seventh update also clears
        the previously generated video (``value=None``).
    """
    poses = process_camera(
        camera_dict, camera_args,
        num_frames=num_frames, width=width, height=height,
    )  # [t, 3, 4]

    # Shrink the plot only when the largest translation component would
    # exceed 1.9; never scale up.
    largest_offset = np.max(np.abs(poses[:, :, -1]))
    rescale_T = max(1.0, largest_offset / 1.9)
    camera_fig = vis_camera(create_relative(poses), rescale_T=rescale_T)

    # A fresh update object is created for every output component.
    return (
        camera_fig,
        *(gr.update(visible=True) for _ in range(6)),
        gr.update(visible=True, value=None),
        gr.update(visible=True),
    )
def display_camera_info(camera_dict, camera_mode=None):
    """Render the camera selection as the one-line summary shown in the UI.

    Args:
        camera_dict: Mapping with the keys ``complex``, ``motion``, ``speed``
            and ``mode``.
        camera_mode: Currently selected camera-control mode. When it equals
            ``CAMERA_MOTION_MODE[2]`` the combine mode is appended as well.

    Returns:
        The summary string. For a complex pose only the pose name and speed
        are reported.
    """
    if camera_dict['complex'] is not None:
        return f"complex : {camera_dict['complex']}. " + f"speed : {camera_dict['speed']}. "

    pieces = [
        f"motion : {list(camera_dict['motion'])}. ",
        f"speed : {camera_dict['speed']}. ",
    ]
    if camera_mode == CAMERA_MOTION_MODE[2]:
        pieces.append(f"mode : {camera_dict['mode']}. ")
    return "".join(pieces)
def add_camera_motion(camera_motion, camera_mode):
    """Record a basic camera pose chosen by the user.

    In the customized mode (``CAMERA_MOTION_MODE[2]``) up to two poses are
    accumulated; in every other case the new pose replaces the selection.
    Any previously chosen complex pose is discarded.

    Args:
        camera_motion: Label of the clicked pose button.
        camera_mode: Currently selected camera-control mode.

    Returns:
        The refreshed summary string for the camera textbox.
    """
    global camera_dict
    camera_dict['complex'] = None

    selected = camera_dict['motion']
    may_stack = camera_mode == CAMERA_MOTION_MODE[2] and len(selected) < 2
    if may_stack:
        selected.append(camera_motion)
    else:
        camera_dict['motion'] = [camera_motion]
    return display_camera_info(camera_dict, camera_mode)
def add_complex_camera_motion(camera_motion):
    """Select one of the provided complex camera poses.

    Args:
        camera_motion: Label of the clicked ``Pose_*`` button.

    Returns:
        The refreshed summary string for the camera textbox.
    """
    global camera_dict
    camera_dict.update(complex=camera_motion)
    return display_camera_info(camera_dict)
def input_raw_camera_pose(combine_type, camera_mode):
    """Switch to raw-pose input and pre-fill the textbox with an example.

    Args:
        combine_type: Label of the clicked customized-mode button; stored as
            the active mode in ``camera_dict``.
        camera_mode: Currently selected camera-control mode (unused here).

    Returns:
        An 11-tuple of ``gr.update`` objects: the editable textbox holding 14
        example poses (one flattened 3x4 matrix per line), eight updates that
        hide the basic pose buttons, one that shows the speed slider and one
        that shows the raw-pose instructions.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    # Example trajectory: identity rotation, z translation decreasing linearly.
    z_offsets = (
        "0", "-0.225", "-0.45", "-0.675", "-0.9", "-1.125", "-1.35",
        "-1.575", "-1.8", "-2.025", "-2.25", "-2.475", "-2.7", "-2.925",
    )
    example_poses = "".join(f"1 0 0 0 0 1 0 0 0 0 1 {z}\n" for z in z_offsets)

    return (
        gr.update(value=example_poses, max_lines=16, interactive=True),
        *(gr.update(visible=False) for _ in range(8)),  # U, D, L, R, I, O, ACW, CW
        gr.update(visible=True),                        # speed slider
        gr.update(visible=True),                        # raw-pose instructions
    )
def change_camera_mode(combine_type, camera_mode):
    """Activate one of the two "combine basic poses" customized modes.

    Args:
        combine_type: Label of the clicked customized-mode button; stored as
            the active mode in ``camera_dict``.
        camera_mode: Currently selected camera-control mode.

    Returns:
        An 11-tuple: the refreshed summary string, eight updates showing the
        basic pose buttons, one showing the speed slider and one hiding the
        raw-pose instructions.
    """
    global camera_dict
    camera_dict['mode'] = combine_type
    summary = display_camera_info(camera_dict, camera_mode)

    return (
        summary,
        *(gr.update(visible=True) for _ in range(8)),  # U, D, L, R, I, O, ACW, CW
        gr.update(visible=True),                       # speed slider
        gr.update(visible=False),                      # raw-pose instructions
    )
def change_camera_speed(camera_speed):
    """Store the new motion speed and return the refreshed summary string.

    Args:
        camera_speed: Current value of the "Motion Speed" slider.
    """
    global camera_dict
    camera_dict.update(speed=camera_speed)
    return display_camera_info(camera_dict)
def reset_camera():
    """Replace the shared camera selection with its defaults.

    The module-level ``camera_dict`` is rebound to a new dictionary, not
    cleared in place.

    Returns:
        The summary string describing the reset selection.
    """
    global camera_dict
    camera_dict = dict(
        motion=[],
        mode="Customized Mode 1: First A then B",
        speed=1.0,
        complex=None,
    )
    return display_camera_info(camera_dict)
def visualized_camera_poses(step2_camera_motion):
    """Show the widgets belonging to the chosen camera-control mode.

    The camera selection is reset first. The widgets of the selected mode are
    then revealed, the shared camera widgets are shown (textbox and plot are
    cleared), and every step-3 generation widget is hidden again.

    Args:
        step2_camera_motion: Selected entry of ``CAMERA_MOTION_MODE``. Index 0
            is the basic mode, index 1 the provided complex poses; anything
            else is treated as the customized mode.

    Returns:
        A 39-tuple of ``gr.update`` objects in the order expected by the
        ``camera_info.click`` outputs.
    """
    reset_camera()

    basic = step2_camera_motion == CAMERA_MOTION_MODE[0]
    provided = not basic and step2_camera_motion == CAMERA_MOTION_MODE[1]
    custom = not (basic or provided)

    mode_flags = (
        [basic, basic]          # basic section title + description
        + [custom, custom]      # custom section title + description
        + [provided, provided]  # complex section title + description
        + [basic] * 8           # U, D, L, R, I, O, ACW, CW
        + [custom] * 3          # combine1, combine2, combine3
        + [False]               # combine3_des stays hidden until combine3 is clicked
        + [not custom]          # speed slider
        + [provided] * 8        # Pose_1 .. Pose_8
    )

    updates = [gr.update(visible=flag) for flag in mode_flags]
    updates += [
        gr.update(visible=True, value=None),  # camera_args (cleared)
        gr.update(visible=True),              # camera_reset
        gr.update(visible=True),              # camera_vis
        gr.update(visible=True, value=None),  # camera plot (cleared)
    ]
    # Hide the whole generation step until the camera has been visualized.
    updates += [gr.update(visible=False) for _ in range(8)]
    return tuple(updates)
def process_input_image(input_image, resize_mode):
    """Resize the uploaded image and reveal the camera-mode selection step.

    With ``RESIZE_MODE[0]`` the image is scaled to cover 576x1024 and then
    center-cropped to exactly that size. Otherwise the aspect ratio is kept:
    portrait images get a height of 576, all others a width of 1024.
    The module-level ``width`` and ``height`` are updated to the final size.

    Args:
        input_image: PIL image from the upload widget, or ``None`` when
            nothing has been uploaded yet.
        resize_mode: Label of the clicked resize button.

    Returns:
        A 44-tuple of ``gr.update`` objects in the order expected by the
        resize buttons' outputs: the processed image, four updates showing the
        step-2 mode selector, and 39 updates hiding everything downstream.

    Raises:
        gr.Error: If no image has been uploaded.
    """
    global width, height

    # Gradio passes None when a resize button is clicked before an upload;
    # report that clearly instead of failing on ``None.size``.
    if input_image is None:
        raise gr.Error("Please upload an image before selecting a resize mode.")

    src_w, src_h = input_image.size
    if resize_mode == RESIZE_MODE[0]:
        height, width = 576, 1024
        h_ratio = src_h / height
        w_ratio = src_w / width
        # Scale along the tighter axis so the image covers the target size;
        # the max() guards against int() truncating below the target.
        if h_ratio > w_ratio:
            scaled_h = max(int(src_h / w_ratio), height)
            input_image = Resize((scaled_h, width))(input_image)
        else:
            scaled_w = max(int(src_w / h_ratio), width)
            input_image = Resize((height, scaled_w))(input_image)
        input_image = CenterCrop((height, width))(input_image)
    else:
        if src_h > src_w:
            height = 576
            width = int(src_w * height / src_h)
        else:
            width = 1024
            height = int(src_h * width / src_w)
        input_image = Resize((height, width))(input_image)

    # A fresh update object is created for every output component.
    return (
        gr.update(visible=True, value=input_image, height=height, width=width),
        # step-2 title, description, mode radio and "Proceed" button
        *(gr.update(visible=True) for _ in range(4)),
        # 6 section texts, 8 basic pose buttons, 3 combine buttons,
        # combine3_des, speed slider and 8 Pose_* buttons
        *(gr.update(visible=False) for _ in range(27)),
        gr.update(visible=False, value=None),  # camera_args (cleared)
        gr.update(visible=False),              # camera_reset
        gr.update(visible=False),              # camera_vis
        gr.update(visible=False, value=None),  # camera plot (cleared)
        # step-3 generation widgets
        *(gr.update(visible=False) for _ in range(8)),
    )
def model_run(input_image, fps_id, seed, n_samples, camera_args):
    """Generate a video for the processed image and the selected camera poses.

    Args:
        input_image: Processed image from step 1.
        fps_id: Value of the "FPS" slider.
        seed: Random seed forwarded to the sampler.
        n_samples: Number of samples, forwarded as ``sample_num``.
        camera_args: Content of the camera textbox, forwarded to
            ``process_camera``.

    Returns:
        Whatever ``motionctrl_sample`` returns; it is wired to the video
        output component.
    """
    poses = process_camera(
        camera_dict, camera_args,
        num_frames=num_frames, width=width, height=height,
    )
    flat_poses = poses.reshape(-1, 12)  # one flattened 3x4 matrix per row

    sampler_kwargs = dict(
        model=model,
        image=input_image,
        RT=flat_poses,
        num_frames=num_frames,
        fps_id=fps_id,
        decoding_t=1,
        seed=seed,
        sample_num=n_samples,
        device=device,
    )
    return motionctrl_sample(**sampler_kwargs)
def main(args):
    """Build the three-step Gradio UI, wire its callbacks and launch it.

    Args:
        args: Keyword arguments forwarded verbatim to ``launch()``.
    """
    tips_colors = {"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}
    speed_tips = (
        "1. If the motion control is not obvious, try to increase the `Motion Speed`. "
        "\n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` "
        "or increase `FPS`."
    )

    demo = gr.Blocks()
    with demo:
        gr.Markdown(title)
        gr.Markdown(subtitle)
        gr.Markdown(description)

        with gr.Column():
            # ---- Step 0: general tips ----
            gr.Markdown("## Step 0/3: Some Useful Tricks", show_label=False)
            gr.HighlightedText(value=[("", ""), (speed_tips, "Normal")],
                               color_map=dict(tips_colors), visible=True)

            # ---- Step 1: input image ----
            step2_title = gr.Markdown("---\n## Step 1/3: Input an Image", show_label=False, visible=True)
            step2_dec = gr.Markdown(
                "\n 1. Upload an Image by `Drag` or Click `Upload Image`; "
                f"\n 2. Click `{RESIZE_MODE[0]}` or `{RESIZE_MODE[1]}` to select the image resize mode. "
                "You will get a processed image and go into the next step. "
                f"\n - `{RESIZE_MODE[0]}`: Our MotionCtrl is train on image with spatial size 576x1024. Choose `{RESIZE_MODE[0]}` can get better generated video. "
                f"\n - `{RESIZE_MODE[1]}`: Choose `{RESIZE_MODE[1]}` if you want to generate video with the same spatial ratio as the input image.",
                show_label=False, visible=True)

            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    input_image = gr.Image(type="pil", interactive=True, elem_id="input_image", elem_classes='image', visible=True)
                    with gr.Row():
                        center_crop_button = gr.Button(value=RESIZE_MODE[0], visible=True)
                        keep_ratio_button = gr.Button(value=RESIZE_MODE[1], visible=True)
                with gr.Column(scale=2):
                    process_image = gr.Image(type="pil", interactive=False, elem_id="process_image", elem_classes='image', visible=False)

            # ---- Step 2: camera motion control ----
            step2_camera_motion = gr.Markdown("---\n## Step 2/3: Select the camera poses", show_label=False, visible=False)
            step2_camera_motion_des = gr.Markdown(
                f"\n - {CAMERA_MOTION_MODE[0]}: Including 8 basic camera poses, such as pan up, pan down, zoom in, and zoom out. "
                f"\n - {CAMERA_MOTION_MODE[1]}: Complex camera poses extracted from the real videos. "
                f"\n - {CAMERA_MOTION_MODE[2]}: You can customize complex camera poses yourself by combining or fusing two of the eight basic camera poses or input RAW RT matrix. "
                "\n - Click `Proceed` to go into next step",
                show_label=False, visible=False)
            camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
            camera_info = gr.Button(value="Proceed", visible=False)

            with gr.Row():
                with gr.Column():
                    # Step 2.1 - basic camera poses
                    basic_camera_motion = gr.Markdown("---\n### Basic Camera Poses", show_label=False, visible=False)
                    basic_camera_motion_des = gr.Markdown(
                        "\n 1. Click one of the basic camera poses, such as `Pan Up`; "
                        "\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; "
                        "\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; "
                        "\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)

                    # Step 2.2 - provided complex camera poses
                    complex_camera_motion = gr.Markdown("---\n### Provided Complex Camera Poses", show_label=False, visible=False)
                    complex_camera_motion_des = gr.Markdown(
                        "\n 1. Click one of the complex camera poses, such as `Pose_1`; "
                        "\n 2. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; "
                        "\n 3. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)

                    # Step 2.3 - customized camera poses
                    custom_camera_motion = gr.Markdown(f"---\n### {CAMERA_MOTION_MODE[2]}", show_label=False, visible=False)
                    custom_run_status = gr.Markdown(
                        f"\n 1. Click `{DIY_MODE[0]}`, `{DIY_MODE[1]}`, or `{DIY_MODE[2]}` "
                        "\n - `Customized Mode 1: First A then B`: For example, click `Pan Up` and `Pan Left`, the camera will first `Pan Up` and then `Pan Left`; "
                        "\n - `Customized Mode 2: Both A and B`: For example, click `Pan Up` and `Pan Left`, the camera will move towards the upper left corner; "
                        f"\n - `{DIY_MODE[2]}`: Input the RAW RT matrix yourselves. "
                        "\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; "
                        "\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; "
                        "\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)

                    gr.HighlightedText(value=[("", ""), ("1. Select two of the basic camera poses; 2. Select Customized Mode 1 OR Customized Mode 2. 3. Visualized Camera to show the customized camera poses", "Normal")],
                                       color_map=dict(tips_colors), visible=False)

                    with gr.Row():
                        combine1 = gr.Button(value=DIY_MODE[0], visible=False)
                        combine2 = gr.Button(value=DIY_MODE[1], visible=False)
                        combine3 = gr.Button(value=DIY_MODE[2], visible=False)
                    with gr.Row():
                        combine3_des = gr.Markdown(
                            "---\n#### Input your camera pose in the following textbox. "
                            "A total of 14 lines and each line contains 12 float number, indicated "
                            "the RT matrix in the shape of 1x12. "
                            "The example is RT matrix of ZOOM IN.",
                            show_label=False, visible=False)

                    with gr.Row():
                        U = gr.Button(value="Pan Up", visible=False)
                        D = gr.Button(value="Pan Down", visible=False)
                        L = gr.Button(value="Pan Left", visible=False)
                        R = gr.Button(value="Pan Right", visible=False)
                    with gr.Row():
                        I = gr.Button(value="Zoom In", visible=False)
                        O = gr.Button(value="Zoom Out", visible=False)
                        ACW = gr.Button(value="ACW", visible=False)
                        CW = gr.Button(value="CW", visible=False)

                    with gr.Row():
                        speed = gr.Slider(minimum=0, maximum=8, step=0.2, label="Motion Speed", value=1.0, visible=False)

                    with gr.Row():
                        Pose_1 = gr.Button(value="Pose_1", visible=False)
                        Pose_2 = gr.Button(value="Pose_2", visible=False)
                        Pose_3 = gr.Button(value="Pose_3", visible=False)
                        Pose_4 = gr.Button(value="Pose_4", visible=False)
                    with gr.Row():
                        Pose_5 = gr.Button(value="Pose_5", visible=False)
                        Pose_6 = gr.Button(value="Pose_6", visible=False)
                        Pose_7 = gr.Button(value="Pose_7", visible=False)
                        Pose_8 = gr.Button(value="Pose_8", visible=False)

                    with gr.Row():
                        camera_args = gr.Textbox(value="Camera Type", label="Camera Type", visible=False)
                    with gr.Row():
                        camera_vis = gr.Button(value="Visualize Camera and Proceed", visible=False)
                        camera_reset = gr.Button(value="Reset Camera", visible=False)

                with gr.Column():
                    camera_plot = gr.Plot(fig, label='Camera Poses', visible=False)

            # ---- Step 3: generate videos ----
            with gr.Row():
                with gr.Column():
                    step3_prompt_generate = gr.Markdown("---\n## Step 3/3: Generate videos", show_label=False, visible=False)
                    generation_dec = gr.Markdown(
                        "\n 1. Set `FPS`.; "
                        "\n 2. Set `n_samples`; "
                        "\n 3. Set `seed`; "
                        "\n 4. Click `Start generation !` to generate videos; ",
                        visible=False)
                    # Named `prompt` for historical reasons; it is the FPS slider.
                    prompt = gr.Slider(minimum=5, maximum=30, step=1, label="FPS", value=10, visible=False)
                    n_samples = gr.Number(value=1, precision=0, interactive=True, label="n_samples", visible=False)
                    seed = gr.Number(value=1234, precision=0, interactive=True, label="Seed", visible=False)
                    start = gr.Button(value="Start generation !", visible=False)
                with gr.Column():
                    gen_video = gr.Video(value=None, label="Generate Video", visible=False)
                    repeat_highlight = gr.HighlightedText(value=[("", ""), (speed_tips, "Normal")],
                                                          color_map=dict(tips_colors), visible=False)

        # ---- Event wiring ----
        # The output lists must stay in the exact order of the tuples returned
        # by the corresponding callbacks.
        basic_pose_buttons = [U, D, L, R, I, O, ACW, CW]
        complex_pose_buttons = [Pose_1, Pose_2, Pose_3, Pose_4, Pose_5, Pose_6, Pose_7, Pose_8]
        camera_panel_outputs = [
            basic_camera_motion, basic_camera_motion_des,
            custom_camera_motion, custom_run_status,
            complex_camera_motion, complex_camera_motion_des,
            *basic_pose_buttons,
            combine1, combine2, combine3, combine3_des,
            speed,
            *complex_pose_buttons,
            camera_args,
            camera_reset, camera_vis,
            camera_plot,
        ]
        generation_outputs = [
            step3_prompt_generate, generation_dec, prompt,
            n_samples, seed, start, gen_video, repeat_highlight,
        ]
        step2_selector_outputs = [
            process_image, step2_camera_motion, step2_camera_motion_des,
            camera_mode, camera_info,
        ]

        # A button used as an input passes its label, which tells
        # process_input_image which resize mode was clicked.
        for resize_button in (center_crop_button, keep_ratio_button):
            resize_button.click(
                fn=process_input_image,
                inputs=[input_image, resize_button],
                outputs=step2_selector_outputs + camera_panel_outputs + generation_outputs,
            )

        camera_info.click(
            fn=visualized_camera_poses,
            inputs=[camera_mode],
            outputs=camera_panel_outputs + generation_outputs,
        )

        for pose_button in basic_pose_buttons:
            pose_button.click(fn=add_camera_motion, inputs=[pose_button, camera_mode], outputs=camera_args)

        speed.change(fn=change_camera_speed, inputs=speed, outputs=camera_args)
        camera_reset.click(fn=reset_camera, inputs=None, outputs=[camera_args])

        for combine_button, handler in (
            (combine1, change_camera_mode),
            (combine2, change_camera_mode),
            (combine3, input_raw_camera_pose),
        ):
            combine_button.click(
                fn=handler,
                inputs=[combine_button, camera_mode],
                outputs=[camera_args, *basic_pose_buttons, speed, combine3_des],
            )

        camera_vis.click(fn=fn_vis_camera, inputs=[camera_args],
                         outputs=[camera_plot] + generation_outputs)

        for pose_button in complex_pose_buttons:
            pose_button.click(fn=add_complex_camera_motion, inputs=pose_button, outputs=camera_args)

        start.click(fn=model_run,
                    inputs=[process_image, prompt, seed, n_samples, camera_args],
                    outputs=gen_video)

        # ---- Examples and footer ----
        gr.Markdown("## Examples")
        examples = glob(os.path.join(os.path.dirname(__file__), "./assets/demo/images", "*.png"))
        gr.Examples(
            examples=examples,
            inputs=[input_image],
            examples_per_page=15
        )

        gr.Markdown(article)

    demo.queue(max_size=10).launch(**args)
if __name__ == "__main__":
    # Command-line entry point: translate CLI flags into launch() keyword
    # arguments and start the demo.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--listen',
        type=str,
        # Bind to all interfaces only when SPACE_ID is set in the environment.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument('--username', type=str, default='', help='Username for authentication')
    parser.add_argument('--password', type=str, default='', help='Password for authentication')
    parser.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument('--inbrowser', action='store_true', help='Open in browser')
    parser.add_argument('--share', action='store_true', help='Share the gradio UI')
    args = parser.parse_args()

    launch_kwargs = {'server_name': args.listen}
    # Authentication is enabled only when both credentials are supplied.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    # Forward the remaining options only when set, so launch() keeps its own
    # defaults otherwise.
    for option in ('server_port', 'inbrowser', 'share'):
        option_value = getattr(args, option)
        if option_value:
            launch_kwargs[option] = option_value

    main(launch_kwargs)