Spaces:

weifeng-chen
/

Control-A-Video

Runtime error

File size: 8,659 Bytes

from model.video_diffusion.models.controlnet3d import ControlNet3DModel
from model.video_diffusion.models.unet_3d_condition import UNetPseudo3DConditionModel
from model.video_diffusion.pipelines.pipeline_stable_diffusion_controlnet3d import Controlnet3DStableDiffusionPipeline
from transformers import DPTForDepthEstimation
from model.annotator.hed import HEDNetwork
import torch
from einops import rearrange,repeat
import imageio
import numpy as np
import cv2
import torch.nn.functional as F
from PIL import Image
import argparse
import tempfile
import os
import gradio as gr


control_mode = 'depth'
control_net_path = f"wf-genius/controlavideo-{control_mode}"
unet = UNetPseudo3DConditionModel.from_pretrained(control_net_path,
                        torch_dtype = torch.float16,
                        subfolder='unet',
                        ).to("cuda") 
controlnet = ControlNet3DModel.from_pretrained(control_net_path,
                        torch_dtype = torch.float16,
                        subfolder='controlnet',
                        ).to("cuda")

if control_mode == 'depth':
    annotator_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
elif control_mode == 'canny':
    annotator_model = None
elif control_mode == 'hed':
    # firstly download from https://huggingface.co/wf-genius/controlavideo-hed/resolve/main/hed-network.pth 
    annotator_model = HEDNetwork('hed-network.pth').to("cuda")

video_controlnet_pipe = Controlnet3DStableDiffusionPipeline.from_pretrained(control_net_path, unet=unet, 
                        controlnet=controlnet, annotator_model=annotator_model,
                        torch_dtype = torch.float16,
                        ).to("cuda")


def to_video(frames, fps: int) -> str:
    out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
    for frame in frames:
        writer.append_data(np.array(frame))
    writer.close()
    return out_file.name

def inference(input_video, 
                prompt,
                seed,
                num_inference_steps,
                guidance_scale,
                sampling_rate,
                video_scale,
                init_noise_thres,
                each_sample_frame,
                iter_times,
                h,
                w,
                ):
    num_sample_frames = iter_times * each_sample_frame
    testing_prompt = [prompt]
    np_frames, fps_vid = Controlnet3DStableDiffusionPipeline.get_frames_preprocess(input_video, num_frames=num_sample_frames, sampling_rate=sampling_rate, return_np=True)
    if control_mode == 'depth':
        frames = torch.from_numpy(np_frames).div(255) * 2 - 1
        frames = rearrange(frames, "f h w c -> c f h w").unsqueeze(0)
        frames = rearrange(frames, 'b c f h w -> (b f) c h w')
        control_maps = video_controlnet_pipe.get_depth_map(frames, h, w, return_standard_norm=False)  # (b f) 1 h w
    elif control_mode == 'canny':
        control_maps = np.stack([cv2.Canny(inp, 100, 200) for inp in np_frames])
        control_maps = repeat(control_maps, 'f h w -> f c h w',c=1)
        control_maps = torch.from_numpy(control_maps).div(255)  # 0~1
    elif control_mode == 'hed':
        control_maps = np.stack([video_controlnet_pipe.get_hed_map(inp) for inp in np_frames])
        control_maps = repeat(control_maps, 'f h w -> f c h w',c=1)
        control_maps = torch.from_numpy(control_maps).div(255)  # 0~1
    control_maps = control_maps.to(dtype=controlnet.dtype, device=controlnet.device)
    control_maps = F.interpolate(control_maps, size=(h,w), mode='bilinear', align_corners=False)
    control_maps = rearrange(control_maps, "(b f) c h w -> b c f h w", f=num_sample_frames)
    if control_maps.shape[1] == 1:
        control_maps = repeat(control_maps, 'b c f h w -> b (n c) f h w',  n=3)

    frames = torch.from_numpy(np_frames).div(255)
    frames = rearrange(frames, 'f h w c -> f c h w')
    v2v_input_frames =  torch.nn.functional.interpolate(
                frames,
                size=(h, w),
                mode="bicubic",
                antialias=True,
            ) 
    v2v_input_frames = rearrange(v2v_input_frames, '(b f) c h w -> b c f h w ', f=num_sample_frames)

    out = []
    for i in range(num_sample_frames//each_sample_frame):
        out1 = video_controlnet_pipe(
                # controlnet_hint= control_maps[:,:,:each_sample_frame,:,:],
                # images= v2v_input_frames[:,:,:each_sample_frame,:,:],
                controlnet_hint=control_maps[:,:,i*each_sample_frame-1:(i+1)*each_sample_frame-1,:,:] if i>0 else control_maps[:,:,:each_sample_frame,:,:],
                images=v2v_input_frames[:,:,i*each_sample_frame-1:(i+1)*each_sample_frame-1,:,:] if i>0 else v2v_input_frames[:,:,:each_sample_frame,:,:],
                first_frame_output=out[-1] if i>0 else None,
                prompt=testing_prompt,
                num_inference_steps=num_inference_steps,
                width=w,
                height=h,
                guidance_scale=guidance_scale,
                generator=[torch.Generator(device="cuda").manual_seed(seed)],
                video_scale = video_scale, 
                init_noise_by_residual_thres = init_noise_thres,    # residual-based init. larger thres ==> more smooth.
                controlnet_conditioning_scale=1.0,
                fix_first_frame=True, 
                in_domain=True,
        )
        out1 = out1.images[0]    
        if len(out1) > 1:
            out1 = out1[1:] # drop the first frame
        out.extend(out1)

    return to_video(out, 8)


examples = [
        ["bear.mp4",
        "a bear walking through stars, artstation"],
        ["car-shadow.mp4",
        "a car, sunset, cartoon style, artstation."],
        ["libby.mp4",
        "a dog running, chinese ink painting."],
]

def preview_inference(
        input_video, 
        prompt, seed, 
        num_inference_steps, guidance_scale, 
        sampling_rate, video_scale, init_noise_thres,
        each_sample_frame,iter_times, h, w,
    ):
    return inference(input_video, 
        prompt, seed, 
        num_inference_steps, guidance_scale, 
        sampling_rate, 0.0, 0.0, 1, 1, h, w,)

if __name__ == '__main__':
    with gr.Blocks() as demo:
        with gr.Row():
            # with gr.Column(scale=1):
            input_video = gr.Video(
                label="Input Video", source='upload', format="mp4", visible=True)
            with gr.Column():
                init_noise_thres = gr.Slider(0, 1, value=0.1, step=0.1, label="init_noise_thress")
                each_sample_frame = gr.Slider(6, 16, value=8, step=1, label="each_sample_frame")
                iter_times = gr.Slider(1, 4, value=1, step=1, label="iter_times")
                sampling_rate = gr.Slider(1, 8, value=3, step=1, label="sampling_rate")
                h = gr.Slider(256, 768, value=512, step=64, label="height")
                w = gr.Slider(256, 768, value=512, step=64, label="width")
            with gr.Column():
                seed =  gr.Slider(0, 6666, value=1, step=1, label="seed")
                num_inference_steps =  gr.Slider(5, 50, value=20, step=1, label="num_inference_steps")
                guidance_scale = gr.Slider(1, 20, value=7.5, step=0.5, label="guidance_scale")
                video_scale = gr.Slider(0, 2.5, value=1.5, step=0.1, label="video_scale")
                prompt = gr.Textbox(label='Prompt')
                # preview_button = gr.Button('Preview')
                run_button = gr.Button('Generate Video')
            # with gr.Column(scale=1):
            result = gr.Video(label="Generated Video")
        inputs = [
                input_video,
                prompt,
                seed,
                num_inference_steps,
                guidance_scale,
                sampling_rate,
                video_scale,
                init_noise_thres,
                each_sample_frame,
                iter_times,
                h,
                w,
            ]
    
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=inference,
                    cache_examples=False,
                    run_on_click=False,
                    )

        run_button.click(fn=inference,
                            inputs=inputs,
                            outputs=result,)
        # preview_button.click(fn=preview_inference,
        #                     inputs=inputs,
        #                     outputs=result,)
    
    demo.launch(server_name="0.0.0.0", server_port=7860)