AnimateLCM-SVD

Paused

File size: 7,625 Bytes

import spaces

import gradio as gr
# import gradio.helpers
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional

from PIL import Image
from diffusers.utils import load_image, export_to_video
from pipeline import StableVideoDiffusionPipeline

import random
from safetensors import safe_open
from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler


def get_safetensors_files():
    models_dir = "./safetensors"
    safetensors_files = [
        f for f in os.listdir(models_dir) if f.endswith(".safetensors")
    ]
    return safetensors_files


def model_select(selected_file):
    print("load model weights", selected_file)
    pipe.unet.cpu()
    file_path = os.path.join("./safetensors", selected_file)
    state_dict = {}
    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            state_dict[key] = f.get_tensor(key)
    missing, unexpected = pipe.unet.load_state_dict(state_dict, strict=True)
    pipe.unet.cuda()
    del state_dict
    return


noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler(
    num_train_timesteps=40,
    sigma_min=0.002,
    sigma_max=700.0,
    sigma_data=1.0,
    s_noise=1.0,
    rho=7,
    clip_denoised=False,
)
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    scheduler=noise_scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
pipe.enable_model_cpu_offload()  # for smaller cost
model_select("AnimateLCM-SVD-xt-1.1.safetensors")
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) # for faster inference


max_64_bit_int = 2**63 - 1

@spaces.GPU
def sample(
    image: Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = False,
    motion_bucket_id: int = 80,
    fps_id: int = 8,
    max_guidance_scale: float = 1.2,
    min_guidance_scale: float = 1,
    width: int = 1024,
    height: int = 576,
    num_inference_steps: int = 4,
    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    output_folder: str = "outputs_gradio",
):
    if image.mode == "RGBA":
        image = image.convert("RGB")

    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    with torch.autocast("cuda"):
        frames = pipe(
            image,
            decode_chunk_size=decoding_t,
            generator=generator,
            motion_bucket_id=motion_bucket_id,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            min_guidance_scale=min_guidance_scale,
            max_guidance_scale=max_guidance_scale,
        ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)
    torch.manual_seed(seed)

    return video_path, seed


def resize_image(image, output_size=(1024, 576)):
    # Calculate aspect ratios
    target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
    image_aspect = image.width / image.height  # Aspect ratio of the original image

    # Resize then crop if the original image is larger
    if image_aspect > target_aspect:
        # Resize the image to match the target height, maintaining aspect ratio
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        # Calculate coordinates for cropping
        left = (new_width - output_size[0]) / 2
        top = 0
        right = (new_width + output_size[0]) / 2
        bottom = output_size[1]
    else:
        # Resize the image to match the target width, maintaining aspect ratio
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        # Calculate coordinates for cropping
        left = 0
        top = (new_height - output_size[1]) / 2
        right = output_size[0]
        bottom = (new_height + output_size[1]) / 2

    # Crop the image
    cropped_image = resized_image.crop((left, top, right, bottom))
    return cropped_image


with gr.Blocks(css="footer {visibility: hidden}") as demo:
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="图片上传", type="pil")
            generate_btn = gr.Button("生成")
        video = gr.Video()
    with gr.Accordion("高级设置", open=False):
        safetensors_dropdown = gr.Dropdown(
            label="选择模型", choices=get_safetensors_files()
        )
        seed = gr.Slider(
            label="种子",
            value=42,
            randomize=False,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="随机种子", value=False)
        motion_bucket_id = gr.Slider(
            label="Motion bucket id",
            info="Controls how much motion to add/remove from the image",
            value=80,
            minimum=1,
            maximum=255,
        )
        fps_id = gr.Slider(
            label="FPS",
            info="The length of your video in seconds will be 25/fps",
            value=8,
            minimum=5,
            maximum=30,
        )
        width = gr.Slider(
            label="输入图片宽度",
            info="It should be divisible by 64",
            value=1024,
            minimum=576,
            maximum=2048,
        )
        height = gr.Slider(
            label="输入图片高度",
            info="It should be divisible by 64",
            value=576,
            minimum=320,
            maximum=1152,
        )
        max_guidance_scale = gr.Slider(
            label="最大运动控制",
            info="classifier-free guidance strength",
            value=1.2,
            minimum=1,
            maximum=2,
        )
        min_guidance_scale = gr.Slider(
            label="最小运动控制",
            info="classifier-free guidance strength",
            value=1,
            minimum=1,
            maximum=1.5,
        )
        num_inference_steps = gr.Slider(
            label="推理步数",
            info="steps for inference",
            value=4,
            minimum=1,
            maximum=20,
            step=1,
        )

    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
    generate_btn.click(
        fn=sample,
        inputs=[
            image,
            seed,
            randomize_seed,
            motion_bucket_id,
            fps_id,
            max_guidance_scale,
            min_guidance_scale,
            width,
            height,
            num_inference_steps,
        ],
        outputs=[video, seed],
        api_name="video",
    )
    safetensors_dropdown.change(fn=model_select, inputs=safetensors_dropdown)

    gr.Examples(
        examples=[
            ["test_imgs/ai-generated-8496135_1280.jpg"],
            ["test_imgs/dog-7396912_1280.jpg"],
            ["test_imgs/ship-7833921_1280.jpg"],
            ["test_imgs/girl-4898696_1280.jpg"],
            ["test_imgs/power-station-6579092_1280.jpg"]
        ],
        inputs=[image],
        outputs=[video, seed],
        fn=sample,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.queue(max_size=30, api_open=False)
    demo.launch(share=True, show_api=False)