import torch
from diffusers import (
    AnimateDiffControlNetPipeline, AutoencoderKL, 
    ControlNetModel, MotionAdapter, LCMScheduler
)
from diffusers.utils import export_to_gif, load_video
from controlnet_aux import MidasDetector  # Faster than ZoeDetector

# Load depth-based ControlNet (in diffusers format)
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16
)

# Load AnimateDiff Motion Adapter (AnimateLCM)
motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", torch_dtype=torch.float16)

# Load VAE for SD 1.5
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)

# Load AnimateDiff pipeline with ControlNet
pipe = AnimateDiffControlNetPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",
    motion_adapter=motion_adapter,
    controlnet=controlnet,
    vae=vae,
).to(device="cuda", dtype=torch.float16)

# Use LCM Scheduler (optimized for AnimateLCM)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# Load AnimateLCM LoRA
pipe.load_lora_weights(
    "wangfuyun/AnimateLCM", 
    weight_name="AnimateLCM_sd15_t2v_lora.safetensors", 
    adapter_name="lcm-lora"
)
pipe.set_adapters(["lcm-lora"], adapter_weights=[0.8])
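
# Optional memory saving (a sketch, not part of the original example):
# enable_model_cpu_offload() is a standard diffusers API and can be used
# instead of the .to("cuda") call above on low-VRAM GPUs, trading speed for memory.
# pipe.enable_model_cpu_offload()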

# Use MiDaS for depth extraction (faster)
depth_detector = MidasDetector.from_pretrained("lllyasviel/Annotators").to("cuda")

# Load input video for depth-based conditioning
video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
conditioning_frames = []

# Process video frames into depth maps
for frame in video:
    conditioning_frames.append(depth_detector(frame))
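
# Optional sanity check (not in the original): save the first depth map to
# inspect the conditioning signal before running the full pipeline.
# conditioning_frames[0].save("depth_frame_0.png")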

# Define prompts
prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
negative_prompt = "blurry, deformed, distorted, bad quality"

# Generate animated output
output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_frames=len(video),
    num_inference_steps=10,
    guidance_scale=2.0,
    conditioning_frames=conditioning_frames,
    generator=torch.manual_seed(42),
).frames[0]

# Save animation as GIF
export_to_gif(output, "animatediff_controlnet.gif", fps=8)
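
# Optionally also write an MP4 (a sketch; export_to_video is a standard
# diffusers utility and assumes imageio or opencv is installed):
# from diffusers.utils import export_to_video
# export_to_video(output, "animatediff_controlnet.mp4", fps=8)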