# Keep torch.compile failures non-fatal (fall back to eager execution), in case
# the commented-out compile calls below are ever re-enabled.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

import torch
import gradio as gr
import os
import base64  # used by the commented-out data-URI path in generate_video()
from glob import glob

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video
from PIL import Image

from huggingface_hub import hf_hub_download
import spaces

model_directory = './checkpoints'

try:
    hf_hub_download(
        repo_id="vdo/stable-video-diffusion-img2vid-xt-1-1",
        filename="svd_xt_1_1.safetensors",
        local_dir=model_directory,
        cache_dir=model_directory,
    )
except Exception as error:
    print(error)
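# A download failure here is non-fatal: the pipeline-loading logic below falls
# back to from_pretrained(), which performs its own download from the Hub.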

if not os.path.exists(model_directory):
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        # "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
        "vdo/stable-video-diffusion-img2vid-xt-1-1",
        torch_dtype=torch.float16,
        variant="fp16"
    )
    pipe.save_pretrained(model_directory, variant="fp16")
else:
    try:
        pipe = StableVideoDiffusionPipeline.from_pretrained(
            model_directory,
            torch_dtype=torch.float16,
            variant="fp16"
        )
    except Exception:
        # Local checkpoint is missing or corrupt: fall back to the Hub copy.
        pipe = StableVideoDiffusionPipeline.from_pretrained(
            # "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
            "vdo/stable-video-diffusion-img2vid-xt-1-1",
            torch_dtype=torch.float16,
            variant="fp16"
        )
        pipe.save_pretrained(model_directory, variant="fp16")

# The pipeline is moved to the GPU inside generate_video(): under ZeroGPU
# (the @spaces.GPU decorator below), CUDA is only usable inside the decorated call.
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
# pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

max_64_bit_int = 2**63 - 1  # largest signed 64-bit integer; upper bound for the seed slider

@spaces.GPU(enable_queue=True, duration=240)
def generate_video(
    image: Image.Image,
    seed: int,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",  # currently unused
    cond_aug: float = 0.02,  # currently unused: the pipe() call below hardcodes noise_aug_strength=0.1
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats the most VRAM. Reduce if necessary.
    device: str = "cuda",  # overridden below based on torch.cuda.is_available()
    output_folder: str = "outputs",
):
    global pipe
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe.to(device)
    # note julian: normally we should resize input images, but they usually arrive
    # at 1024x576 already. resize_image() also leaves room to experiment with
    # vertical videos and 1024x512 outputs later.
    image = resize_image(image)
    
    if image.mode == "RGBA":
        image = image.convert("RGB")
        
    generator = torch.manual_seed(seed)
    
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=25,
    ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)

    # Alternative: return the video as a base64 data URI instead of a file path.
    # with open(video_path, "rb") as video_file:
    #     video_base64 = base64.b64encode(video_file.read()).decode('utf-8')
    # video_data_uri = 'data:video/mp4;base64,' + video_base64

    # Clean-up to avoid "ghosting" (e.g. someone seeing the previously generated
    # video if one of the steps goes wrong):
    # os.remove(video_path)

    # return video_data_uri
    return video_path
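
# Hedged usage sketch (assumptions: a local "input.png" exists and a GPU is
# available): generate_video() can be called directly for a quick smoke test,
# bypassing the Gradio UI. Left commented out so the Space behaves as before.
#
#   clip_path = generate_video(Image.open("input.png"), seed=42)
#   print(f"wrote {clip_path}")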

def resize_image(image, output_size=(1024, 576)):
    # Calculate aspect ratios
    target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
    image_aspect = image.width / image.height  # Aspect ratio of the original image

    # Resize to cover the target size, then center-crop the overflow
    if image_aspect > target_aspect:
        # Wider than target: match the target height, crop the excess width
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) // 2
        top = 0
        right = left + output_size[0]
        bottom = output_size[1]
    else:
        # Taller than target: match the target width, crop the excess height
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = 0
        top = (new_height - output_size[1]) // 2
        right = output_size[0]
        bottom = top + output_size[1]

    # Crop the image
    cropped_image = resized_image.crop((left, top, right, bottom))
    return cropped_image
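
# A minimal sketch of the vertical-video experiment mentioned in generate_video(),
# assuming the pipeline accepts portrait resolutions (hypothetical, not wired
# into the UI):
#
#   portrait = resize_image(image, output_size=(576, 1024))  # 9:16 center crop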


css = """
img, video {
    max-height: 400px;
    object-fit: contain;
}
video {
    margin: 0 auto
}
"""

with gr.Blocks(css=css) as SVD_XT_1_1:
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
            # base64_out = gr.Textbox(label="Base64 Video")
            seed = gr.Slider(label="Seed", value=42, randomize=False, minimum=0, maximum=max_64_bit_int, step=1)
            motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255)
            fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be 25/fps", value=6, minimum=5, maximum=30)

        with gr.Column():
            video_out = gr.Video(
                autoplay=True,
                # height=512,
                # width=512,
                # elem_id="video_output"
            )
       
    generate_btn.click(
        fn=generate_video,
        inputs=[image, seed, motion_bucket_id, fps_id],
        outputs=video_out,
        api_name="run"
    )

SVD_XT_1_1.launch()