File size: 5,840 Bytes
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f3c66b
 
4afdd4d
28fb111
4afdd4d
28fb111
 
4afdd4d
28fb111
4afdd4d
28fb111
4afdd4d
 
3f3c66b
 
71d7ef7
 
3f3c66b
 
28fb111
4afdd4d
28fb111
 
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
28fb111
 
4afdd4d
 
 
 
 
 
 
 
 
 
28fb111
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34a5803
 
 
 
 
 
 
 
 
 
 
 
 
 
4afdd4d
 
631a46f
 
 
 
 
774e5ad
631a46f
 
4afdd4d
a13ef7a
 
 
 
 
 
 
 
 
 
 
 
 
 
34a5803
4afdd4d
a13ef7a
4afdd4d
a13ef7a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
import base64
import os
import random
import sys
import time

import gradio as gr
from omegaconf import OmegaConf
import torch
import torchvision
from pytorch_lightning import seed_everything
from huggingface_hub import hf_hub_download
from einops import repeat
import torchvision.transforms as transforms

from utils.utils import instantiate_from_config
sys.path.insert(0, "scripts/evaluation")
from funcs import (
    batch_ddim_sampling,
    load_model_checkpoint,
    get_latent_z,
    save_videos
)

# Shared secret that callers must send with every request; read once at import
# time from the SECRET_TOKEN environment variable, falling back to
# 'default_secret' when the variable is unset.
SECRET_TOKEN = os.environ.get('SECRET_TOKEN', 'default_secret')

def download_model():
    """Make sure the DynamiCrafter 1024 checkpoint files exist locally.

    Creates './checkpoints/dynamicrafter_1024_v1/' when it is missing and
    downloads every file of ``filename_list`` that is not yet on disk from the
    'Doubiiu/DynamiCrafter_1024' Hugging Face repository. Files that already
    exist are left untouched, so repeated calls do not download again.
    """
    REPO_ID = 'Doubiiu/DynamiCrafter_1024'
    ckpt_dir = './checkpoints/dynamicrafter_1024_v1/'
    filename_list = ['model.ckpt']
    # exist_ok=True replaces the check-then-create pair, which could fail if
    # the directory appeared between the two calls.
    os.makedirs(ckpt_dir, exist_ok=True)
    for filename in filename_list:
        local_file = os.path.join(ckpt_dir, filename)
        if not os.path.exists(local_file):
            hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir=ckpt_dir, force_download=True)
    

def _video_file_to_data_uri(video_path):
    """Return the MP4 at *video_path* as a base64 data URI and delete the file.

    The file is removed even when reading or encoding fails; otherwise there is
    a risk of "ghosting", i.e. a later request being served the video that a
    previous request generated.
    """
    try:
        with open(video_path, "rb") as video_file:
            video_base64 = base64.b64encode(video_file.read()).decode('utf-8')
    finally:
        if os.path.exists(video_path):
            os.remove(video_path)

    # Prepend the data URI header with the MP4 MIME type.
    return 'data:video/mp4;base64,' + video_base64


def infer(secret_token, image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    """Generate a video from one image plus a text prompt and return it inline.

    NOTE: the model is built from its config and reloaded from the checkpoint
    on every call, then moved back to the CPU once the request is finished.

    Args:
        secret_token: must equal the module-level ``SECRET_TOKEN``.
        image: input picture as a NumPy array; it is permuted from (H, W, C) to
            (C, H, W) and rescaled as if its values were in [0, 255]
            (presumably the uint8 RGB array Gradio's Image component delivers
            - confirm against the caller).
        prompt: text prompt used for the text conditioning.
        steps: number of DDIM sampling steps; values above 60 are clamped to 60.
        cfg_scale: guidance scale forwarded to ``batch_ddim_sampling``.
        eta: DDIM eta forwarded to ``batch_ddim_sampling``.
        fs: value passed to the model as the ``"fs"`` condition.
        seed: seed handed to ``seed_everything`` before sampling.

    Returns:
        A ``'data:video/mp4;base64,...'`` string holding the generated MP4.

    Raises:
        gr.Error: if the token is wrong, no image was supplied, or the
            checkpoint file is missing.
    """
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')
    if image is None:
        # Without this guard torch.from_numpy(None) fails with an opaque TypeError.
        raise gr.Error('No input image was provided.')

    resolution = (576, 1024)
    save_fps = 8

    download_model()
    ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
    config_file = 'configs/inference_1024_v1.0.yaml'
    config = OmegaConf.load(config_file)
    model_config = config.pop("model", OmegaConf.create())
    model_config['params']['unet_config']['params']['use_checkpoint'] = False
    model = instantiate_from_config(model_config)
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not os.path.exists(ckpt_path):
        raise gr.Error("Error: checkpoint Not Found!")
    model = load_model_checkpoint(model, ckpt_path)
    model.eval()
    model = model.cuda()

    try:
        # Seed after the model is built so sampling starts from the same RNG state.
        seed_everything(seed)
        transform = transforms.Compose([
            transforms.Resize(min(resolution)),
            transforms.CenterCrop(resolution),
        ])
        torch.cuda.empty_cache()
        print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S'))
        steps = min(steps, 60)

        batch_size = 1
        channels = model.model.diffusion_model.out_channels
        frames = model.temporal_length
        # The latent grid is the pixel resolution divided by 8 in each dimension.
        h, w = resolution[0] // 8, resolution[1] // 8
        noise_shape = [batch_size, channels, frames, h, w]

        # Pure inference: skip autograd bookkeeping to save GPU memory.
        with torch.no_grad():
            # text cond
            text_emb = model.get_learned_conditioning([prompt])

            # img cond: HWC array -> CHW float tensor scaled to [-1, 1]
            img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
            img_tensor = (img_tensor / 255. - 0.5) * 2

            image_tensor_resized = transform(img_tensor)  # 3,576,1024
            videos = image_tensor_resized.unsqueeze(0)  # b,c,h,w

            z = get_latent_z(model, videos.unsqueeze(2))  # b,c,1,h,w

            # Repeat the single-frame latent along the time axis for every frame.
            img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)

            # The image embedder receives the un-resized image tensor.
            cond_images = model.embedder(img_tensor.unsqueeze(0))  # b,l,c
            img_emb = model.image_proj_model(cond_images)

            imtext_cond = torch.cat([text_emb, img_emb], dim=1)

            fs_tensor = torch.tensor([fs], dtype=torch.long, device=model.device)
            cond = {"c_crossattn": [imtext_cond], "fs": fs_tensor, "c_concat": [img_tensor_repeat]}

            ## inference
            batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
            ## b,samples,c,t,h,w

        video_path = './output.mp4'
        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
    finally:
        # Move the weights off the GPU even when generation fails.
        model = model.cpu()

    return _video_file_to_data_uri(video_path)



# Gradio front-end: the page itself is hidden behind a full-screen notice, the
# components only exist so that the click handler is exposed through the API.
with gr.Blocks() as app:
    # Fixed-position white overlay that covers the whole page.
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a REST API to programmatically generate MP4 videos.</p>
        <p style="color: black;">Interested in using it? Look no further than the <a href="https://huggingface.co/spaces/Doubiiu/DynamiCrafter" target="_blank">original space</a>!</p>
        </div>
        </div>""")

    # Input components.
    secret_token = gr.Text(label='Secret Token', max_lines=1)
    i2v_input_image = gr.Image(label="Input Image", elem_id="input_img")
    i2v_input_text = gr.Text(label='Prompts')
    i2v_seed = gr.Slider(
        label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
    i2v_eta = gr.Slider(
        label='ETA', elem_id="i2v_eta", minimum=0.0, maximum=1.0, step=0.1, value=1.0)
    i2v_cfg_scale = gr.Slider(
        label='CFG Scale', elem_id="i2v_cfg_scale", minimum=1.0, maximum=15.0, step=0.5, value=7.5)
    i2v_steps = gr.Slider(
        label="Sampling steps", elem_id="i2v_steps", minimum=1, maximum=60, step=1, value=50)
    i2v_motion = gr.Slider(
        label="FPS", elem_id="i2v_motion", minimum=5, maximum=20, step=1, value=10)
    i2v_end_btn = gr.Button("Generate")

    # The generated video comes back as a base64 data URI in a text field.
    i2v_output_video_base64 = gr.Text()

    # Listed in the positional order of infer()'s parameters:
    # secret_token, image, prompt, steps, cfg_scale, eta, fs, seed.
    infer_inputs = [
        secret_token,
        i2v_input_image,
        i2v_input_text,
        i2v_steps,
        i2v_cfg_scale,
        i2v_eta,
        i2v_motion,
        i2v_seed,
    ]
    i2v_end_btn.click(
        fn=infer,
        inputs=infer_inputs,
        outputs=[i2v_output_video_base64],
    )

# Keep at most 4 requests waiting in the queue, then start the server.
app.queue(max_size=4).launch(show_api=True)