ai-tube-model-dynamicrafter

Paused

File size: 5,840 Bytes

4afdd4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f3c66b
 
4afdd4d
28fb111
4afdd4d
28fb111
 
4afdd4d
28fb111
4afdd4d
28fb111
4afdd4d
 
3f3c66b
 
71d7ef7
 
3f3c66b
 
28fb111
4afdd4d
28fb111
 
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
28fb111
 
4afdd4d
 
 
 
 
 
 
 
 
 
28fb111
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34a5803
 
 
 
 
 
 
 
 
 
 
 
 
 
4afdd4d
 
631a46f
 
 
 
 
774e5ad
631a46f
 
4afdd4d
a13ef7a
 
 
 
 
 
 
 
 
 
 
 
 
 
34a5803
4afdd4d
a13ef7a
4afdd4d
a13ef7a

import argparse
import base64
import os
import random
import sys
import time

import gradio as gr
import torch
import torchvision
import torchvision.transforms as transforms
from einops import repeat
from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf
from pytorch_lightning import seed_everything

from utils.utils import instantiate_from_config

# `funcs` is imported from scripts/evaluation, so that directory has to be on
# sys.path before the import below runs.
sys.path.insert(0, "scripts/evaluation")
from funcs import (
    batch_ddim_sampling,
    load_model_checkpoint,
    get_latent_z,
    save_videos
)

# Shared secret that callers must present to `infer`; read from the environment,
# falling back to a placeholder value when the variable is not set.
SECRET_TOKEN = os.environ.get('SECRET_TOKEN', 'default_secret')

def download_model():
    """Make sure the DynamiCrafter 1024 checkpoint files exist on local disk.

    Creates ``./checkpoints/dynamicrafter_1024_v1/`` when it is missing and
    downloads, from the ``Doubiiu/DynamiCrafter_1024`` Hub repository, every
    listed file that is not already present in that directory. Files that are
    already there are left untouched, so repeated calls are cheap.
    """
    repo_id = 'Doubiiu/DynamiCrafter_1024'
    checkpoint_dir = './checkpoints/dynamicrafter_1024_v1/'
    filenames = ['model.ckpt']

    # exist_ok=True avoids the race between a separate exists() check and the
    # directory creation.
    os.makedirs(checkpoint_dir, exist_ok=True)
    for filename in filenames:
        if os.path.exists(os.path.join(checkpoint_dir, filename)):
            continue
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=checkpoint_dir, force_download=True)
    

def infer(secret_token, image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    """Generate a video from a still image and a text prompt.

    The model is instantiated from ``configs/inference_1024_v1.0.yaml``, loaded
    from the local checkpoint (downloaded on demand) and moved to the GPU on
    every call. The rendered MP4 is returned inline and removed from disk.

    Args:
        secret_token: Must equal the module-level ``SECRET_TOKEN``.
        image: Input picture as a NumPy array; the code permutes axes
            ``(2, 0, 1)`` and divides by 255, so it is presumably an
            H x W x C array with values in 0..255 (TODO confirm against caller).
        prompt: Text prompt used for the text conditioning.
        steps: Number of DDIM sampling steps; values above 60 are clamped to 60.
        cfg_scale: Forwarded to ``batch_ddim_sampling`` as ``cfg_scale``.
        eta: Forwarded to ``batch_ddim_sampling`` as ``ddim_eta``.
        fs: Wrapped in a long tensor and passed to the model as the ``"fs"``
            condition (the UI labels the matching slider "FPS").
        seed: Passed to ``seed_everything`` before sampling.

    Returns:
        A ``data:video/mp4;base64,...`` URI string containing the video.

    Raises:
        gr.Error: If the token does not match or no image was supplied.
        AssertionError: If the checkpoint file is missing after the download step.
    """
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')
    # Fail with a readable message instead of a TypeError from torch.from_numpy.
    if image is None:
        raise gr.Error('An input image is required.')

    resolution = (576, 1024)
    download_model()
    ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
    config_file = 'configs/inference_1024_v1.0.yaml'
    config = OmegaConf.load(config_file)
    model_config = config.pop("model", OmegaConf.create())
    model_config['params']['unet_config']['params']['use_checkpoint'] = False
    model = instantiate_from_config(model_config)
    assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
    model = load_model_checkpoint(model, ckpt_path)
    model.eval()
    model = model.cuda()
    save_fps = 8

    seed_everything(seed)
    transform = transforms.Compose([
        transforms.Resize(min(resolution)),
        transforms.CenterCrop(resolution),
        ])
    torch.cuda.empty_cache()
    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    steps = min(steps, 60)

    batch_size = 1
    channels = model.model.diffusion_model.out_channels
    frames = model.temporal_length
    # The noise tensor is sampled at 1/8 of the pixel resolution.
    h, w = resolution[0] // 8, resolution[1] // 8
    noise_shape = [batch_size, channels, frames, h, w]

    # Pure inference: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        # text cond
        text_emb = model.get_learned_conditioning([prompt])

        # img cond: channels-first float tensor rescaled from [0, 255] to [-1, 1]
        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
        img_tensor = (img_tensor / 255. - 0.5) * 2

        image_tensor_resized = transform(img_tensor)  # 3,576,1024 after resize + center crop
        videos = image_tensor_resized.unsqueeze(0)  # bchw

        z = get_latent_z(model, videos.unsqueeze(2))  # bc,1,hw

        # Repeat the single-frame latent along the time axis, once per output frame.
        img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)

        # NOTE: the embedder receives the un-resized image tensor, as in the original code.
        cond_images = model.embedder(img_tensor.unsqueeze(0))  ## blc
        img_emb = model.image_proj_model(cond_images)

        imtext_cond = torch.cat([text_emb, img_emb], dim=1)

        fs = torch.tensor([fs], dtype=torch.long, device=model.device)
        cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}

        ## inference
        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
        ## b,samples,c,t,h,w

    video_path = './output.mp4'
    try:
        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
        model = model.cpu()

        # Read the content of the video file and encode it to base64
        with open(video_path, "rb") as video_file:
            video_base64 = base64.b64encode(video_file.read()).decode('utf-8')
    finally:
        # Clean-up must also run when a step above fails, otherwise a later
        # request could pick up a stale video ("ghosting").
        try:
            os.remove(video_path)
        except FileNotFoundError:
            pass

    # Prepend the appropriate data URI header with MIME type
    return 'data:video/mp4;base64,' + video_base64



# Gradio front-end. The page is intended to be used as an API: an opaque
# full-screen overlay hides the widgets, which exist to define the endpoint's
# inputs and output.
with gr.Blocks() as app:
    # Fixed-position, full-viewport banner pointing visitors to the original space.
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a REST API to programmatically generate MP4 videos.</p>
        <p style="color: black;">Interested in using it? Look no further than the <a href="https://huggingface.co/spaces/Doubiiu/DynamiCrafter" target="_blank">original space</a>!</p>
        </div>
        </div>""")

    # Input components.
    secret_token = gr.Text(
        label='Secret Token',
        max_lines=1,
    )
    i2v_input_image = gr.Image(
        label="Input Image",
        elem_id="input_img",
    )
    i2v_input_text = gr.Text(label='Prompts')
    i2v_seed = gr.Slider(
        label='Random Seed',
        minimum=0,
        maximum=10000,
        step=1,
        value=123,
    )
    i2v_eta = gr.Slider(
        label='ETA',
        minimum=0.0,
        maximum=1.0,
        step=0.1,
        value=1.0,
        elem_id="i2v_eta",
    )
    i2v_cfg_scale = gr.Slider(
        label='CFG Scale',
        minimum=1.0,
        maximum=15.0,
        step=0.5,
        value=7.5,
        elem_id="i2v_cfg_scale",
    )
    i2v_steps = gr.Slider(
        label="Sampling steps",
        minimum=1,
        maximum=60,
        step=1,
        value=50,
        elem_id="i2v_steps",
    )
    # This slider is wired to the `fs` argument of `infer` (seventh input below).
    i2v_motion = gr.Slider(
        label="FPS",
        minimum=5,
        maximum=20,
        step=1,
        value=10,
        elem_id="i2v_motion",
    )
    i2v_end_btn = gr.Button("Generate")

    # Output component: receives the base64 data URI returned by `infer`.
    i2v_output_video_base64 = gr.Text()

    # The order of `inputs` must match the positional parameters of `infer`.
    i2v_end_btn.click(
        fn=infer,
        inputs=[
            secret_token,
            i2v_input_image,
            i2v_input_text,
            i2v_steps,
            i2v_cfg_scale,
            i2v_eta,
            i2v_motion,
            i2v_seed,
        ],
        outputs=[i2v_output_video_base64],
    )

# Allow at most four queued requests, then start the server with the API page enabled.
app.queue(max_size=4).launch(show_api=True)