ltx-video-distilled

Running on Zero

File size: 6,934 Bytes

7fe98ab
 
 
e1d5bb5
 
 
 
7fe98ab
d61a0bc
 
7fe98ab
ddd3c88
7fe98ab
 
 
 
 
d191aca
 
 
7fe98ab
 
 
 
 
 
 
 
 
64041b2
 
7fe98ab
ddd3c88
 
d191aca
e1d5bb5
ddd3c88
64041b2
 
 
e1d5bb5
 
ddd3c88
 
e1d5bb5
 
ddd3c88
64041b2
 
 
 
 
 
d191aca
64041b2
 
01900db
ddd3c88
 
 
e1d5bb5
 
ddd3c88
 
 
 
bd83817
e1d5bb5
ddd3c88
01900db
 
 
 
e1d5bb5
 
01900db
 
 
 
bd83817
e1d5bb5
01900db
 
e1d5bb5
01900db
 
 
 
 
 
 
 
 
 
 
 
 
 
ddd3c88
 
 
e1d5bb5
 
 
 
 
 
 
 
ddd3c88
e1d5bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddd3c88
 
e1d5bb5
bd83817
afc7727
ddd3c88
7fe98ab
 
2eea82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f55bc7
7fe98ab
 
 
2eea82e
 
 
64041b2
 
 
 
 
 
 
 
2eea82e
75c7fe9
2eea82e
 
 
 
7fe98ab
 
806b2b0
7fe98ab
 
 
 
 
64041b2
 
ddd3c88
 
1a236aa
 
 
64041b2
 
1a236aa
 
 
d191aca
1a236aa
7fe98ab

import random

import gradio as gr
import spaces
import torch
# from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
# from diffusers import LTXLatentUpsamplePipeline
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_video
import numpy as np


# Limits exposed as module-level constants.
MAX_IMAGE_SIZE = 2048
MAX_SEED = np.iinfo(np.int32).max

# Base generation pipeline, loaded in bfloat16.
pipe = LTXConditionPipeline.from_pretrained(
    "linoyts/LTX-Video-0.9.7-distilled-diffusers",
    torch_dtype=torch.bfloat16,
)

# Latent upsampler; it is handed the base pipeline's VAE instead of loading its own.
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers",
    vae=pipe.vae,
    torch_dtype=torch.bfloat16,
)

# Move both pipelines to the GPU, then turn on tiling for the shared VAE.
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()


def round_to_nearest_resolution_acceptable_by_vae(height, width):
    """Round a frame size down to multiples of the VAE's spatial compression ratio.

    Args:
        height: Requested frame height in pixels.
        width: Requested frame width in pixels.

    Returns:
        A ``(height, width)`` tuple where each value is the largest multiple
        of ``pipe.vae_spatial_compression_ratio`` not exceeding the input.
    """
    # Height and width are spatial dimensions, so the spatial compression
    # ratio applies here; the temporal ratio governs the frame count instead.
    ratio = pipe.vae_spatial_compression_ratio
    return height - (height % ratio), width - (width % ratio)
    
@spaces.GPU
def generate(prompt,
             negative_prompt,
             image,
             video,
             steps,
             num_frames,
             seed,
             randomize_seed,
             t2v, improve_texture=False, progress=gr.Progress(track_tqdm=True)):
    """Generate a video from a prompt, optionally conditioned on an image or video.

    The video is first generated as latents at a reduced resolution, then
    passed through the latent upsampler, optionally re-denoised at the
    upscaled resolution, resized to 1152x768 and written to ``output.mp4``.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative text prompt forwarded to the pipeline.
        image: Conditioning image, or ``None``. Used only when no video is given.
        video: Conditioning video accepted by ``load_video``, or ``None``/empty.
            Only its first 21 frames are used.
        steps: Number of inference steps for the low-resolution pass.
        num_frames: Number of frames to generate.
        seed: Seed for the random generators.
        randomize_seed: When true, ``seed`` is replaced by a random value in
            ``[0, MAX_SEED]``.
        t2v: When true, ``image`` and ``video`` are ignored and generation is
            text-only.
        improve_texture: When true, the upscaled latents are denoised again
            for a few steps before decoding.
        progress: Not referenced in the body; kept in the signature so Gradio
            can track tqdm progress (``track_tqdm=True``).

    Returns:
        The path of the exported video file, ``"output.mp4"``.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Part 1. Generate video at smaller resolution
    # Text-only conditioning is also supported without the need to pass `conditions`
    expected_height, expected_width = 768, 1152
    downscale_factor = 2 / 3
    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(
        int(expected_height * downscale_factor),
        int(expected_width * downscale_factor),
    )

    # Build the conditioning only when it will be used, so the input video is
    # not decoded for text-only runs and later stages never reference an
    # unbound condition object.
    conditions = None
    if not t2v:
        if video:
            frames_to_use = 21
            conditions = LTXVideoCondition(
                video=load_video(video)[:frames_to_use], frame_index=0
            )
        elif image is not None:
            conditions = LTXVideoCondition(video=[image], frame_index=0)

    # Arguments shared by both denoising passes. `conditions` is only passed
    # when present, so text-only runs call the pipeline without it.
    shared_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "num_frames": num_frames,
    }
    if conditions is not None:
        shared_kwargs["conditions"] = conditions

    latents = pipe(
        **shared_kwargs,
        width=downscaled_width,
        height=downscaled_height,
        num_inference_steps=steps,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        generator=torch.Generator(device="cuda").manual_seed(seed),
        output_type="latent",
    ).frames

    # Part 2. Upscale generated video using latent upsampler with fewer inference steps
    # The available latent upsampler upscales the height/width by 2x
    if improve_texture:
        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
        upscaled_latents = pipe_upsample(
            latents=latents,
            output_type="latent",
        ).frames

        # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
        frames = pipe(
            **shared_kwargs,
            width=upscaled_width,
            height=upscaled_height,
            denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
            num_inference_steps=10,
            latents=upscaled_latents,
            decode_timestep=0.05,
            image_cond_noise_scale=0.025,
            generator=torch.Generator().manual_seed(seed),
            output_type="pil",
        ).frames[0]
    else:
        # No extra denoising: let the upsampler return the frames directly.
        frames = pipe_upsample(latents=latents).frames[0]

    # Part 4. Downscale the video to the expected resolution
    frames = [frame.resize((expected_width, expected_height)) for frame in frames]
    export_to_video(frames, "output.mp4", fps=24)
    return "output.mp4"



# Custom CSS handed to `gr.Blocks(css=...)`: centers the element with id
# `col-container` and caps its width at 900px.
# NOTE(review): no component in the visible layout sets
# `elem_id="col-container"`, so this rule may currently match nothing — confirm.
css="""
#col-container {
    margin: 0 auto;
    max-width: 900px;
}
"""

# JavaScript snippet defining `refresh()`, which reloads the page with
# `__theme=dark` added to the URL query string when it is not already set.
# NOTE(review): `js_func` is not referenced anywhere else in the visible
# source (it is not passed to `gr.Blocks`) — confirm whether it is still needed.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Gradio UI: input controls in the left column, generated video in the right
# column, advanced settings in a collapsed accordion underneath.
with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# LTX Video 0.9.7 Distilled")

    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Tab("text-to-video"):
                    # Hidden placeholder component; the name `image` is
                    # rebound to the visible component in the next tab.
                    image = gr.Image(label="", visible=False)
                with gr.Tab("image-to-video"):
                    image = gr.Image(label="")
                with gr.Tab("video-to-video"):
                    video = gr.Video(label="")
                prompt = gr.Textbox(label="prompt")
                t2v = gr.Checkbox(label="t2v", value=False)
            run_button = gr.Button()
        with gr.Column():
            output = gr.Video(interactive=False)

    with gr.Accordion("Advanced settings", open=False):
        negative_prompt = gr.Textbox(
            label="negative prompt",
            value="worst quality, inconsistent motion, blurry, jittery, distorted",
            visible=False,
        )
        with gr.Row():
            seed = gr.Number(label="seed", value=0, precision=0)
            randomize_seed = gr.Checkbox(label="randomize seed")
        with gr.Row():
            steps = gr.Slider(label="Steps", minimum=1, maximum=30, value=8, step=1)
            num_frames = gr.Slider(label="# frames", minimum=1, maximum=30, value=8, step=1)

    # Input order must match the positional parameters of `generate`.
    run_button.click(
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            image,
            video,
            steps,
            num_frames,
            seed,
            randomize_seed,
            t2v,
        ],
        outputs=[output],
    )


demo.launch()