import gradio as gr
import spaces
import torch
# from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
# from diffusers import LTXLatentUpsamplePipeline
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_video
import numpy as np
import random
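# Load the distilled LTX-Video base pipeline and the latent spatial upsampler;
# the upsampler reuses the base pipeline's VAE (vae=pipe.vae).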
pipe = LTXConditionPipeline.from_pretrained("linoyts/LTX-Video-0.9.7-distilled-diffusers", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
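# Tiled VAE decoding keeps peak GPU memory lower when decoding long, high-resolution videos.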
pipe.vae.enable_tiling()
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 2048
def round_to_nearest_resolution_acceptable_by_vae(height, width):
    # Height and width must be multiples of the VAE's spatial compression ratio.
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width
@spaces.GPU
def generate(prompt,
negative_prompt,
image,
video,
steps,
num_frames,
seed,
randomize_seed,
t2v, improve_texture=False, progress=gr.Progress(track_tqdm=True)):
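    """Three-stage generation: render at a downscaled resolution, upscale the
    latents 2x with the latent upsampler, optionally run a short denoising pass
    to refine texture, then resize frames to the target resolution and export
    to an MP4 at 24 fps."""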
if randomize_seed:
seed = random.randint(0, MAX_SEED)
# Part 1. Generate video at smaller resolution
# Text-only conditioning is also supported without the need to pass `conditions`
expected_height, expected_width = 768, 1152
downscale_factor = 2 / 3
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
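    # For the defaults: 768x1152 * 2/3 -> 512x768, already a multiple of the VAE's spatial stride, so rounding is a no-op here.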
    condition = image is not None or video is not None
    condition1 = None  # passed as `conditions`; stays None for pure text-to-video
    if video:
        frames_to_use = 21
        video = load_video(video)[:frames_to_use]
    else:
        video = [image]

    if condition and (not t2v):
        condition1 = LTXVideoCondition(video=video, frame_index=0)
latents = pipe(
conditions=condition1,
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
num_inference_steps=steps,
            decode_timestep=0.05,
            decode_noise_scale=0.025,
generator=torch.Generator(device="cuda").manual_seed(seed),
output_type="latent",
).frames
else:
latents = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
num_inference_steps=steps,
            decode_timestep=0.05,
            decode_noise_scale=0.025,
generator=torch.Generator(device="cuda").manual_seed(seed),
output_type="latent",
).frames
# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
if improve_texture:
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
latents=latents,
output_type="latent"
).frames
# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
conditions=condition1,
prompt=prompt,
negative_prompt=negative_prompt,
width=upscaled_width,
height=upscaled_height,
num_frames=num_frames,
denoise_strength=0.4, # Effectively, 4 inference steps out of 10
num_inference_steps=10,
latents=upscaled_latents,
decode_timestep=0.05,
image_cond_noise_scale=0.025,
generator=torch.Generator().manual_seed(seed),
output_type="pil",
).frames[0]
else:
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
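        # No texture-refinement pass: decode the 2x-upscaled latents straight to frames.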
video = pipe_upsample(
latents=latents,
# output_type="latent"
).frames[0]
# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
return "output.mp4"
css="""
#col-container {
margin: 0 auto;
max-width: 900px;
}
"""
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:
gr.Markdown("# LTX Video 0.9.7 Distilled")
with gr.Row():
with gr.Column():
with gr.Group():
with gr.Tab("text-to-video"):
image = gr.Image(label="", visible=False)
#prompt = gr.Textbox(label="prompt")
with gr.Tab("image-to-video"):
image = gr.Image(label="")
#prompt = gr.Textbox(label="prompt")
with gr.Tab("video-to-video"):
video = gr.Video(label="")
prompt = gr.Textbox(label="prompt")
t2v = gr.Checkbox(label="t2v", value=False)
run_button = gr.Button()
with gr.Column():
output = gr.Video(interactive=False)
with gr.Accordion("Advanced settings", open=False):
negative_prompt = gr.Textbox(label="negative prompt", value="worst quality, inconsistent motion, blurry, jittery, distorted", visible=False)
with gr.Row():
seed = gr.Number(label="seed", value=0, precision=0)
randomize_seed = gr.Checkbox(label="randomize seed")
with gr.Row():
steps = gr.Slider(label="Steps", minimum=1, maximum=30, value=8, step=1)
num_frames = gr.Slider(label="# frames", minimum=1, maximum=30, value=8, step=1)
run_button.click(fn=generate,
inputs=[prompt,
negative_prompt,
image,
video,
steps,
num_frames,
seed,
randomize_seed, t2v],
outputs=[output])
demo.launch()