multimodalart (HF Staff) committed
Commit 6c12bfc (verified) · 1 Parent(s): 95ef81b

Update app.py

Files changed (1)
  1. app.py +423 -164
app.py CHANGED
@@ -1,137 +1,379 @@
  import gradio as gr
- import spaces
  import torch
- # from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
- # from diffusers import LTXLatentUpsamplePipeline
- from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
- from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
- from diffusers.utils import export_to_video, load_video
  import numpy as np


- pipe = LTXConditionPipeline.from_pretrained("linoyts/LTX-Video-0.9.7-distilled-diffusers", torch_dtype=torch.bfloat16)
- pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers", vae=pipe.vae, torch_dtype=torch.bfloat16)
- pipe.to("cuda")
- pipe_upsample.to("cuda")
- pipe.vae.enable_tiling()

- MAX_SEED = np.iinfo(np.int32).max
- MAX_IMAGE_SIZE = 2048


- def round_to_nearest_resolution_acceptable_by_vae(height, width):
-     print("before rounding",height, width)
-     height = height - (height % pipe.vae_spatial_compression_ratio)
-     width = width - (width % pipe.vae_spatial_compression_ratio)
-     print("after rounding",height, width)
-     return height, width

- def change_mode_to_text():
-     return gr.update(value="text-to-video")

- def change_mode_to_image():
-     return gr.update(value="image-to-video")

- def change_mode_to_video():
-     return gr.update(value="video-to-video")
-
  @spaces.GPU
  def generate(prompt,
               negative_prompt,
-              image,
-              video,
               height,
               width,
               mode,
-              steps,
               num_frames,
               frames_to_use,
               seed,
               randomize_seed,
               guidance_scale,
               improve_texture=False, progress=gr.Progress(track_tqdm=True)):
-
      if randomize_seed:
          seed = random.randint(0, MAX_SEED)
-
-     # Part 1. Generate video at smaller resolution
-     # Text-only conditioning is also supported without the need to pass `conditions`
-     expected_height, expected_width = height, width
-     downscale_factor = 2 / 3
-     downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
-     downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
-
-     print(mode)
-     if mode == "text-to-video" and (video is not None):
-         video = load_video(video)[:frames_to_use]
-         condition = True
-     elif mode == "image-to-video" and (image is not None):
-         print("WTFFFFFF 1")
-         video = [image]
-         condition = True
-     else:
-         condition=False

-     if condition:
-         print("WTFFFFFF 2")
-         condition1 = LTXVideoCondition(video=video, frame_index=0)
-     else:
-         condition1 = None

-     latents = pipe(
-         conditions=condition1,
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         width=downscaled_width,
-         height=downscaled_height,
-         num_frames=num_frames,
-         num_inference_steps=steps,
-         decode_timestep = 0.05,
-         decode_noise_scale = 0.025,
-         guidance_scale=guidance_scale,
-         generator=torch.Generator(device="cuda").manual_seed(seed),
-         output_type="latent",
-     ).frames
-
-
-     # Part 2. Upscale generated video using latent upsampler with fewer inference steps
-     # The available latent upsampler upscales the height/width by 2x
      if improve_texture:
-         upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-         upscaled_latents = pipe_upsample(
-             latents=latents,
-             output_type="latent"
-         ).frames

-         # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
-         video = pipe(
-             conditions=condition1,
-             prompt=prompt,
-             negative_prompt=negative_prompt,
-             width=upscaled_width,
-             height=upscaled_height,
-             num_frames=num_frames,
-             guidance_scale=guidance_scale,
-             denoise_strength=0.6, # Effectively, 0.6 * 3 inference steps
-             num_inference_steps=3,
-             latents=upscaled_latents,
-             decode_timestep=0.05,
-             image_cond_noise_scale=0.025,
-             generator=torch.Generator().manual_seed(seed),
-             output_type="pil",
-         ).frames[0]
      else:
-         upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-         video = pipe_upsample(
-             latents=latents,
-             # output_type="latent"
-         ).frames[0]

-     # Part 4. Downscale the video to the expected resolution
-     video = [frame.resize((expected_width, expected_height)) for frame in video]
-     export_to_video(video, "output.mp4", fps=24)
-     return "output.mp4"



  css="""
@@ -141,72 +383,89 @@ css="""
  }
  """

- js_func = """
- function refresh() {
-     const url = new URL(window.location);

-     if (url.searchParams.get('__theme') !== 'dark') {
-         url.searchParams.set('__theme', 'dark');
-         window.location.href = url.href;
-     }
- }
- """

- with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:

-     gr.Markdown("# LTX Video 0.9.7 Distilled")
-     mode = gr.State(value="text-to-video")
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Tab("text-to-video") as text_tab:
-                     image_n = gr.Image(label="", visible=False)
-                 with gr.Tab("image-to-video") as image_tab:
-                     image = gr.Image(label="input image")
-                 with gr.Tab("video-to-video") as video_tab:
-                     video = gr.Video(label="input video")
-                     frames_to_use = gr.Number(label="num frames to use",info="first # of frames to use from the input video", value=1)
-             prompt = gr.Textbox(label="prompt")
-             improve_texture = gr.Checkbox(label="improve texture", value=False, info="slows down generation")
-             run_button = gr.Button()
-         with gr.Column():
-             output = gr.Video(interactive=False)
-
-
-     with gr.Accordion("Advanced settings", open=False):
-         negative_prompt = gr.Textbox(label="negative prompt", value="worst quality, inconsistent motion, blurry, jittery, distorted", visible=False)
-         with gr.Row():
-             seed = gr.Number(label="seed", value=0, precision=0)
-             randomize_seed = gr.Checkbox(label="randomize seed")
-         with gr.Row():
-             guidance_scale= gr.Slider(label="guidance scale", minimum=0, maximum=10, value=3, step=1)
-             steps = gr.Slider(label="Steps", minimum=1, maximum=30, value=8, step=1)
-             num_frames = gr.Slider(label="# frames", minimum=1, maximum=161, value=96, step=1)
-         with gr.Row():
-             height = gr.Slider(label="height", value=512, step=1, maximum=2048)
-             width = gr.Slider(label="width", value=704, step=1, maximum=2048)
-

-     text_tab.select(fn=change_mode_to_text, inputs=[], outputs=[mode])
-     image_tab.select(fn=change_mode_to_image, inputs=[], outputs=[mode])
-     video_tab.select(fn=change_mode_to_video, inputs=[], outputs=[mode])
-
-     run_button.click(fn=generate,
-                      inputs=[prompt,
-                              negative_prompt,
-                              image,
-                              video,
-                              height,
-                              width,
-                              mode,
-                              steps,
-                              num_frames,
-                              frames_to_use,
-                              seed,
-                              randomize_seed,guidance_scale, improve_texture],
-                      outputs=[output])


-

- demo.launch()
 
  import gradio as gr
+ import spaces
  import torch
  import numpy as np
+ import os
+ import yaml
+ import random
+ from PIL import Image
+ import imageio # For export_to_video and reading video frames
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download

+ # --- LTX-Video Imports (from your provided codebase) ---
+ from ltx_video.pipelines.pipeline_ltx_video import (
+     ConditioningItem,
+     LTXVideoPipeline,
+     LTXMultiScalePipeline,
+ )
+ from ltx_video.models.autoencoders.vae_encode import vae_decode, vae_encode, un_normalize_latents, normalize_latents
+ from inference import (
+     create_ltx_video_pipeline,
+     create_latent_upsampler,
+     load_image_to_tensor_with_resize_and_crop, # Re-using for image conditioning
+     load_media_file, # Re-using for video conditioning
+     get_device,
+     seed_everething,
+     calculate_padding,
+ )
+ from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+ from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
+ # --- End LTX-Video Imports ---

+ # --- Diffusers/Original utils (keeping export_to_video for convenience if it works) ---
+ from diffusers.utils import export_to_video # Keep if it works with PIL list
+ # ---

+ # --- Global Configuration & Model Loading ---
+ DEVICE = get_device()
+ MODEL_DIR = "downloaded_models" # Directory to store downloaded models
+ Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

+ # Load YAML configuration
+ YAML_CONFIG_PATH = "ltxv-13b-0.9.7-distilled.yaml" # Place this file in the same directory
+ with open(YAML_CONFIG_PATH, "r") as f:
+     PIPELINE_CONFIG_YAML = yaml.safe_load(f)
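The rest of the script only reads a handful of keys from this config (checkpoint and upsampler filenames, text encoder, precision, sampler, downscale factor, decode settings, and the first/second pass dicts). A minimal sketch of the parsed structure, with purely illustrative placeholder values rather than the shipped ltxv-13b-0.9.7-distilled.yaml, would look roughly like this:

# Illustrative only: hypothetical values, not the real distilled config.
EXAMPLE_CONFIG = {
    "checkpoint_path": "ltxv-13b-0.9.7-distilled.safetensors",            # hypothetical filename
    "spatial_upscaler_model_path": "ltxv-spatial-upscaler-0.9.7.safetensors",
    "text_encoder_model_name_or_path": "PixArt-alpha/PixArt-XL-2-1024-MS",
    "precision": "bfloat16",
    "sampler": "from_checkpoint",
    "downscale_factor": 2 / 3,
    "decode_timestep": 0.05,
    "decode_noise_scale": 0.025,
    "first_pass": {"timesteps": [1.0, 0.99, 0.98]},   # distilled models use a short, fixed schedule
    "second_pass": {"timesteps": [0.99, 0.98]},
}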

+ # Download and prepare model paths from YAML
+ LTXV_MODEL_FILENAME = PIPELINE_CONFIG_YAML["checkpoint_path"]
+ SPATIAL_UPSCALER_FILENAME = PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]
+ TEXT_ENCODER_PATH = PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"] # This is usually a repo name

+ try:
+     # Main LTX-Video model
+     if not os.path.isfile(os.path.join(MODEL_DIR, LTXV_MODEL_FILENAME)):
+         print(f"Downloading {LTXV_MODEL_FILENAME}...")
+         ltxv_checkpoint_path = hf_hub_download(
+             repo_id="LTX-Colab/LTX-Video-Preview", # Assuming the distilled model is also here or adjust repo_id
+             filename=LTXV_MODEL_FILENAME,
+             local_dir=MODEL_DIR,
+             repo_type="model",
+         )
+     else:
+         ltxv_checkpoint_path = os.path.join(MODEL_DIR, LTXV_MODEL_FILENAME)

+     # Spatial Upsampler model
+     if not os.path.isfile(os.path.join(MODEL_DIR, SPATIAL_UPSCALER_FILENAME)):
+         print(f"Downloading {SPATIAL_UPSCALER_FILENAME}...")
+         spatial_upsampler_path = hf_hub_download(
+             repo_id="Lightricks/LTX-Video",
+             filename=SPATIAL_UPSCALER_FILENAME,
+             local_dir=MODEL_DIR,
+             repo_type="model",
+         )
+     else:
+         spatial_upsampler_path = os.path.join(MODEL_DIR, SPATIAL_UPSCALER_FILENAME)
+ except Exception as e:
+     print(f"Error downloading models: {e}")
+     print("Please ensure model files are correctly specified and accessible.")
+     # Depending on severity, you might want to exit or disable GPU features
+     # For now, we'll let it proceed and potentially fail later if paths are invalid.
+     ltxv_checkpoint_path = LTXV_MODEL_FILENAME # Fallback to filename if download fails
+     spatial_upsampler_path = SPATIAL_UPSCALER_FILENAME
+
+
+ print(f"Using LTX-Video checkpoint: {ltxv_checkpoint_path}")
+ print(f"Using Spatial Upsampler: {spatial_upsampler_path}")
+ print(f"Using Text Encoder: {TEXT_ENCODER_PATH}")
+
+ # Create LTX-Video pipeline
+ pipe = create_ltx_video_pipeline(
+     ckpt_path=ltxv_checkpoint_path,
+     precision=PIPELINE_CONFIG_YAML["precision"],
+     text_encoder_model_name_or_path=TEXT_ENCODER_PATH,
+     sampler=PIPELINE_CONFIG_YAML["sampler"], # "from_checkpoint" or specific sampler
+     device=DEVICE,
+     enhance_prompt=False, # Assuming Gradio controls this, or set based on YAML later
+ )
+
+ # Create Latent Upsampler
+ latent_upsampler = create_latent_upsampler(
+     latent_upsampler_model_path=spatial_upsampler_path,
+     device=DEVICE
+ )
+ latent_upsampler = latent_upsampler.to(torch.bfloat16 if PIPELINE_CONFIG_YAML["precision"] == "bfloat16" else torch.float32)
+
+
+ # Multi-scale pipeline (wrapper)
+ multi_scale_pipe = LTXMultiScalePipeline(
+     video_pipeline=pipe,
+     latent_upsampler=latent_upsampler
+ )
+ # --- End Global Configuration & Model Loading ---
+
+
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 2048 # Not strictly used here, but good to keep in mind
+
+
+ def round_to_nearest_resolution_acceptable_by_vae(height, width, vae_scale_factor):
+     # print("before rounding",height, width)
+     height = height - (height % vae_scale_factor)
+     width = width - (width % vae_scale_factor)
+     # print("after rounding",height, width)
+     return height, width
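As a quick illustration of the helper above (a spatial factor of 32 is assumed here; at runtime the value comes from pipe.vae.spatial_downscale_factor), both dimensions are floored to the nearest acceptable multiple:

# 530 -> 512 and 710 -> 704 when the VAE factor is 32; already-aligned sizes pass through unchanged.
assert round_to_nearest_resolution_acceptable_by_vae(530, 710, 32) == (512, 704)
assert round_to_nearest_resolution_acceptable_by_vae(512, 704, 32) == (512, 704)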

  @spaces.GPU
  def generate(prompt,
               negative_prompt,
+              image_path, # Gradio gives filepath for Image component
+              video_path, # Gradio gives filepath for Video component
               height,
               width,
               mode,
+              steps, # This will map to num_inference_steps for the first pass
               num_frames,
               frames_to_use,
               seed,
               randomize_seed,
               guidance_scale,
               improve_texture=False, progress=gr.Progress(track_tqdm=True)):
+
      if randomize_seed:
          seed = random.randint(0, MAX_SEED)
+     seed_everething(seed)
+
+     generator = torch.Generator(device=DEVICE).manual_seed(seed)

+     # --- Prepare conditioning items ---
+     conditioning_items_list = []
+     input_media_for_vid2vid = None # For the specific vid2vid mode in LTX pipeline
+
+     # Pad target dimensions
+     # VAE scale factor is typically 8 for spatial, but LTX might have its own specific factor.
+     # CausalVideoAutoencoder has spatial_downscale_factor and temporal_downscale_factor
+     vae_spatial_scale_factor = pipe.vae.spatial_downscale_factor
+     vae_temporal_scale_factor = pipe.vae.temporal_downscale_factor
+
+     # Ensure target height/width are multiples of VAE spatial scale factor
+     height_padded_target = ((height - 1) // vae_spatial_scale_factor + 1) * vae_spatial_scale_factor
+     width_padded_target = ((width - 1) // vae_spatial_scale_factor + 1) * vae_spatial_scale_factor

+     # Ensure num_frames is multiple of VAE temporal scale factor + 1 (for causal VAE)
+     # (num_frames - 1) should be multiple of temporal_scale_factor for non-causal parts
+     # For CausalVAE, it's often (N * temporal_factor) + 1 frames.
+     # The inference script uses: num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1
+     # Assuming 8 is the temporal scale factor here for simplicity, adjust if different
+     num_frames_padded_target = ((num_frames - 2) // vae_temporal_scale_factor + 1) * vae_temporal_scale_factor + 1
+
+
+     padding_target = calculate_padding(height, width, height_padded_target, width_padded_target)
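To make the rounding above concrete, here is the arithmetic with the UI defaults (height=512, width=704, num_frames=25) and the commonly used LTX factors of 32 (spatial) and 8 (temporal), both of which are assumptions in this example:

assert ((512 - 1) // 32 + 1) * 32 == 512        # height already a multiple of 32 -> no vertical padding
assert ((704 - 1) // 32 + 1) * 32 == 704        # width already a multiple of 32 -> no horizontal padding
assert ((25 - 2) // 8 + 1) * 8 + 1 == 25        # 25 frames already has the N*8 + 1 form
assert ((30 - 2) // 8 + 1) * 8 + 1 == 33        # an off-grid request like 30 frames is padded up to 33

With those defaults the padded targets equal the requested sizes, so calculate_padding should return zero padding on all four sides.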
+
+
+     if mode == "video-to-video" and video_path:
+         # LTX pipeline's vid2vid uses `media_items` argument for the full video to transform
+         # and `conditioning_items` for specific keyframes if needed.
+         # Here, the Gradio's "video-to-video" seems to imply transforming the input video.
+         input_media_for_vid2vid = load_media_file(
+             media_path=video_path,
+             height=height, # Original height before padding for loading
+             width=width, # Original width
+             max_frames=min(num_frames_padded_target, frames_to_use if frames_to_use > 0 else num_frames_padded_target),
+             padding=padding_target, # Padding to make it compatible with VAE of target size
+         )
+         # If we also want to strongly condition on the first frame(s) of this video:
+         conditioning_media = load_media_file(
+             media_path=video_path,
+             height=height, width=width,
+             max_frames=min(frames_to_use if frames_to_use > 0 else 1, num_frames_padded_target), # Use specified frames or just the first
+             padding=padding_target,
+             just_crop=True # Crop to aspect ratio, then resize
+         )
+         conditioning_items_list.append(ConditioningItem(media_item=conditioning_media, media_frame_number=0, conditioning_strength=1.0))
+
+     elif mode == "image-to-video" and image_path:
+         conditioning_media = load_image_to_tensor_with_resize_and_crop(
+             image_input=image_path,
+             target_height=height, # Original height
+             target_width=width # Original width
+         )
+         # Apply padding to the loaded tensor
+         conditioning_media = torch.nn.functional.pad(conditioning_media, padding_target)
+         conditioning_items_list.append(ConditioningItem(media_item=conditioning_media, media_frame_number=0, conditioning_strength=1.0))
+
+     # else mode is "text-to-video", no explicit conditioning items unless defined elsewhere
+
+     # --- Get pipeline parameters from YAML ---
+     first_pass_config = PIPELINE_CONFIG_YAML.get("first_pass", {})
+     second_pass_config = PIPELINE_CONFIG_YAML.get("second_pass", {})
+     downscale_factor = PIPELINE_CONFIG_YAML.get("downscale_factor", 2/3)
+
+     # Override steps from Gradio if provided, for the first pass
+     if steps:
+         # The YAML timesteps are specific, so overriding num_inference_steps might not be what we want
+         # If YAML has `timesteps`, `num_inference_steps` is ignored by LTXVideoPipeline.
+         # If YAML does not have `timesteps`, then `num_inference_steps` from Gradio will be used for the first pass.
+         first_pass_config["num_inference_steps"] = steps
+         # For distilled model, the second pass steps are usually very few, defined by its timesteps.
+         # We won't override second_pass_config["num_inference_steps"] from the Gradio `steps`
+         # as it's meant for the primary generation.
+
+     # Determine initial generation dimensions (downscaled)
+     # These are the dimensions for the *first pass* of the multi-scale pipeline
+     initial_gen_height = int(height_padded_target * downscale_factor)
+     initial_gen_width = int(width_padded_target * downscale_factor)
+
+     initial_gen_height, initial_gen_width = round_to_nearest_resolution_acceptable_by_vae(
+         initial_gen_height, initial_gen_width, vae_spatial_scale_factor
+     )
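For the same defaults, and assuming downscale_factor stays at 2/3 with a spatial factor of 32, the first pass therefore runs at roughly two thirds of the requested resolution before the 2x upsample:

assert int(512 * (2 / 3)) == 341 and 341 - (341 % 32) == 320   # initial_gen_height
assert int(704 * (2 / 3)) == 469 and 469 - (469 % 32) == 448   # initial_gen_width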
+
+     shared_pipeline_args = {
+         "prompt": prompt,
+         "negative_prompt": negative_prompt,
+         "num_frames": num_frames_padded_target, # Always generate padded num_frames
+         "frame_rate": 30, # Example, or get from UI if available
+         "guidance_scale": guidance_scale,
+         "generator": generator,
+         "conditioning_items": conditioning_items_list if conditioning_items_list else None,
+         "skip_layer_strategy": SkipLayerStrategy.AttentionValues, # Default or from YAML
+         "offload_to_cpu": False, # Managed by global DEVICE
+         "is_video": True,
+         "vae_per_channel_normalize": True, # Common default
+         "mixed_precision": (PIPELINE_CONFIG_YAML["precision"] == "bfloat16"),
+         "enhance_prompt": False, # Controlled by Gradio app logic if needed for full LTX script
+         "image_cond_noise_scale": 0.025, # from YAML decode_noise_scale, or make it a param
+         "media_items": input_media_for_vid2vid if mode == "video-to-video" else None,
+         # "decode_timestep" and "decode_noise_scale" are part of first_pass/second_pass or direct call
+     }
+
+     # --- Generation ---
      if improve_texture:
+         print("Using LTXMultiScalePipeline for generation...")
+         # Ensure first_pass_config and second_pass_config have necessary overrides
+         # The 'steps' from Gradio applies to the first pass's num_inference_steps if timesteps not set
+         if "timesteps" not in first_pass_config:
+             first_pass_config["num_inference_steps"] = steps
+
+         first_pass_config.setdefault("decode_timestep", PIPELINE_CONFIG_YAML.get("decode_timestep", 0.05))
+         first_pass_config.setdefault("decode_noise_scale", PIPELINE_CONFIG_YAML.get("decode_noise_scale", 0.025))
+         second_pass_config.setdefault("decode_timestep", PIPELINE_CONFIG_YAML.get("decode_timestep", 0.05))
+         second_pass_config.setdefault("decode_noise_scale", PIPELINE_CONFIG_YAML.get("decode_noise_scale", 0.025))
+
+         # The multi_scale_pipe's __call__ expects width and height for the *initial* (downscaled) generation
+         result_frames_tensor = multi_scale_pipe(
+             **shared_pipeline_args,
+             width=initial_gen_width,
+             height=initial_gen_height,
+             downscale_factor=downscale_factor, # This might be used internally by multi_scale_pipe
+             first_pass=first_pass_config,
+             second_pass=second_pass_config,
+             output_type="pt" # Get tensor for further processing
+         ).images

+         # LTXMultiScalePipeline should return images at 2x the initial_gen_width/height
+         # So, result_frames_tensor is at initial_gen_width*2, initial_gen_height*2
+
      else:
+         print("Using LTXVideoPipeline (first pass) + Manual Upsample + Decode...")
+         # 1. First pass generation at downscaled resolution
+         if "timesteps" not in first_pass_config:
+             first_pass_config["num_inference_steps"] = steps
+
+         first_pass_args = {
+             **shared_pipeline_args,
+             **first_pass_config,
+             "width": initial_gen_width,
+             "height": initial_gen_height,
+             "output_type": "latent"
+         }
+         latents = pipe(**first_pass_args).images # .images here is actually latents
+
+         # 2. Upsample latents manually
+         # Need to handle normalization around latent upsampler if it expects unnormalized latents
+         latents_unnorm = un_normalize_latents(latents, pipe.vae, vae_per_channel_normalize=True)
+         upsampled_latents_unnorm = latent_upsampler(latents_unnorm)
+         upsampled_latents = normalize_latents(upsampled_latents_unnorm, pipe.vae, vae_per_channel_normalize=True)
+
+         # 3. Decode upsampled latents
+         # The upsampler typically doubles the spatial dimensions
+         upscaled_height_for_decode = initial_gen_height * 2
+         upscaled_width_for_decode = initial_gen_width * 2
+
+         # Prepare target_shape for VAE decoder
+         # batch_size, channels, num_frames, height, width
+         # Latents are (B, C, F_latent, H_latent, W_latent)
+         # Target shape for vae.decode is pixel space
+         # num_video_frames_final = upsampled_latents.shape[2] * pipe.vae.temporal_downscale_factor
+         # if causal, it might be (F_latent - 1) * factor + 1
+         num_video_frames_final = (upsampled_latents.shape[2] -1) * pipe.vae.temporal_downscale_factor + 1
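As a sanity check of that causal frame arithmetic (again assuming a temporal factor of 8): a latent stack with 4 frames decodes to 25 video frames, which matches the 25-frame default exposed in the UI below.

assert (4 - 1) * 8 + 1 == 25   # latent frames -> decoded video frames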
+
+
+         decode_kwargs = {
+             "target_shape": (
+                 upsampled_latents.shape[0], # batch
+                 3, # out channels
+                 num_video_frames_final,
+                 upscaled_height_for_decode,
+                 upscaled_width_for_decode
+             )
+         }
+         if pipe.vae.decoder.timestep_conditioning:
+             decode_kwargs["timestep"] = torch.tensor([PIPELINE_CONFIG_YAML.get("decode_timestep", 0.05)] * upsampled_latents.shape[0]).to(DEVICE)
+             # Add noise for decode if specified, similar to LTXVideoPipeline's call
+             noise = torch.randn_like(upsampled_latents)
+             decode_noise_val = PIPELINE_CONFIG_YAML.get("decode_noise_scale", 0.025)
+             upsampled_latents = upsampled_latents * (1 - decode_noise_val) + noise * decode_noise_val
+
+
+         result_frames_tensor = pipe.vae.decode(upsampled_latents, **decode_kwargs).sample
+         # result_frames_tensor shape: (B, C, F_video, H_video, W_video)
+
+     # --- Post-processing: Cropping and Converting to PIL ---
+     # Crop to original num_frames (before padding)
+     result_frames_tensor = result_frames_tensor[:, :, :num_frames, :, :]
+
+     # Unpad to target height and width
+     _, _, _, current_h, current_w = result_frames_tensor.shape
+
+     # Calculate crop needed if current dimensions are larger than padded_target
+     # This happens if multi_scale_pipe output is larger than height_padded_target
+     crop_y_start = (current_h - height_padded_target) // 2
+     crop_x_start = (current_w - width_padded_target) // 2
+
+     result_frames_tensor = result_frames_tensor[
+         :, :, :,
+         crop_y_start : crop_y_start + height_padded_target,
+         crop_x_start : crop_x_start + width_padded_target
+     ]

+     # Now remove the padding added for VAE compatibility
+     pad_left, pad_right, pad_top, pad_bottom = padding_target
+     unpad_bottom = -pad_bottom if pad_bottom > 0 else result_frames_tensor.shape[3]
+     unpad_right = -pad_right if pad_right > 0 else result_frames_tensor.shape[4]

+     result_frames_tensor = result_frames_tensor[
+         :, :, :,
+         pad_top : unpad_bottom,
+         pad_left : unpad_right
+     ]
+
+
+     # Convert tensor to list of PIL Images
+     video_pil_list = []
+     # result_frames_tensor shape: (B, C, F, H, W)
+     # We expect B=1 from typical generation
+     video_single_batch = result_frames_tensor[0] # Shape: (C, F, H, W)
+     video_single_batch = (video_single_batch / 2 + 0.5).clamp(0, 1) # Normalize to [0,1]
+     video_single_batch = video_single_batch.permute(1, 2, 3, 0).cpu().numpy() # F, H, W, C
+
+     for frame_idx in range(video_single_batch.shape[0]):
+         frame_np = (video_single_batch[frame_idx] * 255).astype(np.uint8)
+         video_pil_list.append(Image.fromarray(frame_np))
+
+     # Save video
+     output_video_path = "output.mp4" # Gradio handles temp files
+     export_to_video(video_pil_list, output_video_path, fps=24) # Assuming fps from original script
+     return output_video_path
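Since the click handlers below feed generate positionally, the argument order matters; a direct, hypothetical call outside Gradio (text-to-video, UI default sizes) would look like this:

# Hypothetical standalone invocation mirroring the positional wiring of the .click() calls below.
video_file = generate(
    "A majestic dragon flying over a medieval castle",                      # prompt
    "worst quality, inconsistent motion, blurry, jittery, distorted",       # negative_prompt
    None,              # image_path (unused for text-to-video)
    None,              # video_path (unused for text-to-video)
    512,               # height
    704,               # width
    "text-to-video",   # mode
    8,                 # steps (first pass, ignored if the YAML pins timesteps)
    25,                # num_frames
    0,                 # frames_to_use
    42,                # seed
    False,             # randomize_seed
    1.0,               # guidance_scale
    True,              # improve_texture
)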


  css="""

  }
  """

+ with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:
+     gr.Markdown("# LTX Video 0.9.7 Distilled (using LTX-Video lib)")
+     with gr.Row():
+         with gr.Column():
+             with gr.Group():
+                 with gr.Tab("text-to-video") as text_tab:
+                     image_n = gr.Image(label="", visible=False, value=None) # Ensure None for path
+                     video_n = gr.Video(label="", visible=False, value=None) # Ensure None for path
+                     t2v_prompt = gr.Textbox(label="prompt", value="A majestic dragon flying over a medieval castle")
+                     t2v_button = gr.Button("Generate Text-to-Video")
+                 with gr.Tab("image-to-video") as image_tab:
+                     video_i = gr.Video(label="", visible=False, value=None)
+                     image_i2v = gr.Image(label="input image", type="filepath")
+                     i2v_prompt = gr.Textbox(label="prompt", value="The creature from the image starts to move")
+                     i2v_button = gr.Button("Generate Image-to-Video")
+                 with gr.Tab("video-to-video") as video_tab:
+                     image_v = gr.Image(label="", visible=False, value=None)
+                     video_v2v = gr.Video(label="input video", type="filepath")
+                     frames_to_use = gr.Number(label="num frames to use",info="first # of frames to use from the input video for conditioning/transformation", value=9)
+                     v2v_prompt = gr.Textbox(label="prompt", value="Change the style to cinematic anime")
+                     v2v_button = gr.Button("Generate Video-to-Video")

+             improve_texture = gr.Checkbox(label="improve texture (multi-scale)", value=True, info="Uses a two-pass generation for better quality, but is slower.")

+         with gr.Column():
+             output = gr.Video(interactive=False)

+     with gr.Accordion("Advanced settings", open=False):
+         negative_prompt_input = gr.Textbox(label="negative prompt", value="worst quality, inconsistent motion, blurry, jittery, distorted")
+         with gr.Row():
+             seed_input = gr.Number(label="seed", value=42, precision=0)
+             randomize_seed_input = gr.Checkbox(label="randomize seed", value=False)
+         with gr.Row():
+             guidance_scale_input = gr.Slider(label="guidance scale", minimum=0, maximum=10, value=1.0, step=0.1, info="For distilled models, CFG is often 1.0 (disabled) or very low.") # Distilled model might not need high CFG
+             steps_input = gr.Slider(label="Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*8).__len__(), step=1, info="Number of inference steps. If YAML defines timesteps, this is ignored for that pass.") # Default to length of first_pass timesteps
+             num_frames_input = gr.Slider(label="# frames", minimum=9, maximum=121, value=25, step=8, info="Should be N*8+1, e.g., 9, 17, 25...") # Adjusted for LTX structure
+         with gr.Row():
+             height_input = gr.Slider(label="height", value=512, step=8, minimum=256, maximum=MAX_IMAGE_SIZE) # Step by VAE factor
+             width_input = gr.Slider(label="width", value=704, step=8, minimum=256, maximum=MAX_IMAGE_SIZE) # Step by VAE factor

+     t2v_button.click(fn=generate,
+                      inputs=[t2v_prompt,
+                              negative_prompt_input,
+                              image_n, # Pass None for image
+                              video_n, # Pass None for video
+                              height_input,
+                              width_input,
+                              gr.State("text-to-video"),
+                              steps_input,
+                              num_frames_input,
+                              gr.State(0), # frames_to_use not relevant for t2v
+                              seed_input,
+                              randomize_seed_input, guidance_scale_input, improve_texture],
+                      outputs=[output])

+     i2v_button.click(fn=generate,
+                      inputs=[i2v_prompt,
+                              negative_prompt_input,
+                              image_i2v,
+                              video_i, # Pass None for video
+                              height_input,
+                              width_input,
+                              gr.State("image-to-video"),
+                              steps_input,
+                              num_frames_input,
+                              gr.State(0), # frames_to_use not relevant for i2v initial frame
+                              seed_input,
+                              randomize_seed_input, guidance_scale_input, improve_texture],
+                      outputs=[output])

+     v2v_button.click(fn=generate,
+                      inputs=[v2v_prompt,
+                              negative_prompt_input,
+                              image_v, # Pass None for image
+                              video_v2v,
+                              height_input,
+                              width_input,
+                              gr.State("video-to-video"),
+                              steps_input,
+                              num_frames_input,
+                              frames_to_use,
+                              seed_input,
+                              randomize_seed_input, guidance_scale_input, improve_texture],
+                      outputs=[output])

+ demo.launch()