multimodalart (HF Staff) committed
Commit 626b672 · verified · 1 Parent(s): 70c8ddd

Update app.py

Files changed (1):
  1. app.py +118 -117

app.py CHANGED
@@ -62,22 +62,17 @@ DISTILLED_MODEL_REPO = "LTX-Colab/LTX-Video-Preview"
62
  DISTILLED_MODEL_FILENAME = "ltxv-13b-0.9.7-distilled-rc3.safetensors"
63
 
64
  UPSCALER_REPO = "Lightricks/LTX-Video"
65
- # SPATIAL_UPSCALER_FILENAME will be taken from PIPELINE_CONFIG_YAML after it's loaded
66
 
67
- MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280) # Max width/height from UI
68
- MAX_NUM_FRAMES = 257 # From inference.py
69
 
70
  # --- Global variables for loaded models ---
71
  pipeline_instance = None
72
  latent_upsampler_instance = None
73
- current_device = get_device()
74
- models_dir = "downloaded_models_gradio" # Use a distinct name
75
  Path(models_dir).mkdir(parents=True, exist_ok=True)
76
 
77
- # Download models and update config paths
78
- print(f"Using device: {current_device}")
79
- print("Downloading models...")
80
-
81
  distilled_model_actual_path = hf_hub_download(
82
  repo_id=DISTILLED_MODEL_REPO,
83
  filename=DISTILLED_MODEL_FILENAME,
@@ -85,7 +80,7 @@ distilled_model_actual_path = hf_hub_download(
85
  local_dir_use_symlinks=False
86
  )
87
  PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
88
- print(f"Distilled model downloaded to: {distilled_model_actual_path}")
89
 
90
  SPATIAL_UPSCALER_FILENAME = PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]
91
  spatial_upscaler_actual_path = hf_hub_download(
@@ -95,29 +90,28 @@ spatial_upscaler_actual_path = hf_hub_download(
95
  local_dir_use_symlinks=False
96
  )
97
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
98
- print(f"Spatial upscaler model downloaded to: {spatial_upscaler_actual_path}")
99
 
100
- # Load pipelines
101
- print("Creating LTX Video pipeline...")
102
  pipeline_instance = create_ltx_video_pipeline(
103
  ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
104
  precision=PIPELINE_CONFIG_YAML["precision"],
105
  text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
106
  sampler=PIPELINE_CONFIG_YAML["sampler"],
107
- device=current_device,
108
- enhance_prompt=False, # Prompt enhancement handled by UI choice / Gradio logic if desired
109
  prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
110
  prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
111
  )
112
- print("LTX Video pipeline created.")
113
 
114
  if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
115
- print("Creating latent upsampler...")
116
  latent_upsampler_instance = create_latent_upsampler(
117
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
118
- device=current_device
119
  )
120
- print("Latent upsampler created.")
121
 
122
 
123
  def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
@@ -125,7 +119,10 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
125
  ui_steps, num_frames_ui,
126
  ui_frames_to_use,
127
  seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
128
- progress=gr.Progress(track_tqdm=True)):
129
 
130
  if randomize_seed:
131
  seed_ui = random.randint(0, 2**32 - 1)
@@ -135,7 +132,6 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
135
  actual_width = int(width_ui)
136
  actual_num_frames = int(num_frames_ui)
137
 
138
- # Padded dimensions for pipeline
139
  height_padded = ((actual_height - 1) // 32 + 1) * 32
140
  width_padded = ((actual_width - 1) // 32 + 1) * 32
141
  num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
@@ -145,23 +141,23 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
145
  call_kwargs = {
146
  "prompt": prompt,
147
  "negative_prompt": negative_prompt,
148
- "height": height_padded, # Use padded for pipeline
149
- "width": width_padded, # Use padded for pipeline
150
- "num_frames": num_frames_padded, # Use padded for pipeline
151
  "frame_rate": 30,
152
- "generator": torch.Generator(device=current_device).manual_seed(int(seed_ui)),
153
- "output_type": "pt",
154
  "conditioning_items": None,
155
  "media_items": None,
156
  "decode_timestep": PIPELINE_CONFIG_YAML["decode_timestep"],
157
  "decode_noise_scale": PIPELINE_CONFIG_YAML["decode_noise_scale"],
158
  "stochastic_sampling": PIPELINE_CONFIG_YAML["stochastic_sampling"],
159
- "image_cond_noise_scale": 0.15, # from inference.py defaults
160
- "is_video": True, # Assume video output
161
- "vae_per_channel_normalize": True, # from inference.py defaults
162
  "mixed_precision": (PIPELINE_CONFIG_YAML["precision"] == "mixed_precision"),
163
- "offload_to_cpu": False, # For Gradio, keep on device
164
- "enhance_prompt": False, # Assuming no UI for this yet, stick to YAML or handle separately
165
  }
166
 
167
  stg_mode_str = PIPELINE_CONFIG_YAML.get("stg_mode", "attention_values")
@@ -178,17 +174,14 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
178
 
179
  if mode == "image-to-video" and input_image_filepath:
180
  try:
181
- # Ensure the input image is loaded with original H/W for correct aspect ratio handling by the function
182
  media_tensor = load_image_to_tensor_with_resize_and_crop(
183
  input_image_filepath, actual_height, actual_width
184
  )
185
  media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
186
- call_kwargs["conditioning_items"] = [ConditioningItem(media_tensor.to(current_device), 0, 1.0)]
187
  except Exception as e:
188
  print(f"Error loading image {input_image_filepath}: {e}")
189
  raise gr.Error(f"Could not load image: {e}")
190
-
191
-
192
  elif mode == "video-to-video" and input_video_filepath:
193
  try:
194
  call_kwargs["media_items"] = load_media_file(
@@ -197,73 +190,84 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
197
  width=actual_width,
198
  max_frames=int(ui_frames_to_use),
199
  padding=padding_values
200
- ).to(current_device)
201
  except Exception as e:
202
  print(f"Error loading video {input_video_filepath}: {e}")
203
  raise gr.Error(f"Could not load video: {e}")
204
 
205
- # Multi-scale or single-scale pipeline call
206
- if improve_texture_flag:
207
- if not latent_upsampler_instance:
208
- raise gr.Error("Spatial upscaler model not loaded, cannot use multi-scale.")
 
209
 
210
- multi_scale_pipeline_obj = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
211
-
212
- # Prepare pass-specific arguments, overriding with UI inputs where appropriate
213
- first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
214
- first_pass_args["guidance_scale"] = float(ui_guidance_scale)
215
- if "timesteps" not in first_pass_args: # Only if YAML doesn't define timesteps
216
- first_pass_args["num_inference_steps"] = int(ui_steps)
217
-
218
- second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
219
- second_pass_args["guidance_scale"] = float(ui_guidance_scale)
220
- # num_inference_steps for second pass is typically determined by its YAML timesteps
221
-
222
- multi_scale_call_kwargs = call_kwargs.copy()
223
- multi_scale_call_kwargs.update({
224
- "downscale_factor": PIPELINE_CONFIG_YAML["downscale_factor"],
225
- "first_pass": first_pass_args,
226
- "second_pass": second_pass_args,
227
- })
228
-
229
- print(f"Calling multi-scale pipeline with effective height={actual_height}, width={actual_width}")
230
- result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
231
- else:
232
- # Single pass call (using base pipeline)
233
- single_pass_call_kwargs = call_kwargs.copy()
234
- single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)
235
-
236
- # For single pass, if YAML doesn't have top-level timesteps, use ui_steps
237
- # The current YAML is multi-scale focused, so it lacks top-level step control.
238
- # We'll assume for a base call, num_inference_steps is directly taken from UI.
239
- single_pass_call_kwargs["num_inference_steps"] = int(ui_steps)
240
- # Remove pass-specific args if they accidentally slipped in
241
- single_pass_call_kwargs.pop("first_pass", None)
242
- single_pass_call_kwargs.pop("second_pass", None)
243
- single_pass_call_kwargs.pop("downscale_factor", None)
244
-
245
- print(f"Calling base pipeline with height={height_padded}, width={width_padded}")
246
- result_images_tensor = pipeline_instance(**single_pass_call_kwargs).images
247
 
248
- # Crop to original requested dimensions (num_frames, height, width)
249
- # Padding: (pad_left, pad_right, pad_top, pad_bottom)
250
  pad_left, pad_right, pad_top, pad_bottom = padding_values
251
-
252
- # Calculate slice indices, ensuring they don't go negative if padding was zero
253
  slice_h_end = -pad_bottom if pad_bottom > 0 else None
254
  slice_w_end = -pad_right if pad_right > 0 else None
255
-
256
  result_images_tensor = result_images_tensor[
257
  :, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end
258
  ]
259
 
260
- # Convert tensor to video file
261
  video_np = result_images_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
262
- video_np = np.clip(video_np * 0.5 + 0.5, 0, 1) # from [-1,1] to [0,1]
263
  video_np = (video_np * 255).astype(np.uint8)
264
 
265
  temp_dir = tempfile.mkdtemp()
266
- timestamp = random.randint(10000,99999) # Add timestamp to avoid caching issues
267
  output_video_path = os.path.join(temp_dir, f"output_{timestamp}.mp4")
268
 
269
  try:
@@ -272,31 +276,39 @@ def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath
272
  progress(frame_idx / video_np.shape[0], desc="Saving video")
273
  video_writer.append_data(video_np[frame_idx])
274
  except Exception as e:
275
- print(f"Error saving video: {e}")
276
- # Fallback to saving frame by frame if container issue
277
  try:
278
- with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], format='FFMPEG', codec='libx264', quality=8, macro_block_size=None) as video_writer:
279
  for frame_idx in range(video_np.shape[0]):
280
- progress(frame_idx / video_np.shape[0], desc="Saving video (fallback)")
281
  video_writer.append_data(video_np[frame_idx])
282
  except Exception as e2:
283
  print(f"Fallback video saving error: {e2}")
284
  raise gr.Error(f"Failed to save video: {e2}")
285
 
286
-
287
- # Clean up temporary image/video files if they were created by Gradio
288
  if isinstance(input_image_filepath, tempfile._TemporaryFileWrapper):
289
- input_image_filepath.close()
290
- if os.path.exists(input_image_filepath.name):
291
- os.remove(input_image_filepath.name)
292
  if isinstance(input_video_filepath, tempfile._TemporaryFileWrapper):
293
- input_video_filepath.close()
294
  if os.path.exists(input_video_filepath.name):
295
- os.remove(input_video_filepath.name)
296
 
297
  return output_video_path
298
 
299
- # --- Gradio UI Definition (from user) ---
300
  css="""
301
  #col-container {
302
  margin: 0 auto;
@@ -304,14 +316,13 @@ css="""
304
  }
305
  """
306
 
307
- with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo: # Changed theme for variety
308
  gr.Markdown("# LTX Video 0.9.7 Distilled (using LTX-Video lib)")
309
- gr.Markdown("Generates a short video based on text prompt, image, or existing video.")
310
  with gr.Row():
311
  with gr.Column():
312
  with gr.Group():
313
  with gr.Tab("text-to-video") as text_tab:
314
- # Hidden inputs for consistent generate() signature
315
  image_n_hidden = gr.Textbox(label="image_n", visible=False, value=None)
316
  video_n_hidden = gr.Textbox(label="video_n", visible=False, value=None)
317
  t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
@@ -340,10 +351,8 @@ with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo: # Changed theme for va
340
  seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
341
  randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=False)
342
  with gr.Row():
343
- # For distilled models, CFG is often 1.0 (disabled) or very low.
344
  guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
345
- # Default to length of first_pass timesteps, if available
346
- default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7)) # Fallback to 7 if not defined
347
  steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
348
  with gr.Row():
349
  num_frames_input = gr.Slider(label="Number of Frames to Generate", minimum=9, maximum=MAX_NUM_FRAMES, value=25, step=8, info="Total frames in the output video. Should be N*8+1 (e.g., 9, 17, 25...).")
@@ -351,19 +360,14 @@ with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo: # Changed theme for va
351
  height_input = gr.Slider(label="Height", value=512, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
352
  width_input = gr.Slider(label="Width", value=704, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
353
 
354
- # Define click actions
355
- # Note: gr.State passes the current value of the component without creating a UI element for it.
356
- # We use hidden Textbox inputs for image_n, video_n etc. and pass their `value` (which is None)
357
- # to ensure the `generate` function always receives these arguments.
358
-
359
  t2v_inputs = [t2v_prompt, negative_prompt_input, image_n_hidden, video_n_hidden,
360
  height_input, width_input, gr.State("text-to-video"),
361
- steps_input, num_frames_input, gr.State(0), # frames_to_use not relevant for t2v
362
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
363
 
364
  i2v_inputs = [i2v_prompt, negative_prompt_input, image_i2v, video_i_hidden,
365
  height_input, width_input, gr.State("image-to-video"),
366
- steps_input, num_frames_input, gr.State(0), # frames_to_use not relevant for i2v initial frame
367
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
368
 
369
  v2v_inputs = [v2v_prompt, negative_prompt_input, image_v_hidden, video_v2v,
@@ -371,15 +375,12 @@ with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo: # Changed theme for va
371
  steps_input, num_frames_input, frames_to_use,
372
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
373
 
374
- t2v_button.click(fn=generate, inputs=t2v_inputs, outputs=[output_video])
375
- i2v_button.click(fn=generate, inputs=i2v_inputs, outputs=[output_video])
376
- v2v_button.click(fn=generate, inputs=v2v_inputs, outputs=[output_video])
377
 
378
  if __name__ == "__main__":
379
- # Clean up old model directory if it exists from previous runs
380
  if os.path.exists(models_dir) and os.path.isdir(models_dir):
381
- print(f"Cleaning up old model directory: {models_dir}")
382
- # shutil.rmtree(models_dir) # Optional: uncomment to force re-download on every run
383
- Path(models_dir).mkdir(parents=True, exist_ok=True)
384
 
385
  demo.queue().launch(debug=True, share=False)
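
Note on the dimension handling above: the requested height/width are rounded up to multiples of 32 and the frame count to the nearest N*8 + 1 before the pipeline call, and the padding is cropped back off after decoding using the (pad_left, pad_right, pad_top, pad_bottom) values. A minimal sketch of that arithmetic, assuming a symmetric split of the padding; calculate_padding here is a hypothetical helper, since the app's own helper is not shown in this diff:

def pad_dimensions(height, width, num_frames):
    # Round spatial dims up to multiples of 32, frames up to N*8 + 1.
    height_padded = ((height - 1) // 32 + 1) * 32
    width_padded = ((width - 1) // 32 + 1) * 32
    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1
    return height_padded, width_padded, num_frames_padded

def calculate_padding(height, width, height_padded, width_padded):
    # Hypothetical symmetric split, matching the
    # (pad_left, pad_right, pad_top, pad_bottom) order unpacked before the crop.
    pad_h = height_padded - height
    pad_w = width_padded - width
    pad_top, pad_left = pad_h // 2, pad_w // 2
    return (pad_left, pad_w - pad_left, pad_top, pad_h - pad_top)

# Example: a 500x704, 25-frame request
hp, wp, fp = pad_dimensions(500, 704, 25)             # -> 512, 704, 25
padding_values = calculate_padding(500, 704, hp, wp)  # -> (0, 0, 6, 6)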
 
62
  DISTILLED_MODEL_FILENAME = "ltxv-13b-0.9.7-distilled-rc3.safetensors"
63
 
64
  UPSCALER_REPO = "Lightricks/LTX-Video"
 
65
 
66
+ MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
67
+ MAX_NUM_FRAMES = 257
68
 
69
  # --- Global variables for loaded models ---
70
  pipeline_instance = None
71
  latent_upsampler_instance = None
72
+ models_dir = "downloaded_models_gradio_cpu_init"
 
73
  Path(models_dir).mkdir(parents=True, exist_ok=True)
74
 
75
+ print("Downloading models (if not present)...")
 
 
 
76
  distilled_model_actual_path = hf_hub_download(
77
  repo_id=DISTILLED_MODEL_REPO,
78
  filename=DISTILLED_MODEL_FILENAME,
 
80
  local_dir_use_symlinks=False
81
  )
82
  PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
83
+ print(f"Distilled model path: {distilled_model_actual_path}")
84
 
85
  SPATIAL_UPSCALER_FILENAME = PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]
86
  spatial_upscaler_actual_path = hf_hub_download(
 
90
  local_dir_use_symlinks=False
91
  )
92
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
93
+ print(f"Spatial upscaler model path: {spatial_upscaler_actual_path}")
94
 
95
+ print("Creating LTX Video pipeline on CPU...")
 
96
  pipeline_instance = create_ltx_video_pipeline(
97
  ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
98
  precision=PIPELINE_CONFIG_YAML["precision"],
99
  text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
100
  sampler=PIPELINE_CONFIG_YAML["sampler"],
101
+ device="cpu",
102
+ enhance_prompt=False,
103
  prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
104
  prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
105
  )
106
+ print("LTX Video pipeline created on CPU.")
107
 
108
  if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
109
+ print("Creating latent upsampler on CPU...")
110
  latent_upsampler_instance = create_latent_upsampler(
111
  PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
112
+ device="cpu"
113
  )
114
+ print("Latent upsampler created on CPU.")
115
 
116
 
117
  def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
 
119
  ui_steps, num_frames_ui,
120
  ui_frames_to_use,
121
  seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
122
+ progress=gr.Progress(track_tqdm=True)):
123
+
124
+ target_inference_device = get_device()
125
+ print(f"Target inference device: {target_inference_device}")
126
 
127
  if randomize_seed:
128
  seed_ui = random.randint(0, 2**32 - 1)
 
132
  actual_width = int(width_ui)
133
  actual_num_frames = int(num_frames_ui)
134
 
 
135
  height_padded = ((actual_height - 1) // 32 + 1) * 32
136
  width_padded = ((actual_width - 1) // 32 + 1) * 32
137
  num_frames_padded = ((actual_num_frames - 2) // 8 + 1) * 8 + 1
 
141
  call_kwargs = {
142
  "prompt": prompt,
143
  "negative_prompt": negative_prompt,
144
+ "height": height_padded,
145
+ "width": width_padded,
146
+ "num_frames": num_frames_padded,
147
  "frame_rate": 30,
148
+ "generator": torch.Generator(device=target_inference_device).manual_seed(int(seed_ui)),
149
+ "output_type": "pt", # Crucial: pipeline will output [0,1] range tensors
150
  "conditioning_items": None,
151
  "media_items": None,
152
  "decode_timestep": PIPELINE_CONFIG_YAML["decode_timestep"],
153
  "decode_noise_scale": PIPELINE_CONFIG_YAML["decode_noise_scale"],
154
  "stochastic_sampling": PIPELINE_CONFIG_YAML["stochastic_sampling"],
155
+ "image_cond_noise_scale": 0.15,
156
+ "is_video": True,
157
+ "vae_per_channel_normalize": True,
158
  "mixed_precision": (PIPELINE_CONFIG_YAML["precision"] == "mixed_precision"),
159
+ "offload_to_cpu": False,
160
+ "enhance_prompt": False,
161
  }
162
 
163
  stg_mode_str = PIPELINE_CONFIG_YAML.get("stg_mode", "attention_values")
 
174
 
175
  if mode == "image-to-video" and input_image_filepath:
176
  try:
 
177
  media_tensor = load_image_to_tensor_with_resize_and_crop(
178
  input_image_filepath, actual_height, actual_width
179
  )
180
  media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
181
+ call_kwargs["conditioning_items"] = [ConditioningItem(media_tensor.to(target_inference_device), 0, 1.0)]
182
  except Exception as e:
183
  print(f"Error loading image {input_image_filepath}: {e}")
184
  raise gr.Error(f"Could not load image: {e}")
 
 
185
  elif mode == "video-to-video" and input_video_filepath:
186
  try:
187
  call_kwargs["media_items"] = load_media_file(
 
190
  width=actual_width,
191
  max_frames=int(ui_frames_to_use),
192
  padding=padding_values
193
+ ).to(target_inference_device)
194
  except Exception as e:
195
  print(f"Error loading video {input_video_filepath}: {e}")
196
  raise gr.Error(f"Could not load video: {e}")
197
+
198
+ print(f"Moving models to {target_inference_device} for inference...")
199
+ pipeline_instance.to(target_inference_device)
200
+ active_latent_upsampler = None
201
+ if improve_texture_flag and latent_upsampler_instance:
202
+ latent_upsampler_instance.to(target_inference_device)
203
+ active_latent_upsampler = latent_upsampler_instance
204
+ print("Models moved.")
205
+
206
+ result_images_tensor = None
207
+ try:
208
+ if improve_texture_flag:
209
+ if not active_latent_upsampler:
210
+ raise gr.Error("Spatial upscaler model not loaded or improve_texture not selected, cannot use multi-scale.")
211
+
212
+ multi_scale_pipeline_obj = LTXMultiScalePipeline(pipeline_instance, active_latent_upsampler)
213
+
214
+ first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
215
+ first_pass_args["guidance_scale"] = float(ui_guidance_scale)
216
+ if "timesteps" not in first_pass_args:
217
+ first_pass_args["num_inference_steps"] = int(ui_steps)
218
+
219
+ second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
220
+ second_pass_args["guidance_scale"] = float(ui_guidance_scale)
221
+
222
+ multi_scale_call_kwargs = call_kwargs.copy()
223
+ multi_scale_call_kwargs.update({
224
+ "downscale_factor": PIPELINE_CONFIG_YAML["downscale_factor"],
225
+ "first_pass": first_pass_args,
226
+ "second_pass": second_pass_args,
227
+ })
228
+
229
+ print(f"Calling multi-scale pipeline (eff. HxW: {actual_height}x{actual_width}) on {target_inference_device}")
230
+ result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
231
+ else:
232
+ single_pass_call_kwargs = call_kwargs.copy()
233
+ single_pass_call_kwargs["guidance_scale"] = float(ui_guidance_scale)
234
+ single_pass_call_kwargs["num_inference_steps"] = int(ui_steps)
235
+ single_pass_call_kwargs.pop("first_pass", None)
236
+ single_pass_call_kwargs.pop("second_pass", None)
237
+ single_pass_call_kwargs.pop("downscale_factor", None)
238
+
239
+ print(f"Calling base pipeline (padded HxW: {height_padded}x{width_padded}) on {target_inference_device}")
240
+ result_images_tensor = pipeline_instance(**single_pass_call_kwargs).images
241
 
242
+ finally:
243
+ print(f"Moving models back to CPU...")
244
+ pipeline_instance.to("cpu")
245
+ if active_latent_upsampler:
246
+ active_latent_upsampler.to("cpu")
247
 
248
+ if target_inference_device == "cuda":
249
+ torch.cuda.empty_cache()
250
+ print("Models moved back to CPU and cache cleared (if CUDA).")
251
+
252
+ if result_images_tensor is None:
253
+ raise gr.Error("Generation failed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
 
 
255
  pad_left, pad_right, pad_top, pad_bottom = padding_values
 
 
256
  slice_h_end = -pad_bottom if pad_bottom > 0 else None
257
  slice_w_end = -pad_right if pad_right > 0 else None
 
258
  result_images_tensor = result_images_tensor[
259
  :, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end
260
  ]
261
 
262
+ # The pipeline with output_type="pt" should return tensors in the [0, 1] range.
263
  video_np = result_images_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
264
+
265
+ # Clip to ensure values are indeed in [0, 1] before scaling to uint8
266
+ video_np = np.clip(video_np, 0, 1)
267
  video_np = (video_np * 255).astype(np.uint8)
268
 
269
  temp_dir = tempfile.mkdtemp()
270
+ timestamp = random.randint(10000,99999)
271
  output_video_path = os.path.join(temp_dir, f"output_{timestamp}.mp4")
272
 
273
  try:
 
276
  progress(frame_idx / video_np.shape[0], desc="Saving video")
277
  video_writer.append_data(video_np[frame_idx])
278
  except Exception as e:
279
+ print(f"Error saving video with macro_block_size=1: {e}")
 
280
  try:
281
+ with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], format='FFMPEG', codec='libx264', quality=8) as video_writer:
282
  for frame_idx in range(video_np.shape[0]):
283
+ progress(frame_idx / video_np.shape[0], desc="Saving video (fallback ffmpeg)")
284
  video_writer.append_data(video_np[frame_idx])
285
  except Exception as e2:
286
  print(f"Fallback video saving error: {e2}")
287
  raise gr.Error(f"Failed to save video: {e2}")
288
289
  if isinstance(input_image_filepath, tempfile._TemporaryFileWrapper):
290
+ if os.path.exists(input_image_filepath.name): # Check if it's already closed by Gradio
291
+ try:
292
+ input_image_filepath.close()
293
+ os.remove(input_image_filepath.name)
294
+ except: pass # May already be closed/removed
295
+ elif input_image_filepath and os.path.exists(input_image_filepath) and input_image_filepath.startswith(tempfile.gettempdir()):
296
+ try: os.remove(input_image_filepath) # If Gradio passed a path to a temp file
297
+ except: pass
298
+
299
  if isinstance(input_video_filepath, tempfile._TemporaryFileWrapper):
 
300
  if os.path.exists(input_video_filepath.name):
301
+ try:
302
+ input_video_filepath.close()
303
+ os.remove(input_video_filepath.name)
304
+ except: pass
305
+ elif input_video_filepath and os.path.exists(input_video_filepath) and input_video_filepath.startswith(tempfile.gettempdir()):
306
+ try: os.remove(input_video_filepath)
307
+ except: pass
308
 
309
  return output_video_path
310
 
311
+ # --- Gradio UI Definition ---
312
  css="""
313
  #col-container {
314
  margin: 0 auto;
 
316
  }
317
  """
318
 
319
+ with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo:
320
  gr.Markdown("# LTX Video 0.9.7 Distilled (using LTX-Video lib)")
321
+ gr.Markdown("Generates a short video based on text prompt, image, or existing video. Models are moved to GPU during generation and back to CPU afterwards to save VRAM.")
322
  with gr.Row():
323
  with gr.Column():
324
  with gr.Group():
325
  with gr.Tab("text-to-video") as text_tab:
 
326
  image_n_hidden = gr.Textbox(label="image_n", visible=False, value=None)
327
  video_n_hidden = gr.Textbox(label="video_n", visible=False, value=None)
328
  t2v_prompt = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
 
351
  seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
352
  randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=False)
353
  with gr.Row():
 
354
  guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
355
+ default_steps = len(PIPELINE_CONFIG_YAML.get("first_pass", {}).get("timesteps", [1]*7))
 
356
  steps_input = gr.Slider(label="Inference Steps (for first pass if multi-scale)", minimum=1, maximum=30, value=default_steps, step=1, info="Number of denoising steps. More steps can improve quality but increase time. If YAML defines 'timesteps' for a pass, this UI value is ignored for that pass.")
357
  with gr.Row():
358
  num_frames_input = gr.Slider(label="Number of Frames to Generate", minimum=9, maximum=MAX_NUM_FRAMES, value=25, step=8, info="Total frames in the output video. Should be N*8+1 (e.g., 9, 17, 25...).")
 
360
  height_input = gr.Slider(label="Height", value=512, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
361
  width_input = gr.Slider(label="Width", value=704, step=32, minimum=256, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
362
363
  t2v_inputs = [t2v_prompt, negative_prompt_input, image_n_hidden, video_n_hidden,
364
  height_input, width_input, gr.State("text-to-video"),
365
+ steps_input, num_frames_input, gr.State(0),
366
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
367
 
368
  i2v_inputs = [i2v_prompt, negative_prompt_input, image_i2v, video_i_hidden,
369
  height_input, width_input, gr.State("image-to-video"),
370
+ steps_input, num_frames_input, gr.State(0),
371
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
372
 
373
  v2v_inputs = [v2v_prompt, negative_prompt_input, image_v_hidden, video_v2v,
 
375
  steps_input, num_frames_input, frames_to_use,
376
  seed_input, randomize_seed_input, guidance_scale_input, improve_texture]
377
 
378
+ t2v_button.click(fn=generate, inputs=t2v_inputs, outputs=[output_video], api_name="text_to_video")
379
+ i2v_button.click(fn=generate, inputs=i2v_inputs, outputs=[output_video], api_name="image_to_video")
380
+ v2v_button.click(fn=generate, inputs=v2v_inputs, outputs=[output_video], api_name="video_to_video")
381
 
382
  if __name__ == "__main__":
 
383
  if os.path.exists(models_dir) and os.path.isdir(models_dir):
384
+ print(f"Model directory: {Path(models_dir).resolve()}")
 
 
385
 
386
  demo.queue().launch(debug=True, share=False)
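
The new version of generate() keeps both models on CPU at startup, moves them to the target device only for the duration of a request, and always moves them back (clearing the CUDA cache) in a finally block. A minimal sketch of that device-shuttling pattern, using plain torch.nn.Module placeholders rather than the actual LTX-Video pipeline classes:

import torch

def get_device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"

def run_on_device(pipeline, upsampler=None, **call_kwargs):
    device = get_device()
    pipeline.to(device)
    if upsampler is not None:
        upsampler.to(device)
    try:
        # Heavy generation work happens while the weights live on `device`.
        return pipeline(**call_kwargs)
    finally:
        # Always return the weights to CPU so the accelerator is free between requests.
        pipeline.to("cpu")
        if upsampler is not None:
            upsampler.to("cpu")
        if device == "cuda":
            torch.cuda.empty_cache()

# Hypothetical usage mirroring the app's generate() flow:
# result = run_on_device(pipeline_instance, latent_upsampler_instance, **call_kwargs)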