LPX55 committed
Commit f8d314f · verified · 1 Parent(s): 20c7e22

Update app.py

Files changed (1):
app.py (+63 −57)
app.py CHANGED
@@ -27,7 +27,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.hunyuan_video.pipeline_output import HunyuanVideoPipelineOutput
 from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import retrieve_timesteps, DEFAULT_PROMPT_TEMPLATE
 from diffusers.utils import load_image
- from huggingface_hub import hf_hub_download
+ from huggingface_hub import hf_hub_download
 import requests
 import io

@@ -38,8 +38,10 @@ video_transforms = transforms.Compose(
         transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
     ]
 )
+
 model_id = "hunyuanvideo-community/HunyuanVideo"
 lora_path = hf_hub_download("dashtoon/hunyuan-video-keyframe-control-lora", "i2v.sft")  # Replace with the actual LORA path
+
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
 global pipe
 pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
@@ -64,6 +66,7 @@ with torch.no_grad(): # enable image inputs

     lora_state_dict = safetensors.torch.load_file(lora_path, device="cpu")
     transformer_lora_state_dict = {f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") and "lora" in k}
+
     pipe.load_lora_into_transformer(transformer_lora_state_dict, transformer=pipe.transformer, adapter_name="i2v", _pipeline=pipe)
     pipe.set_adapters(["i2v"], adapter_weights=[1.0])
     pipe.fuse_lora(components=["transformer"], lora_scale=1.0, adapter_names=["i2v"])
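Aside: the dict comprehension above keeps only the transformer LoRA tensors from `i2v.sft` and strips their `transformer.` prefix before they are loaded and fused. A hypothetical standalone inspection snippet (not part of this commit; it reuses the same repo id and filename shown above) to see what that filter keeps:

```python
# Hypothetical inspection helper -- not part of app.py.
import safetensors.torch
from huggingface_hub import hf_hub_download

lora_path = hf_hub_download("dashtoon/hunyuan-video-keyframe-control-lora", "i2v.sft")
state_dict = safetensors.torch.load_file(lora_path, device="cpu")

# Same filter as app.py: keep transformer.* keys that contain "lora",
# dropping the "transformer." prefix so they match the transformer's parameter names.
kept = {
    k.replace("transformer.", ""): v
    for k, v in state_dict.items()
    if k.startswith("transformer.") and "lora" in k
}
print(f"keeping {len(kept)} of {len(state_dict)} tensors")
for name, tensor in list(kept.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)
```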
@@ -77,7 +80,6 @@ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: T
         image = np.array(image)
     elif not isinstance(image, np.ndarray):
         raise ValueError("Image must be a PIL Image or NumPy array")
-
     image_height, image_width = image.shape[:2]
     if bucket_reso == (image_width, image_height):
         return image
@@ -99,17 +101,14 @@ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: T
     image = image[crop_top:crop_top + bucket_height, crop_left:crop_left + bucket_width]
     return image

-
 @spaces.GPU(duration=120)
 def generate_video(prompt: str, frame1: Image.Image, frame2: Image.Image, resolution: str, guidance_scale: float, num_frames: int, num_inference_steps: int) -> bytes:
     # Debugging print statements
     print(f"Frame 1 Type: {type(frame1)}")
     print(f"Frame 2 Type: {type(frame2)}")
     print(f"Resolution: {resolution}")
-
     # Parse resolution
     width, height = map(int, resolution.split('x'))
-
     # Load and preprocess frames
     cond_frame1 = np.array(frame1)
     cond_frame2 = np.array(frame2)
@@ -139,12 +138,12 @@ def generate_video(prompt: str, frame1: Image.Image, frame2: Image.Image, resolu
         generator=torch.Generator(device="cuda").manual_seed(0),
     ).frames[0]
     # Export to video
+     # TO-DO: Implement alternate method
     video_path = "output.mp4"
-     # video_bytes = io.BytesIO()
     export_to_video(video, video_path, fps=24)
     torch.cuda.empty_cache()
     return video_path
-
+
 @torch.inference_mode()
 def call_pipe(
     pipe,
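Note: the clip length follows directly from the frame count and the `fps=24` passed to `export_to_video` above; with the UI default of 49 frames (set in the slider further down) that is roughly a two-second clip:

```python
# Quick arithmetic using values from this diff: fps=24 above, default of 49 frames below.
num_frames = 49
fps = 24
print(f"approx. clip length: {num_frames / fps:.2f} s")  # ~2.04 s
```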
@@ -159,7 +158,7 @@ def call_pipe(
     num_videos_per_prompt: Optional[int] = 1,
     generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
     latents: Optional[torch.Tensor] = None,
-     prompt_embeds: Optional[torch.Tensor] = None,
+     prompt_embeds: Optional[torch.Tensor] = None,
     pooled_prompt_embeds: Optional[torch.Tensor] = None,
     prompt_attention_mask: Optional[torch.Tensor] = None,
     output_type: Optional[str] = "pil",
@@ -173,7 +172,6 @@
 ):
     if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
         callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-
     # 1. Check inputs. Raise error if not correct
     pipe.check_inputs(
         prompt,
@@ -184,13 +182,11 @@
         callback_on_step_end_tensor_inputs,
         prompt_template,
     )
-
     pipe._guidance_scale = guidance_scale
     pipe._attention_kwargs = attention_kwargs
     pipe._current_timestep = None
     pipe._interrupt = False
     device = pipe._execution_device
-
     # 2. Define call parameters
     if prompt is not None and isinstance(prompt, str):
         batch_size = 1
@@ -198,7 +194,6 @@
         batch_size = len(prompt)
     else:
         batch_size = prompt_embeds.shape[0]
-
     # 3. Encode input prompt
     prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
         prompt=prompt,
@@ -211,13 +206,11 @@
         device=device,
         max_sequence_length=max_sequence_length,
     )
-
     transformer_dtype = pipe.transformer.dtype
     prompt_embeds = prompt_embeds.to(transformer_dtype)
     prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
     if pooled_prompt_embeds is not None:
         pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
-
     # 4. Prepare timesteps
     sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
     timesteps, num_inference_steps = retrieve_timesteps(
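Note: the default `sigmas` schedule above is a plain linear ramp from 1.0 toward 0.0 with the trailing zero dropped, so it is easy to see what gets handed to `retrieve_timesteps` for a small step count:

```python
import numpy as np

num_inference_steps = 4
sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1]
print(sigmas)  # [1.   0.75 0.5  0.25]
```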
@@ -226,7 +219,6 @@
         device,
         sigmas=sigmas,
     )
-
     # 5. Prepare latent variables
     num_channels_latents = pipe.transformer.config.in_channels
     num_latent_frames = (num_frames - 1) // pipe.vae_scale_factor_temporal + 1
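Note: the latent frame count above is why frame counts of the form 4k + 1 (such as the UI default of 49) are a natural fit. Assuming `vae_scale_factor_temporal` is 4 for HunyuanVideo (an assumption, not stated in this diff), the arithmetic works out as:

```python
# Assumes vae_scale_factor_temporal == 4; adjust if the pipeline reports a different value.
num_frames = 49
vae_scale_factor_temporal = 4
num_latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
print(num_latent_frames)  # 13
```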
@@ -241,17 +233,14 @@
         generator,
         latents,
     )
-
     # 6. Prepare guidance condition
     guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
-
     # 7. Denoising loop
     num_warmup_steps = len(timesteps) - num_inference_steps * pipe.scheduler.order
     pipe._num_timesteps = len(timesteps)
-     pipe.text_encoder.to("cpu")
+     pipe.text_encoder.to("cpu")
     pipe.text_encoder_2.to("cpu")
-     torch.cuda.empty_cache()
-
+     torch.cuda.empty_cache()
     with pipe.progress_bar(total=num_inference_steps) as progress_bar:
         for i, t in enumerate(timesteps):
             if pipe.interrupt:
@@ -269,10 +258,8 @@
                 attention_kwargs=attention_kwargs,
                 return_dict=False,
             )[0]
-
             # compute the previous noisy sample x_t -> x_t-1
             latents = pipe.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-
             if callback_on_step_end is not None:
                 callback_kwargs = {}
                 for k in callback_on_step_end_tensor_inputs:
@@ -280,11 +267,9 @@
                 callback_outputs = callback_on_step_end(pipe, i, t, callback_kwargs)
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-
             # call the callback, if provided
-             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipe.scheduler.order == 0):
+             if i < len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipe.scheduler.order == 0):
                 progress_bar.update()
-
     pipe._current_timestep = None
     if not output_type == "latent":
         latents = latents.to(pipe.vae.dtype) / pipe.vae.config.scaling_factor
@@ -292,48 +277,69 @@
         video = pipe.video_processor.postprocess_video(video, output_type=output_type)
     else:
         video = latents
-
     # Offload all models
     pipe.maybe_free_model_hooks()
-
     if not return_dict:
         return (video,)
     return HunyuanVideoPipelineOutput(frames=video)

-
 def main():
     # Define the interface inputs
-     inputs = [
-         gr.Textbox(label="Prompt", value="a woman"),
-         gr.Image(label="Frame 1", type="pil"),
-         gr.Image(label="Frame 2", type="pil"),
-         gr.Dropdown(
-             label="Resolution",
-             choices=["720x1280", "544x960", "1280x720", "960x544", "720x720"],
-             value="544x960"
-         ),
-         gr.Slider(minimum=0.1, maximum=20, step=0.1, label="Guidance Scale", value=6.0),
-         gr.Slider(minimum=1, maximum=129, step=1, label="Number of Frames", value=49),
-         gr.Slider(minimum=1, maximum=100, step=1, label="Number of Inference Steps", value=30)
-     ]
-
+     prompt_textbox = gr.Textbox(label="Prompt", value="a subject ... ")
+     frame1 = gr.Image(label="Frame 1", type="pil")
+     frame2 = gr.Image(label="Frame 2", type="pil")
+     resolution = gr.Dropdown(
+         label="Resolution",
+         choices=["720x1280", "544x960", "1280x720", "960x544", "720x720"],
+         value="544x960"
+     )
+     guidance_scale = gr.Slider(minimum=0.1, maximum=20, step=0.1, label="Guidance Scale", value=6.0)
+     num_frames = gr.Slider(minimum=1, maximum=129, step=1, label="Number of Frames", value=49)
+     num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, label="Number of Inference Steps", value=30)
+
     # Define the interface outputs
-     outputs = [
-         gr.Video(label="Generated Video"),
-     ]
+     outputs = gr.Video(label="Generated Video")

-
-     # Create the Gradio interface
-     iface = gr.Interface(
-         fn=generate_video,
-         inputs=inputs,
-         outputs=outputs,
-         title="HunyuanVideo Keyframe IMG+IMG2VID Control Lora",
-         description="Generate videos using the HunyuanVideo model with a prompt and two frames as conditions. Gradio / HF Spaces implementation demo.",
-     )
-
-     # Launch the Gradio app
-     iface.launch(show_error=True)
+     with gr.Blocks() as demo:
+         gr.Markdown("# HunyuanVideo Keyframes Gen Control Lora")
+         gr.Markdown("Generate videos using the HunyuanVideo model with a prompt and two (or more) frames as conditions. Gradio / HF Spaces implementation demo.")
+         gr.Markdown("Unfortunately this is still difficult to run on ZeroGPU Spaces, but it is getting closer. **DUPLICATE THE SPACE** and select hardware with more VRAM. I will fill out a request for a GPU allocation for the demo with HF shortly.")
+         gr.Markdown("For more technical information, check out the [original repo by dashtoon](https://huggingface.co/dashtoon/hunyuan-video-keyframe-control-lora). Special shoutout to @pftq for work on optimization and ideas.")
+
+         with gr.Row():
+             with gr.Column():
+                 prompt_textbox
+                 resolution
+             with gr.Column():
+                 guidance_scale
+                 num_frames
+
+         with gr.Row():
+             frame1
+             frame2
+
+         with gr.Row():
+             num_inference_steps
+             outputs
+
+         generate_button = gr.Button("Generate Video")
+         generate_button.click(generate_video, inputs=[prompt_textbox, frame1, frame2, resolution, guidance_scale, num_frames, num_inference_steps], outputs=outputs)
+
+         gr.Markdown("""
+ @dashtoon: HunyuanVideo Keyframe Control Lora is an adapter for the HunyuanVideo T2V model for keyframe-based video generation. Our architecture builds upon existing models, introducing key enhancements to optimize keyframe-based video generation:
+ * We modify the input patch embedding projection layer to effectively incorporate keyframe information. By adjusting the convolutional input parameters, we enable the model to process image inputs within the Diffusion Transformer (DiT) framework.
+ * We apply Low-Rank Adaptation (LoRA) across all linear layers and the convolutional input layer. This approach facilitates efficient fine-tuning by introducing low-rank matrices that approximate the weight updates, thereby preserving the base model's foundational capabilities while reducing the number of trainable parameters.
+ * The model is conditioned on user-defined keyframes, allowing precise control over the generated video's start and end frames. This conditioning ensures that the generated content aligns seamlessly with the specified keyframes, enhancing the coherence and narrative flow of the video.
+
+ | Image 1 | Image 2 | Generated Video |
+ |---------|---------|-----------------|
+ | ![Image 1](https://content.dashtoon.ai/stability-images/41aeca63-064a-4003-8c8b-bfe2cc80d275.png) | ![Image 2](https://content.dashtoon.ai/stability-images/28956177-3455-4b56-bb6c-73eacef323ca.png) | <video controls autoplay src="https://content.dashtoon.ai/stability-images/14b7dd1a-1f46-4c4c-b4ec-9d0f948712af.mp4"></video> |
+ | ![Image 1](https://content.dashtoon.ai/stability-images/ddabbf2f-4218-497b-8239-b7b882d93000.png) | ![Image 2](https://content.dashtoon.ai/stability-images/b603acba-40a4-44ba-aa26-ed79403df580.png) | <video controls autoplay src="https://content.dashtoon.ai/stability-images/b00ba193-b3b7-41a1-9bc1-9fdaceba6efa.mp4"></video> |
+ | ![Image 1](https://content.dashtoon.ai/stability-images/5298cf0c-0955-4568-935a-2fb66045f21d.png) | ![Image 2](https://content.dashtoon.ai/stability-images/722a4ea7-7092-4323-8e83-3f627e8fd7f8.png) | <video controls autoplay src="https://content.dashtoon.ai/stability-images/0cb84780-4fdf-4ecc-ab48-12e7e1055a39.mp4"></video> |
+ | ![Image 1](https://content.dashtoon.ai/stability-images/69d9a49f-95c0-4e85-bd49-14a039373c8b.png) | ![Image 2](https://content.dashtoon.ai/stability-images/0cef7fa9-e15a-48ec-9bd3-c61921181802.png) | <video controls autoplay src="https://content.dashtoon.ai/stability-images/ce12156f-0ac2-4d16-b489-37e85c61b5b2.mp4"></video> |
+ """)
+
+     demo.launch(show_error=True)

 if __name__ == "__main__":
     main()
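Aside on the architecture notes quoted in the new UI: they describe LoRA as low-rank matrices that approximate the weight updates of the linear layers. As a generic illustration of that idea only (the class name, rank, alpha, and the 3072 width are illustrative, and this is not the dashtoon adapter's or diffusers' actual module layout), a LoRA-wrapped linear layer can be sketched as:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Generic LoRA sketch: y = W x + (alpha / r) * B(A(x)), with W frozen."""
    def __init__(self, base: nn.Linear, r: int = 16, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                                    # pretrained weight stays frozen
        self.lora_down = nn.Linear(base.in_features, r, bias=False)    # A: project to rank r
        self.lora_up = nn.Linear(r, base.out_features, bias=False)     # B: project back up
        nn.init.zeros_(self.lora_up.weight)                            # start as a no-op
        self.scale = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))

layer = LoRALinear(nn.Linear(3072, 3072))
# Only the low-rank factors are trainable.
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))
```

At inference time, `pipe.fuse_lora(...)` in the diff above folds this low-rank update into the base weights, so no extra matrix multiplications are paid per denoising step.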