multimodalart (HF Staff) committed
Commit c103ac7 · verified · 1 Parent(s): cb85dbc

Update app.py

Files changed (1)
  1. app.py +84 -43
app.py CHANGED
@@ -1,11 +1,10 @@
  import torch
  from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, UniPCMultistepScheduler
- from diffusers.utils import export_to_video, load_image
+ from diffusers.utils import export_to_video
  from transformers import CLIPVisionModel
  import gradio as gr
  import tempfile
- import os
- import spaces # Assuming this is for Hugging Face Spaces GPU decorator
+ import spaces
  from huggingface_hub import hf_hub_download
  import logging
  import numpy as np
@@ -25,21 +24,21 @@ logger.info(f"Loading Image Encoder for {MODEL_ID}...")
  image_encoder = CLIPVisionModel.from_pretrained(
  MODEL_ID,
  subfolder="image_encoder",
- torch_dtype=torch.float32
+ torch_dtype=torch.float32 # Using float32 for image encoder as sometimes bfloat16/float16 can be problematic
  )

  logger.info(f"Loading VAE for {MODEL_ID}...")
  vae = AutoencoderKLWan.from_pretrained(
  MODEL_ID,
  subfolder="vae",
- torch_dtype=torch.float32
+ torch_dtype=torch.float32 # Using float32 for VAE for precision
  )
  logger.info(f"Loading Pipeline {MODEL_ID}...")
  pipe = WanImageToVideoPipeline.from_pretrained(
  MODEL_ID,
  vae=vae,
  image_encoder=image_encoder,
- torch_dtype=torch.bfloat16
+ torch_dtype=torch.bfloat16 # Main pipeline can use bfloat16 for speed/memory
  )
  flow_shift = 8.0
  pipe.scheduler = UniPCMultistepScheduler.from_config(
@@ -57,44 +56,68 @@ pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
  logger.info("Setting LoRA adapter...")
  pipe.set_adapters(["causvid_lora"], adapter_weights=[1.0])

- MOD_VALUE = 128
+ # --- Constants for Dimension Calculation ---
+ MOD_VALUE = 32
  MOD_VALUE_H = MOD_VALUE_W = MOD_VALUE

- DEFAULT_H_SLIDER_VALUE = 384 # (3 * 128)
- DEFAULT_W_SLIDER_VALUE = 640 # (5 * 128)
- DEFAULT_TARGET_AREA = float(DEFAULT_H_SLIDER_VALUE * DEFAULT_W_SLIDER_VALUE)
+ DEFAULT_H_SLIDER_VALUE = 512
+ DEFAULT_W_SLIDER_VALUE = 896
+
+ # New fixed max_area for the calculation formula
+ NEW_FORMULA_MAX_AREA = float(480 * 832)

  SLIDER_MIN_H = 128
- SLIDER_MAX_H = 512
+ SLIDER_MAX_H = 896
  SLIDER_MIN_W = 128
- SLIDER_MAX_W = 854
+ SLIDER_MAX_W = 896

- def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_area: float,
- min_h: int, max_h: int, min_w: int, max_w: int,
+ def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, calculation_max_area: float,
+ min_slider_h: int, max_slider_h: int,
+ min_slider_w: int, max_slider_w: int,
  default_h: int, default_w: int) -> tuple[int, int]:
  orig_w, orig_h = pil_image.size

- if orig_w == 0 or orig_h == 0:
- logger.warning("Uploaded image has zero width or height. Using default slider dimensions.")
+ if orig_w <= 0 or orig_h <= 0: # Changed to <= 0 for robustness
+ logger.warning(f"Uploaded image has non-positive width or height ({orig_w}x{orig_h}). Using default slider dimensions.")
  return default_h, default_w

  aspect_ratio = orig_h / orig_w

- ideal_h = np.sqrt(target_area * aspect_ratio)
- ideal_w = np.sqrt(target_area / aspect_ratio)
+ # New calculation logic as per user request:
+ # height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+ # width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+
+ # Calculate sqrt terms
+ sqrt_h_term = np.sqrt(calculation_max_area * aspect_ratio)
+ sqrt_w_term = np.sqrt(calculation_max_area / aspect_ratio)
+
+ # Apply the formula: round(sqrt_term) then floor_division by mod_val, then multiply by mod_val
+ calc_h = round(sqrt_h_term) // mod_val * mod_val
+ calc_w = round(sqrt_w_term) // mod_val * mod_val

- calc_h = round(ideal_h / mod_val) * mod_val
- calc_w = round(ideal_w / mod_val) * mod_val
+ # Ensure calculated dimensions are at least mod_val (since round(...) // mod_val * mod_val can yield 0 if round(sqrt_term) < mod_val)
+ calc_h = mod_val if calc_h < mod_val else calc_h
+ calc_w = mod_val if calc_w < mod_val else calc_w

- calc_h = mod_val if calc_h < mod_val else calc_h # Ensure at least one mod_val unit
- calc_w = mod_val if calc_w < mod_val else calc_w # Ensure at least one mod_val unit
+ # Determine effective min/max dimensions from slider limits, ensuring they are multiples of mod_val.
+ # Slider min values (min_slider_h, min_slider_w) are assumed to be multiples of mod_val.
+ effective_min_h = min_slider_h
+ effective_min_w = min_slider_w

- new_h = int(np.clip(calc_h, min_h, max_h))
- new_w = int(np.clip(calc_w, min_w, max_w))
+ # Slider max values (max_slider_h, max_slider_w) might not be multiples of mod_val.
+ # The actual maximum value a slider can output is (its_max_limit // mod_val) * mod_val.
+ effective_max_h_from_slider = (max_slider_h // mod_val) * mod_val
+ effective_max_w_from_slider = (max_slider_w // mod_val) * mod_val

- logger.info(f"Auto-dim: Original {orig_w}x{orig_h} (AR: {aspect_ratio:.2f}). Target Area: {target_area}.")
- logger.info(f"Auto-dim: Ideal HxW: {ideal_h:.0f}x{ideal_w:.0f}. Rounded (step {mod_val}): {calc_h}x{calc_w}.")
- logger.info(f"Auto-dim: Clamped HxW: {new_h}x{new_w} (H_range:[{min_h}-{max_h}], W_range:[{min_w}-{max_w}]).")
+ # Clip calc_h and calc_w (which are already multiples of mod_val)
+ # to the effective slider range (which are also multiples of mod_val).
+ # The results (new_h, new_w) will therefore also be multiples of mod_val.
+ new_h = int(np.clip(calc_h, effective_min_h, effective_max_h_from_slider))
+ new_w = int(np.clip(calc_w, effective_min_w, effective_max_w_from_slider))
+
+ logger.info(f"Auto-dim: Original {orig_w}x{orig_h} (AR: {aspect_ratio:.2f}). Max Area for calc: {calculation_max_area}.")
+ logger.info(f"Auto-dim: Sqrt terms HxW: {sqrt_h_term:.0f}x{sqrt_w_term:.0f}. Calculated (round(sqrt_term)//{mod_val}*{mod_val}): {calc_h}x{calc_w}.")
+ logger.info(f"Auto-dim: Clamped HxW: {new_h}x{new_w} (Effective H_range:[{effective_min_h}-{effective_max_h_from_slider}], Effective W_range:[{effective_min_w}-{effective_max_w_from_slider}]).")

  return new_h, new_w

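The hunk above swaps the old round-to-nearest sizing for a floor-to-multiple rule driven by a fixed target area. Below is a minimal standalone sketch of that rule; the helper name suggest_dims and the 1280x720 input are illustrative, while the constants and slider bounds are the ones from this diff.

import numpy as np

MOD_VALUE = 32
NEW_FORMULA_MAX_AREA = float(480 * 832)  # 399360 target pixels

def suggest_dims(orig_w: int, orig_h: int,
                 mod_val: int = MOD_VALUE,
                 max_area: float = NEW_FORMULA_MAX_AREA) -> tuple[int, int]:
    # Same formula as the hunk: round the ideal edge, then floor it to a multiple of mod_val.
    aspect_ratio = orig_h / orig_w
    calc_h = round(np.sqrt(max_area * aspect_ratio)) // mod_val * mod_val
    calc_w = round(np.sqrt(max_area / aspect_ratio)) // mod_val * mod_val
    # Keep at least one mod_val unit, then clamp to the slider range used in the app (128-896).
    calc_h = max(mod_val, calc_h)
    calc_w = max(mod_val, calc_w)
    return int(np.clip(calc_h, 128, 896)), int(np.clip(calc_w, 128, 896))

print(suggest_dims(1280, 720))  # (448, 832): both multiples of 32, and 448 * 832 = 372736 <= 399360
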
@@ -105,8 +128,8 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
  try:
  new_h, new_w = _calculate_new_dimensions_wan(
  uploaded_pil_image,
- MOD_VALUE, # Use the globally determined MOD_VALUE
- DEFAULT_TARGET_AREA,
+ MOD_VALUE,
+ NEW_FORMULA_MAX_AREA, # Use the globally defined max_area for the new formula
  SLIDER_MIN_H, SLIDER_MAX_H,
  SLIDER_MIN_W, SLIDER_MAX_W,
  DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
@@ -114,11 +137,12 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
  return gr.update(value=new_h), gr.update(value=new_w)
  except Exception as e:
  logger.error(f"Error auto-adjusting H/W from image: {e}", exc_info=True)
+ # Fallback to default slider values on error, as in the original code
  return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)


  # --- Gradio Interface Function ---
- @spaces.GPU # type: ignore
+ @spaces.GPU
  def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
  height: int, width: int, num_frames: int,
  guidance_scale: float, steps: int, fps_for_conditioning_and_export: int,
@@ -141,16 +165,21 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
  guidance_scale_val = float(guidance_scale)
  steps_val = int(steps)

- # Ensure dimensions are compatible (already handled by slider steps and auto-adjustment)
+ # Ensure dimensions are compatible.
+ # With the updated _calculate_new_dimensions_wan, height and width from sliders
+ # (after image upload auto-adjustment) should already be multiples of MOD_VALUE.
+ # This block acts as a safeguard if values come from direct slider interaction
+ # before an image upload, or if something unexpected happens.
  if target_height % MOD_VALUE_H != 0:
  logger.warning(f"Height {target_height} is not a multiple of {MOD_VALUE_H}. Adjusting...")
  target_height = (target_height // MOD_VALUE_H) * MOD_VALUE_H
  if target_width % MOD_VALUE_W != 0:
  logger.warning(f"Width {target_width} is not a multiple of {MOD_VALUE_W}. Adjusting...")
  target_width = (target_width // MOD_VALUE_W) * MOD_VALUE_W
-
- target_height = max(MOD_VALUE_H, target_height) # Ensure minimum size
- target_width = max(MOD_VALUE_W, target_width) # Ensure minimum size
+
+ # Ensure minimum size (should already be handled by _calculate_new_dimensions_wan and slider mins)
+ target_height = max(MOD_VALUE_H, target_height if target_height > 0 else MOD_VALUE_H)
+ target_width = max(MOD_VALUE_W, target_width if target_width > 0 else MOD_VALUE_W)


  resized_image = input_image.resize((target_width, target_height))
@@ -166,9 +195,10 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
  num_frames=num_frames,
  guidance_scale=guidance_scale_val,
  num_inference_steps=steps_val,
- generator=torch.Generator(device="cuda").manual_seed(0)
+ generator=torch.Generator(device="cuda").manual_seed(0) # Consider making seed configurable
  ).frames[0]

+ # Using a temporary file for video export
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
  video_path = tmpfile.name

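The export call itself lies outside this hunk; below is a rough sketch of how the frames returned by the pipeline are typically written to that temporary path with diffusers' export_to_video. The helper name save_frames_to_mp4 is illustrative, not part of the commit.

import tempfile
from diffusers.utils import export_to_video

def save_frames_to_mp4(frames, fps: int) -> str:
    # Reserve a named .mp4 path, then let export_to_video write the frames to it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(frames, video_path, fps=fps)
    return video_path
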
@@ -187,10 +217,12 @@ with gr.Blocks() as demo:
  Powered by `diffusers` and `{MODEL_ID}`.
  Model is loaded into memory when the app starts. This might take a few minutes.
  Ensure you have a GPU with sufficient VRAM (e.g., ~24GB+ for these default settings).
- Output Height and Width must be multiples of **{MOD_VALUE}**. Uploading an image will suggest dimensions based on its aspect ratio and a target area.
+ Output Height and Width will be multiples of **{MOD_VALUE}**.
+ Uploading an image will suggest dimensions based on its aspect ratio and a pre-defined target pixel area ({NEW_FORMULA_MAX_AREA:.0f} pixels),
+ clamped to slider limits.
  """)
  with gr.Row():
- with gr.Column(scale=2):
+ with gr.Column():
  input_image_component = gr.Image(type="pil", label="Input Image (will be resized to target H/W)")
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v, lines=3)

@@ -204,22 +236,30 @@ with gr.Blocks() as demo:
  height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
  width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
  with gr.Row():
- num_frames_input = gr.Slider(minimum=8, maximum=81, step=1, value=41, label="Number of Frames")
+ num_frames_input = gr.Slider(minimum=8, maximum=81, step=1, value=41, label="Number of Frames") # Max 81 for this model
  fps_input = gr.Slider(minimum=5, maximum=30, step=1, value=24, label="FPS (for conditioning & export)")
- steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps")
- guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale")
+ steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps") # WanI2V is good with few steps
+ guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale") # Low CFG usually better for I2V

  generate_button = gr.Button("Generate Video", variant="primary")

- with gr.Column(scale=3):
+ with gr.Column():
  video_output = gr.Video(label="Generated Video", interactive=False)

+ # Connect image upload to dimension auto-adjustment
  input_image_component.upload(
+ fn=handle_image_upload_for_dims_wan,
+ inputs=[input_image_component, height_input, width_input], # Pass current slider values for fallback on error
+ outputs=[height_input, width_input]
+ )
+ # Also trigger on clear, though handle_image_upload_for_dims_wan handles None input
+ input_image_component.clear(
  fn=handle_image_upload_for_dims_wan,
  inputs=[input_image_component, height_input, width_input],
  outputs=[height_input, width_input]
  )

+
  inputs_for_click_and_examples = [
  input_image_component,
  prompt_input,
@@ -240,12 +280,13 @@ with gr.Blocks() as demo:

  gr.Examples(
  examples=[
- [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16],
+ [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 41, 1.0, 4, 24],
+ ["https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0001.jpg", "the frog jumps around", default_negative_prompt, 384, 640, 60, 1.0, 4, 24],
  ],
  inputs=inputs_for_click_and_examples,
  outputs=video_output,
  fn=generate_video,
- cache_examples=False
+ cache_examples="lazy"
  )

  if __name__ == "__main__":