Commit 257dc46
Parent(s): 6373d0a
experimenting stuff

app.py CHANGED
@@ -57,6 +57,56 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 DEFAULT_WIDTH = 832
 DEFAULT_HEIGHT = 480

+def create_vae_cache_for_resolution(latent_height, latent_width, device, dtype):
+    """
+    Create VAE cache tensors dynamically based on the latent resolution.
+    The cache structure mirrors ZERO_VAE_CACHE but with resolution-dependent dimensions.
+    """
+    # Scale dimensions based on latent resolution
+    # The original cache assumes 832x480 -> 104x60 latent dimensions
+    # We need to scale proportionally for other resolutions
+
+    cache = [
+        torch.zeros(1, 16, 2, latent_height, latent_width, device=device, dtype=dtype),
+        # First set of 384-channel caches at latent resolution
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height, latent_width, device=device, dtype=dtype),
+        # Second set at 2x upsampled resolution
+        torch.zeros(1, 192, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        torch.zeros(1, 384, 2, latent_height * 2, latent_width * 2, device=device, dtype=dtype),
+        # Third set at 4x upsampled resolution
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        torch.zeros(1, 192, 2, latent_height * 4, latent_width * 4, device=device, dtype=dtype),
+        # Fourth set at 8x upsampled resolution (final output resolution)
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype),
+        torch.zeros(1, 96, 2, latent_height * 8, latent_width * 8, device=device, dtype=dtype)
+    ]
+
+    return cache
+
 # --- Argument Parsing ---
 parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")
 parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
@@ -290,7 +340,8 @@ def video_generation_handler(prompt, seed=42, fps=15, width=DEFAULT_WIDTH, heigh

     vae_cache, latents_cache = None, None
     if not APP_STATE["current_use_taehv"] and not args.trt:
-
+        # Create resolution-dependent VAE cache
+        vae_cache = create_vae_cache_for_resolution(latent_height, latent_width, device=gpu, dtype=torch.float16)

     # Calculate number of blocks based on duration
     # Current setup generates approximately 5 seconds with 7 blocks