jbilcke-hf (HF Staff) committed
Commit 3a23852 · Parent(s): b2c19b1
Files changed (1): app.py (+8 -2)
app.py CHANGED
@@ -94,10 +94,16 @@ APP_STATE = {
 # I've tried to enable it, but I didn't notice a significant performance improvement..
 ENABLE_TORCH_COMPILATION = False
 
+# “default”: The default mode, used when no mode parameter is specified. It provides a good balance between performance and overhead.
+# “reduce-overhead”: Minimizes Python-related overhead using CUDA graphs. However, it may increase memory usage.
+# “max-autotune”: Uses Triton or template-based matrix multiplications on supported devices. It takes longer to compile but optimizes for the fastest possible execution. On GPUs it enables CUDA graphs by default.
+# “max-autotune-no-cudagraphs”: Similar to “max-autotune”, but without CUDA graphs.
+TORCH_COMPILATION_MODE = "default"
+
 # Apply torch.compile for maximum performance
 if not APP_STATE["torch_compile_applied"] and ENABLE_TORCH_COMPILATION:
     print("🚀 Applying torch.compile for speed optimization...")
-    transformer.compile(mode="max-autotune-no-cudagraphs")
+    transformer.compile(mode=TORCH_COMPILATION_MODE)
     APP_STATE["torch_compile_applied"] = True
     print("✅ torch.compile applied to transformer")
 
@@ -199,7 +205,7 @@ def initialize_vae_decoder(use_taehv=False, use_trt=False):
     # Apply torch.compile to VAE decoder if enabled (following demo.py pattern)
     if APP_STATE["torch_compile_applied"] and not use_taehv and not use_trt:
         print("🚀 Applying torch.compile to VAE decoder...")
-        vae_decoder.compile(mode="max-autotune-no-cudagraphs")
+        vae_decoder.compile(mode=TORCH_COMPILATION_MODE)
         print("✅ torch.compile applied to VAE decoder")
 
     APP_STATE["current_vae_decoder"] = vae_decoder