Spaces:

multimodalart
/

self-forcing

Running on Zero

App Files Files Community

multimodalart HF Staff committed 29 days ago

Commit

481a175

verified ·

1 Parent(s): 6e67c38

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -37

app.py CHANGED Viewed

@@ -87,8 +87,6 @@ APP_STATE = {
 }
 def initialize_vae_decoder(use_taehv=False, use_trt=False):
-    global APP_STATE
     if use_trt:
         from demo_utils.vae import VAETRTWrapper
         print("Initializing TensorRT VAE Decoder...")
@@ -138,6 +136,13 @@ def initialize_vae_decoder(use_taehv=False, use_trt=False):
 # Initialize with default VAE
 initialize_vae_decoder(use_taehv=False, use_trt=args.trt)
 # --- Additional Outputs Handler ---
 def handle_additional_outputs(status_html_update, video_update, webrtc_output):
     return status_html_update, video_update, webrtc_output
@@ -145,41 +150,17 @@ def handle_additional_outputs(status_html_update, video_update, webrtc_output):
 # --- FastRTC Video Generation Handler ---
 @torch.no_grad()
 @spaces.GPU
-def video_generation_handler(prompt, seed, enable_torch_compile, enable_fp8, use_taehv, progress=gr.Progress()):
     """
     Generator function that yields BGR NumPy frames for real-time streaming.
     Returns cleanly when done - no infinite loops.
     """
-    global APP_STATE
     if seed == -1:
         seed = random.randint(0, 2**32 - 1)
     print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
-    # --- Model & Pipeline Configuration ---
-    if use_taehv != APP_STATE["current_use_taehv"]:
-        print(f"🔄 Switching VAE to {'TAEHV' if use_taehv else 'Default VAE'}")
-        initialize_vae_decoder(use_taehv=use_taehv, use_trt=args.trt)
-    pipeline = CausalInferencePipeline(
-        config, device=gpu, generator=transformer, text_encoder=text_encoder,
-        vae=APP_STATE["current_vae_decoder"]
-    )
-    if enable_fp8 and not APP_STATE["fp8_applied"]:
-        print("⚡ Applying FP8 Quantization...")
-        from torchao.quantization.quant_api import quantize_, Float8DynamicActivationFloat8Weight, PerTensor
-        quantize_(pipeline.generator.model, Float8DynamicActivationFloat8Weight(granularity=PerTensor()))
-        APP_STATE["fp8_applied"] = True
-    if enable_torch_compile and not APP_STATE["torch_compile_applied"]:
-        print("🔥 Applying torch.compile (this may take a few minutes)...")
-        pipeline.generator.model = torch.compile(pipeline.generator.model, mode="max-autotune-no-cudagraphs")
-        if not use_taehv and not LOW_MEMORY and not args.trt:
-            pipeline.vae.decoder = torch.compile(pipeline.vae.decoder, mode="max-autotune-no-cudagraphs")
-        APP_STATE["torch_compile_applied"] = True
     print("🔤 Encoding text prompt...")
     conditional_dict = text_encoder(text_prompts=[prompt])
     for key, value in conditional_dict.items():
@@ -187,14 +168,14 @@ def video_generation_handler(prompt, seed, enable_torch_compile, enable_fp8, use
     # --- Generation Loop ---
     rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, gpu)
     noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
     vae_cache, latents_cache = None, None
     if not APP_STATE["current_use_taehv"] and not args.trt:
         vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
     num_blocks = 7
     current_start_frame = 0
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
@@ -303,7 +284,6 @@ def video_generation_handler(prompt, seed, enable_torch_compile, enable_fp8, use
                 status_html = (
                     f"<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; font-family: sans-serif;'>"
                     f"  <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>Generating Video...</p>"
-                    # Correctly implemented progress bar
                     f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
                     f"    <div style='width: {frame_progress:.1f}%; height: 20px; background-color: #0d6efd; transition: width 0.2s;'></div>"
                     f"  </div>"
@@ -352,11 +332,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing FastRTC Demo") as dem
             with gr.Accordion("⚙️ Performance Options", open=False):
                 gr.Markdown("*These optimizations are applied once per session*")
-                with gr.Row():
-                    torch_compile_toggle = gr.Checkbox(label="🔥 torch.compile", value=False)
-                    fp8_toggle = gr.Checkbox(label="⚡ FP8 Quantization", value=False, visible=not args.trt)
-                    taehv_toggle = gr.Checkbox(label="⚡ TAEHV VAE", value=False, visible=not args.trt)
             start_btn = gr.Button("🎬 Start Generation", variant="primary", size="lg")
         with gr.Column(scale=3):
@@ -385,7 +361,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing FastRTC Demo") as dem
     # Connect the generator to the WebRTC stream
     webrtc_output.stream(
         fn=video_generation_handler,
-        inputs=[prompt, seed, torch_compile_toggle, fp8_toggle, taehv_toggle],
         outputs=[webrtc_output],
         time_limit=300,  # 5 minutes max
         trigger=start_btn.click,

 }
 def initialize_vae_decoder(use_taehv=False, use_trt=False):
     if use_trt:
         from demo_utils.vae import VAETRTWrapper
         print("Initializing TensorRT VAE Decoder...")
 # Initialize with default VAE
 initialize_vae_decoder(use_taehv=False, use_trt=args.trt)
+pipeline = CausalInferencePipeline(
+    config, device=gpu, generator=transformer, text_encoder=text_encoder,
+    vae=APP_STATE["current_vae_decoder"]
+)
+pipeline.to(gpu)
 # --- Additional Outputs Handler ---
 def handle_additional_outputs(status_html_update, video_update, webrtc_output):
     """Pass the three update values through unchanged.

     Returns:
         A 3-tuple ``(status_html_update, video_update, webrtc_output)`` in
         the same order the arguments were received.
     """
     forwarded = (status_html_update, video_update, webrtc_output)
     return forwarded
 # --- FastRTC Video Generation Handler ---
 @torch.no_grad()
 @spaces.GPU
+def video_generation_handler(prompt, seed, progress=gr.Progress()):
     """
     Generator function that yields BGR NumPy frames for real-time streaming.
     Returns cleanly when done - no infinite loops.
     """
     if seed == -1:
         seed = random.randint(0, 2**32 - 1)
     print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
     print("🔤 Encoding text prompt...")
     conditional_dict = text_encoder(text_prompts=[prompt])
     for key, value in conditional_dict.items():
     # --- Generation Loop ---
     rnd = torch.Generator(gpu).manual_seed(int(seed))
+    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
+    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
     noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
     vae_cache, latents_cache = None, None
     if not APP_STATE["current_use_taehv"] and not args.trt:
         vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
     num_blocks = 7
     current_start_frame = 0
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
                 status_html = (
                     f"<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; font-family: sans-serif;'>"
                     f"  <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>Generating Video...</p>"
                     f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
                     f"    <div style='width: {frame_progress:.1f}%; height: 20px; background-color: #0d6efd; transition: width 0.2s;'></div>"
                     f"  </div>"
             with gr.Accordion("⚙️ Performance Options", open=False):
                 gr.Markdown("*These optimizations are applied once per session*")
             start_btn = gr.Button("🎬 Start Generation", variant="primary", size="lg")
         with gr.Column(scale=3):
     # Connect the generator to the WebRTC stream
     webrtc_output.stream(
         fn=video_generation_handler,
+        inputs=[prompt, seed],
         outputs=[webrtc_output],
         time_limit=300,  # 5 minutes max
         trigger=start_btn.click,