multimodalart (HF Staff) committed
Commit b720739 · verified
1 Parent(s): e37adb1

Update app.py

Files changed (1)
  1. app.py +427 -0
app.py CHANGED
@@ -0,0 +1,427 @@
+from huggingface_hub import snapshot_download, hf_hub_download
+
+snapshot_download(
+    repo_id="Wan-AI/Wan2.1-T2V-1.3B",
+    local_dir="wan_models/Wan2.1-T2V-1.3B",
+    local_dir_use_symlinks=False,
+    resume_download=True,
+    repo_type="model"
+)
+
+hf_hub_download(
+    repo_id="gdhe17/Self-Forcing",
+    filename="checkpoints/self_forcing_dmd.pt",
+    local_dir=".",
+    local_dir_use_symlinks=False
+)
+
+import os
+import re
+import random
+import argparse
+import hashlib
+import urllib.request
+from PIL import Image
+import spaces
+import numpy as np
+import torch
+import gradio as gr
+from omegaconf import OmegaConf
+from tqdm import tqdm
+import imageio  # Added for final video rendering
+
+# FastRTC imports
+from fastrtc import WebRTC, get_turn_credentials
+from fastrtc.utils import AdditionalOutputs, CloseStream
+
+# Original project imports
+from pipeline import CausalInferencePipeline
+from demo_utils.constant import ZERO_VAE_CACHE
+from demo_utils.vae_block3 import VAEDecoderWrapper
+from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
+from demo_utils.memory import gpu, get_cuda_free_memory_gb, DynamicSwapInstaller
+
+# --- Argument Parsing ---
+parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with FastRTC")
+parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
+parser.add_argument('--host', type=str, default='0.0.0.0', help="Host to bind the Gradio app to.")
+parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt', help="Path to the model checkpoint.")
+parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml', help="Path to the model config.")
+parser.add_argument('--share', action='store_true', help="Create a public Gradio link.")
+parser.add_argument('--trt', action='store_true', help="Use TensorRT optimized VAE decoder.")
+args = parser.parse_args()
+
+# --- Global Setup & Model Loading ---
+print(f"CUDA device: {gpu}")
+print(f'Initial Free VRAM: {get_cuda_free_memory_gb(gpu):.2f} GB')
+LOW_MEMORY = get_cuda_free_memory_gb(gpu) < 40
+
+# Load configs
+try:
+    config = OmegaConf.load(args.config_path)
+    default_config = OmegaConf.load("configs/default_config.yaml")
+    config = OmegaConf.merge(default_config, config)
+except FileNotFoundError as e:
+    print(f"Error loading config file: {e}\nPlease ensure config files are in the correct path.")
+    exit(1)
+
+# Initialize Models
+print("Initializing models...")
+text_encoder = WanTextEncoder()
+transformer = WanDiffusionWrapper(is_causal=True)
+
+try:
+    state_dict = torch.load(args.checkpoint_path, map_location="cpu")
+    transformer.load_state_dict(state_dict.get('generator_ema', state_dict.get('generator')))
+except FileNotFoundError as e:
+    print(f"Error loading checkpoint: {e}\nPlease ensure the checkpoint '{args.checkpoint_path}' exists.")
+    exit(1)
+
+# Prepare models for inference
+text_encoder.eval().to(dtype=torch.bfloat16).requires_grad_(False)
+transformer.eval().to(dtype=torch.float16).requires_grad_(False)
+
+if LOW_MEMORY:
+    print("Low memory mode enabled. Using dynamic model swapping.")
+    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+else:
+    text_encoder.to(gpu)
+transformer.to(gpu)
+
+# --- VAE Decoder Management ---
+APP_STATE = {
+    "torch_compile_applied": False,
+    "fp8_applied": False,
+    "current_use_taehv": False,
+    "current_vae_decoder": None,
+}
+
+def initialize_vae_decoder(use_taehv=False, use_trt=False):
+    global APP_STATE
+
+    if use_trt:
+        from demo_utils.vae import VAETRTWrapper
+        print("Initializing TensorRT VAE Decoder...")
+        vae_decoder = VAETRTWrapper()
+        APP_STATE["current_use_taehv"] = False
+    elif use_taehv:
+        print("Initializing TAEHV VAE Decoder...")
+        from demo_utils.taehv import TAEHV
+        taehv_checkpoint_path = "checkpoints/taew2_1.pth"
+        if not os.path.exists(taehv_checkpoint_path):
+            print(f"Downloading TAEHV checkpoint to {taehv_checkpoint_path}...")
+            os.makedirs("checkpoints", exist_ok=True)
+            download_url = "https://github.com/madebyollin/taehv/raw/main/taew2_1.pth"
+            try:
+                urllib.request.urlretrieve(download_url, taehv_checkpoint_path)
+            except Exception as e:
+                raise RuntimeError(f"Failed to download taew2_1.pth: {e}")
+
+        class DotDict(dict): __getattr__ = dict.get
+
+        class TAEHVDiffusersWrapper(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.dtype = torch.float16
+                self.taehv = TAEHV(checkpoint_path=taehv_checkpoint_path).to(self.dtype)
+                self.config = DotDict(scaling_factor=1.0)
+            def decode(self, latents, return_dict=None):
+                return self.taehv.decode_video(latents, parallel=not LOW_MEMORY).mul_(2).sub_(1)
+
+        vae_decoder = TAEHVDiffusersWrapper()
+        APP_STATE["current_use_taehv"] = True
+    else:
+        print("Initializing Default VAE Decoder...")
+        vae_decoder = VAEDecoderWrapper()
+        try:
+            vae_state_dict = torch.load('wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth', map_location="cpu")
+            decoder_state_dict = {k: v for k, v in vae_state_dict.items() if 'decoder.' in k or 'conv2' in k}
+            vae_decoder.load_state_dict(decoder_state_dict)
+        except FileNotFoundError:
+            print("Warning: Default VAE weights not found.")
+        APP_STATE["current_use_taehv"] = False
+
+    vae_decoder.eval().to(dtype=torch.float16).requires_grad_(False).to(gpu)
+    APP_STATE["current_vae_decoder"] = vae_decoder
+    print(f"✅ VAE decoder initialized: {'TAEHV' if use_taehv else 'Default VAE'}")
+
+# Initialize with default VAE
+initialize_vae_decoder(use_taehv=False, use_trt=args.trt)
+
+# --- Additional Outputs Handler ---
+def handle_additional_outputs(status_html_update, video_update, webrtc_output):
+    return status_html_update, video_update, webrtc_output
+
+# --- FastRTC Video Generation Handler ---
+@torch.no_grad()
+@spaces.GPU
+def video_generation_handler(prompt, seed, enable_torch_compile, enable_fp8, use_taehv, progress=gr.Progress()):
+    """
+    Generator function that yields BGR NumPy frames for real-time streaming.
+    Returns cleanly when done - no infinite loops.
+    """
+    global APP_STATE
+
+    if seed == -1:
+        seed = random.randint(0, 2**32 - 1)
+
+    print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
+
+    # --- Model & Pipeline Configuration ---
+    if use_taehv != APP_STATE["current_use_taehv"]:
+        print(f"🔄 Switching VAE to {'TAEHV' if use_taehv else 'Default VAE'}")
+        initialize_vae_decoder(use_taehv=use_taehv, use_trt=args.trt)
+
+    pipeline = CausalInferencePipeline(
+        config, device=gpu, generator=transformer, text_encoder=text_encoder,
+        vae=APP_STATE["current_vae_decoder"]
+    )
+
+    if enable_fp8 and not APP_STATE["fp8_applied"]:
+        print("⚡ Applying FP8 Quantization...")
+        from torchao.quantization.quant_api import quantize_, Float8DynamicActivationFloat8Weight, PerTensor
+        quantize_(pipeline.generator.model, Float8DynamicActivationFloat8Weight(granularity=PerTensor()))
+        APP_STATE["fp8_applied"] = True
+
+    if enable_torch_compile and not APP_STATE["torch_compile_applied"]:
+        print("🔥 Applying torch.compile (this may take a few minutes)...")
+        pipeline.generator.model = torch.compile(pipeline.generator.model, mode="max-autotune-no-cudagraphs")
+        if not use_taehv and not LOW_MEMORY and not args.trt:
+            pipeline.vae.decoder = torch.compile(pipeline.vae.decoder, mode="max-autotune-no-cudagraphs")
+        APP_STATE["torch_compile_applied"] = True
+
+    print("🔤 Encoding text prompt...")
+    conditional_dict = text_encoder(text_prompts=[prompt])
+    for key, value in conditional_dict.items():
+        conditional_dict[key] = value.to(dtype=torch.float16)
+
+    # --- Generation Loop ---
+    rnd = torch.Generator(gpu).manual_seed(int(seed))
+    pipeline._initialize_kv_cache(1, torch.float16, gpu)
+    pipeline._initialize_crossattn_cache(1, torch.float16, gpu)
+    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
+
+    vae_cache, latents_cache = None, None
+    if not APP_STATE["current_use_taehv"] and not args.trt:
+        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
+
+    num_blocks = 7
+    current_start_frame = 0
+    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
+
+    total_frames_yielded = 0
+    all_frames_for_video = []  # To collect frames for final video
+
+    for idx, current_num_frames in enumerate(all_num_frames):
+        print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
+
+        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
+
+        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
+            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
+            _, denoised_pred = pipeline.generator(
+                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
+                timestep=timestep, kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length
+            )
+            if step_idx < len(pipeline.denoising_step_list) - 1:
+                next_timestep = pipeline.denoising_step_list[step_idx + 1]
+                noisy_input = pipeline.scheduler.add_noise(
+                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
+                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
+                ).unflatten(0, denoised_pred.shape[:2])
+
+        if idx < len(all_num_frames) - 1:
+            pipeline.generator(
+                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
+                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length,
+            )
+
+        # Decode to pixels
+        if args.trt:
+            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
+        elif APP_STATE["current_use_taehv"]:
+            if latents_cache is None:
+                latents_cache = denoised_pred
+            else:
+                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
+                latents_cache = denoised_pred[:, -3:]
+            pixels = pipeline.vae.decode(denoised_pred)
+        else:
+            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
+
+        # Handle frame skipping for first block
+        if idx == 0 and not args.trt:
+            pixels = pixels[:, 3:]
+        elif APP_STATE["current_use_taehv"] and idx > 0:
+            pixels = pixels[:, 12:]
+
+        print(f"📹 Decoded pixels shape: {pixels.shape}")
+
+        # Yield individual frames WITH status updates
+        for frame_idx in range(pixels.shape[1]):
+            frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
+
+            # Normalize from [-1, 1] to [0, 255]
+            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
+            frame_np = frame_np.to(torch.uint8).cpu().numpy()
+
+            # Convert from CHW to HWC format
+            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
+
+            all_frames_for_video.append(frame_np)
+
+            # Convert RGB to BGR for FastRTC (OpenCV format)
+            frame_bgr = frame_np[:, :, ::-1]  # RGB -> BGR
+
+            total_frames_yielded += 1
+            print(f"📺 Yielding frame {total_frames_yielded}: shape {frame_bgr.shape}, dtype {frame_bgr.dtype}")
+
+            # Calculate progress
+            total_expected_frames = num_blocks * pipeline.num_frame_per_block
+            current_frame_count = (idx * pipeline.num_frame_per_block) + frame_idx + 1
+            frame_progress = 100 * (current_frame_count / total_expected_frames)
+
+            # --- REVISED HTML START ---
+            if frame_idx == pixels.shape[1] - 1 and idx + 1 == num_blocks:  # last frame
+                status_html = (
+                    f"<div style='padding: 16px; border: 1px solid #198754; background-color: #d1e7dd; border-radius: 8px; font-family: sans-serif; text-align: center;'>"
+                    f" <h4 style='margin: 0 0 8px 0; color: #0f5132; font-size: 18px;'>🎉 Generation Complete!</h4>"
+                    f" <p style='margin: 0; color: #0f5132;'>"
+                    f" Total frames: {total_frames_yielded}. The final video is now available."
+                    f" </p>"
+                    f"</div>"
+                )
+
+                print("💾 Saving final rendered video...")
+                video_update = gr.update()  # Default to no-op
+                try:
+                    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
+                    imageio.mimwrite(video_path, all_frames_for_video, fps=15, quality=8)
+                    print(f"✅ Video saved to {video_path}")
+                    video_update = gr.update(value=video_path, visible=True)
+                except Exception as e:
+                    print(f"⚠️ Could not save final video: {e}")
+
+                yield frame_bgr, AdditionalOutputs(status_html, video_update, gr.update(visible=False))
+                yield CloseStream("🎉 Video generation completed successfully!")
+                return
+            else:  # Regular frames - simpler status
+                status_html = (
+                    f"<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; font-family: sans-serif;'>"
+                    f" <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>Generating Video...</p>"
+                    # Correctly implemented progress bar
+                    f" <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
+                    f" <div style='width: {frame_progress:.1f}%; height: 20px; background-color: #0d6efd; transition: width 0.2s;'></div>"
+                    f" </div>"
+                    f" <p style='margin: 8px 0 0 0; color: #555; font-size: 14px; text-align: right;'>"
+                    f" Block {idx+1}/{num_blocks}   |   Frame {total_frames_yielded}   |   {frame_progress:.1f}%"
+                    f" </p>"
+                    f"</div>"
+                )
+            # --- REVISED HTML END ---
+
+            yield frame_bgr, AdditionalOutputs(status_html, gr.update(visible=False), gr.update(visible=True))
+
+        current_start_frame += current_num_frames
+
+    print(f"✅ Video generation completed! Total frames yielded: {total_frames_yielded}")
+
+    # Signal completion
+    yield CloseStream("🎉 Video generation completed successfully!")
+
+# --- Gradio UI Layout ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing FastRTC Demo") as demo:
+    gr.Markdown("# 🚀 Self-Forcing Video Generation with FastRTC Streaming")
+    gr.Markdown("*Real-time video generation streaming via WebRTC*")
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("### 📝 Configure Generation")
+            with gr.Group():
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    placeholder="A stylish woman walks down a Tokyo street...",
+                    lines=4,
+                    value="A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage."
+                )
+                gr.Examples(
+                    examples=[
+                        "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse.",
+                        "A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves.",
+                        "A drone shot of a surfer riding a wave on a sunny day. The camera follows the surfer as they carve through the water.",
+                    ],
+                    inputs=[prompt]
+                )
+
+            with gr.Row():
+                seed = gr.Number(label="Seed", value=-1, info="Use -1 for a random seed.")
+
+            with gr.Accordion("⚙️ Performance Options", open=False):
+                gr.Markdown("*These optimizations are applied once per session*")
+                with gr.Row():
+                    torch_compile_toggle = gr.Checkbox(label="🔥 torch.compile", value=False)
+                    fp8_toggle = gr.Checkbox(label="⚡ FP8 Quantization", value=False, visible=not args.trt)
+                    taehv_toggle = gr.Checkbox(label="⚡ TAEHV VAE", value=False, visible=not args.trt)
+
+            start_btn = gr.Button("🎬 Start Generation", variant="primary", size="lg")
+
+        with gr.Column(scale=3):
+            gr.Markdown("### 📺 Live Video Stream")
+            gr.Markdown("*Click 'Start Generation' to begin streaming*")
+
+            try:
+                rtc_config = get_turn_credentials()
+            except Exception as e:
+                print(f"Warning: Could not get TURN credentials: {e}")
+                rtc_config = None
+
+            webrtc_output = WebRTC(
+                label="Generated Video Stream",
+                modality="video",
+                mode="receive",  # Server sends video to client
+                height=480,
+                width=832,
+                rtc_configuration=rtc_config,
+                elem_id="video_stream"
+            )
+
+            final_video = gr.Video(label="Final Rendered Video", visible=False, interactive=False)
+
+            status_html = gr.HTML(
+                value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
+                label="Generation Status"
+            )
+
+
+
+    # Connect the generator to the WebRTC stream
+    webrtc_output.stream(
+        fn=video_generation_handler,
+        inputs=[prompt, seed, torch_compile_toggle, fp8_toggle, taehv_toggle],
+        outputs=[webrtc_output],
+        time_limit=300,  # 5 minutes max
+        trigger=start_btn.click,
+    )
+    # MODIFIED: Handle additional outputs (status updates AND final video)
+    webrtc_output.on_additional_outputs(
+        fn=handle_additional_outputs,
+        outputs=[status_html, final_video, webrtc_output]
+    )
+
+# --- Launch App ---
+if __name__ == "__main__":
+    if os.path.exists("gradio_tmp"):
+        import shutil
+        shutil.rmtree("gradio_tmp")
+    os.makedirs("gradio_tmp", exist_ok=True)
+
+    demo.queue().launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+        show_error=True
+    )