Commit b55bb25
Parent(s): f5f96d3
ok

- app.py +242 -159
- app_last_working.py +0 -460
- demo.py +0 -631
- utils/wan_wrapper.py +11 -5
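The heart of the app.py change below is that every model, checkpoint, and LoRA path is now derived from a DATA_ROOT environment variable (default "."), so the Space can point at a persistent volume. A minimal sketch of where the new paths resolve, assuming a hypothetical DATA_ROOT of /data (the value is illustrative, not part of the commit):

import os

os.environ["DATA_ROOT"] = "/data"  # hypothetical persistent volume; the code defaults to "."
DATA_ROOT = os.path.normpath(os.getenv("DATA_ROOT", "."))

print(os.path.join(DATA_ROOT, "wan_models", "Wan2.1-T2V-1.3B"))
# -> /data/wan_models/Wan2.1-T2V-1.3B (snapshot_download target)
print(os.path.join(DATA_ROOT, "other_models", "checkpoints", "self_forcing_dmd.pt"))
# -> /data/other_models/checkpoints/self_forcing_dmd.pt (default --checkpoint_path)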
app.py
CHANGED
@@ -2,11 +2,17 @@ import subprocess
 # not sure why it works in the original space but says "pip not found" in mine
 #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
+import os
 from huggingface_hub import snapshot_download, hf_hub_download
 
+# Configuration for data paths
+DATA_ROOT = os.path.normpath(os.getenv('DATA_ROOT', '.'))
+WAN_MODELS_PATH = os.path.join(DATA_ROOT, 'wan_models')
+OTHER_MODELS_PATH = os.path.join(DATA_ROOT, 'other_models')
+
 snapshot_download(
     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
-    local_dir="wan_models/Wan2.1-T2V-1.3B",
+    local_dir=os.path.join(WAN_MODELS_PATH, "Wan2.1-T2V-1.3B"),
     local_dir_use_symlinks=False,
     resume_download=True,
     repo_type="model"
@@ -15,11 +21,9 @@ snapshot_download(
 hf_hub_download(
     repo_id="gdhe17/Self-Forcing",
     filename="checkpoints/self_forcing_dmd.pt",
-    local_dir=".",
+    local_dir=OTHER_MODELS_PATH,
     local_dir_use_symlinks=False
 )
-
-import os
 import re
 import random
 import argparse
@@ -34,6 +38,10 @@ from tqdm import tqdm
 import imageio
 import av
 import uuid
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Tuple, Union
 
 from pipeline import CausalInferencePipeline
 from demo_utils.constant import ZERO_VAE_CACHE
@@ -45,11 +53,25 @@ import numpy as np
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+# LoRA Storage Configuration
+STORAGE_PATH = Path(DATA_ROOT) / "storage"
+LORA_PATH = STORAGE_PATH / "loras"
+OUTPUT_PATH = STORAGE_PATH / "output"
+
+# Create necessary directories
+STORAGE_PATH.mkdir(parents=True, exist_ok=True)
+LORA_PATH.mkdir(parents=True, exist_ok=True)
+OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+
+# Global variables for LoRA management
+current_lora_id = None
+current_lora_path = None
+
 # --- Argument Parsing ---
 parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")
 parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
 parser.add_argument('--host', type=str, default='0.0.0.0', help="Host to bind the Gradio app to.")
-parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt', help="Path to the model checkpoint.")
+parser.add_argument("--checkpoint_path", type=str, default=os.path.join(OTHER_MODELS_PATH, 'checkpoints', 'self_forcing_dmd.pt'), help="Path to the model checkpoint.")
 parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml', help="Path to the model config.")
 parser.add_argument('--share', action='store_true', help="Create a public Gradio link.")
 parser.add_argument('--trt', action='store_true', help="Use TensorRT optimized VAE decoder.")
@@ -107,6 +129,89 @@
     APP_STATE["torch_compile_applied"] = True
     print("✅ torch.compile applied to transformer")
 
+def upload_lora_file(file: tempfile._TemporaryFileWrapper) -> Tuple[str, str]:
+    """Upload a LoRA file and return a hash-based ID for future reference"""
+    if file is None:
+        return "", ""
+
+    try:
+        # Calculate SHA256 hash of the file
+        sha256_hash = hashlib.sha256()
+        with open(file.name, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                sha256_hash.update(chunk)
+        file_hash = sha256_hash.hexdigest()
+
+        # Create destination path using hash
+        dest_path = LORA_PATH / f"{file_hash}.safetensors"
+
+        # Check if file already exists
+        if dest_path.exists():
+            print("LoRA file already exists!")
+            return file_hash, file_hash
+
+        # Copy the file to the destination
+        shutil.copy(file.name, dest_path)
+
+        print("LoRA file uploaded!")
+        return file_hash, file_hash
+    except Exception as e:
+        print(f"Error uploading LoRA file: {e}")
+        raise gr.Error(f"Failed to upload LoRA file: {str(e)}")
+
+def get_lora_file_path(lora_id: Optional[str]) -> Optional[Path]:
+    """Get the path to a LoRA file from its hash-based ID"""
+    if not lora_id:
+        return None
+
+    # Check if file exists
+    lora_path = LORA_PATH / f"{lora_id}.safetensors"
+    if lora_path.exists():
+        return lora_path
+
+    return None
+
+def manage_lora_weights(lora_id: Optional[str], lora_weight: float) -> Tuple[bool, Optional[Path]]:
+    """Manage LoRA weights for the transformer model"""
+    global current_lora_id, current_lora_path
+
+    # Determine if we should use LoRA
+    using_lora = lora_id is not None and lora_id.strip() != "" and lora_weight > 0
+
+    # If not using LoRA but we have one loaded, clear it
+    if not using_lora and current_lora_id is not None:
+        print("Clearing current LoRA")
+        current_lora_id = None
+        current_lora_path = None
+        return False, None
+
+    # If using LoRA, check if we need to change weights
+    if using_lora:
+        lora_path = get_lora_file_path(lora_id)
+
+        if not lora_path:
+            print("No LoRA file with this ID was found. Using base model instead.")
+
+            # If we had a LoRA loaded, clear it
+            if current_lora_id is not None:
+                print("Clearing current LoRA")
+                current_lora_id = None
+                current_lora_path = None
+
+            return False, None
+
+        # If LoRA ID changed, update
+        if lora_id != current_lora_id:
+            print("Loading LoRA...")
+            current_lora_id = lora_id
+            current_lora_path = lora_path
+        else:
+            print("Using a LoRA!")
+
+        return True, lora_path
+
+    return False, None
+
 def frames_to_ts_file(frames, filepath, fps = 15):
     """
     Convert frames directly to .ts file using PyAV.
@@ -193,7 +298,7 @@ def initialize_vae_decoder(use_taehv=False, use_trt=False):
         print("Initializing Default VAE Decoder...")
         vae_decoder = VAEDecoderWrapper()
         try:
-            vae_state_dict = torch.load('wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth', map_location="cpu")
+            vae_state_dict = torch.load(os.path.join(WAN_MODELS_PATH, 'Wan2.1-T2V-1.3B', 'Wan2.1_VAE.pth'), map_location="cpu")
             decoder_state_dict = {k: v for k, v in vae_state_dict.items() if 'decoder.' in k or 'conv2' in k}
             vae_decoder.load_state_dict(decoder_state_dict)
         except FileNotFoundError:
@@ -222,26 +327,22 @@ pipeline = CausalInferencePipeline(
 pipeline.to(dtype=torch.float16).to(gpu)
 
 @torch.no_grad()
-def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, height=224, duration=5, …
+def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, height=224, duration=5, lora_id=None, lora_weight=0.0):
     """
     Generator function that yields .ts video chunks using PyAV for streaming.
-    Now optimized for block-based processing with smart buffering.
     """
     if seed == -1:
         seed = random.randint(0, 2**32 - 1)
 
-    print(f"🎬 Starting PyAV streaming: …
-
-    # …
-    … (truncated in the diff view)
-    )
-    yield None, buffering_status_html
-
+    # print(f"🎬 Starting PyAV streaming: seed: {seed}, duration: {duration}s")
+
+    # Handle LoRA weights
+    using_lora, lora_path = manage_lora_weights(lora_id, lora_weight)
+    if using_lora:
+        print(f"🎨 Using LoRA with weight factor {lora_weight}")
+    else:
+        print("🎨 Using base model (no LoRA)")
+
     # Setup
     conditional_dict = text_encoder(text_prompts=[prompt])
     for key, value in conditional_dict.items():
@@ -260,7 +361,7 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, heigh…
     # Current setup generates approximately 5 seconds with 7 blocks
     # So we scale proportionally
     base_duration = 5.0  # seconds
-    base_blocks = …
+    base_blocks = 8
    num_blocks = max(1, int(base_blocks * duration / base_duration))
 
     current_start_frame = 0
@@ -270,13 +371,7 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, heigh…
 
     # Ensure temp directory exists
     os.makedirs("gradio_tmp", exist_ok=True)
-
-    # Buffer management - collect chunks before streaming
-    buffer_chunks = []
-    buffer_duration = 0.0
-    frames_per_second = fps
-    streaming_started = False
-
+
     # Generation loop
     for idx, current_num_frames in enumerate(all_num_frames):
         print(f"📦 Processing block {idx+1}/{num_blocks}")
@@ -375,45 +470,11 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, heigh…
 
             frames_to_ts_file(all_frames_from_block, ts_path, fps)
 
-            # Calculate …
-            … (truncated in the diff view)
+            # Calculate final progress for this block
+            total_progress = (idx + 1) / num_blocks * 100
 
-            # …
-            … (truncated in the diff view)
-            buffer_duration += chunk_duration
-
-            # Check if we have enough buffered content to start streaming
-            if not streaming_started and buffer_duration >= buffering:
-                print(f"🚀 Buffer filled ({buffer_duration:.2f}s >= {buffering}s), starting stream!")
-                streaming_started = True
-
-                # Stream all buffered chunks
-                for buffered_chunk in buffer_chunks:
-                    yield buffered_chunk, gr.update()
-
-                # Clear buffer since we've streamed it
-                buffer_chunks.clear()
-                buffer_duration = 0.0
-
-            elif streaming_started:
-                # Stream immediately if we're already streaming
-                yield ts_path, gr.update()
-            elif buffering == 0:
-                # No buffering requested, stream immediately
-                yield ts_path, gr.update()
-            else:
-                # Still buffering, show progress
-                buffer_progress = (buffer_duration / buffering) * 100
-                buffering_progress_html = (
-                    f"<div style='padding: 10px; border: 1px solid #ffc107; background: #fff3cd; border-radius: 8px; font-family: sans-serif;'>"
-                    f"  <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>⏳ Buffering... ({buffer_duration:.1f}s/{buffering}s)</p>"
-                    f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
-                    f"    <div style='width: {buffer_progress:.1f}%; height: 20px; background-color: #ffc107; transition: width 0.2s;'></div>"
-                    f"  </div>"
-                    f"  <p style='margin: 4px 0 0 0; color: #856404; font-size: 14px;'>Generating content for smooth playback...</p>"
-                    f"</div>"
-                )
-                yield None, buffering_progress_html
+            # Yield the actual video chunk
+            yield ts_path, gr.update()
 
         except Exception as e:
             print(f"⚠️ Error encoding block {idx}: {e}")
@@ -422,12 +483,6 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, heigh…
 
         current_start_frame += current_num_frames
 
-    # Stream any remaining buffered content
-    if buffer_chunks:
-        print(f"🎬 Streaming remaining {len(buffer_chunks)} buffered chunks")
-        for buffered_chunk in buffer_chunks:
-            yield buffered_chunk, gr.update()
-
     # Final completion status
     final_status_html = (
         f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
@@ -449,104 +504,132 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15, width=400, heigh…
     print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
 
 # --- Gradio UI Layout ---
-with gr.Blocks(title="Self-Forcing …
-    gr.Markdown("# 🚀 …
-    gr.Markdown("Real-time video generation with distilled Wan2-1 1.3B [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
 
-    with gr.…
-    … (truncated in the diff view)
-                placeholder="A stylish woman walks down a Tokyo street...",
-                lines=4,
-                value=""
-            )
-
-            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
 
-            gr.Markdown("### ⚙️ Settings")
             with gr.Row():
-            … (truncated in the diff view)
-                value=-1,
-                info="Use -1 for random seed",
-                precision=0
-            )
-            fps = gr.Slider(
-                label="Playback FPS",
-                minimum=1,
-                maximum=30,
-                value=args.fps,
-                step=1,
-                visible=False,
-                info="Frames per second for playback"
-            )
-
             with gr.Row():
-            … (truncated in the diff view)
-                value=5,
-                step=1,
-                info="Video duration in seconds"
-            )
-            buffering = gr.Slider(
-                label="Buffering (seconds)",
-                minimum=0,
-                maximum=5,
-                value=2,
-                step=0.5,
-                info="Wait time before starting stream"
-            )
-
             with gr.Row():
-            … (old width/height sliders and video-stream column, truncated in the diff view)
+with gr.Blocks(title="Wan2.1 1.3B LoRA Self-Forcing streaming demo") as demo:
+    gr.Markdown("# 🚀 Run Any LoRA in near real-time!")
+    gr.Markdown("Real-time video generation with distilled Wan2-1 1.3B and LoRA [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
+
+    with gr.Tabs():
+        # LoRA Upload Tab
+        with gr.TabItem("1️⃣ Upload LoRA"):
+            gr.Markdown("## Upload LoRA Weights")
+            gr.Markdown("Upload your custom LoRA weights file to use for generation. The file will be automatically stored and you'll receive a unique hash-based ID.")
+
+            with gr.Row():
+                lora_file = gr.File(label="LoRA File (safetensors format)")
+
+            with gr.Row():
+                lora_id_output = gr.Textbox(label="LoRA Hash ID (use this in the generation tab)", interactive=False)
+
+        # Video Generation Tab
+        with gr.TabItem("2️⃣ Generate Video"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    with gr.Group():
+                        prompt = gr.Textbox(
+                            label="Prompt",
+                            placeholder="A stylish woman walks down a Tokyo street...",
+                            lines=4,
+                            value=""
+                        )
+
+                    start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
+
+                    gr.Markdown("### ⚙️ Settings")
+                    with gr.Row():
+                        seed = gr.Number(
+                            label="Seed",
+                            value=-1,
+                            info="Use -1 for random seed",
+                            precision=0
+                        )
+                        fps = gr.Slider(
+                            label="Playback FPS",
+                            minimum=1,
+                            maximum=30,
+                            value=args.fps,
+                            step=1,
+                            visible=False,
+                            info="Frames per second for playback"
+                        )
+
+                    with gr.Row():
+                        duration = gr.Slider(
+                            label="Duration (seconds)",
+                            minimum=1,
+                            maximum=5,
+                            value=3,
+                            step=1,
+                            info="Video duration in seconds"
+                        )
+
+                    with gr.Row():
+                        width = gr.Slider(
+                            label="Width",
+                            minimum=224,
+                            maximum=720,
+                            value=400,
+                            step=8,
+                            info="Video width in pixels (8px steps)"
+                        )
+                        height = gr.Slider(
+                            label="Height",
+                            minimum=224,
+                            maximum=720,
+                            value=224,
+                            step=8,
+                            info="Video height in pixels (8px steps)"
+                        )
+
+                    gr.Markdown("### 🎨 LoRA Settings")
+                    lora_id = gr.Textbox(
+                        label="LoRA ID (from upload tab)",
+                        placeholder="Enter your LoRA ID here...",
+                    )
+
+                    lora_weight = gr.Slider(
+                        label="LoRA Weight",
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.01,
+                        value=1.0,
+                        info="Strength of LoRA influence"
+                    )
+
+                with gr.Column(scale=3):
+                    gr.Markdown("### 📺 Video Stream")
+
+                    streaming_video = gr.Video(
+                        label="Live Stream",
+                        streaming=True,
+                        loop=True,
+                        height=400,
+                        autoplay=True,
+                        show_label=False
+                    )
+
+                    status_display = gr.HTML(
+                        value=(
+                            "<div style='text-align: center; padding: 20px; color: #666; border: 1px dashed #ddd; border-radius: 8px;'>"
+                            "🎬 Ready to start streaming...<br>"
+                            "<small>Configure your prompt and click 'Start Streaming'</small>"
+                            "</div>"
+                        ),
+                        label="Generation Status"
+                    )
 
     # Connect the generator to the streaming video
     start_btn.click(
         fn=video_generation_handler_streaming,
-        inputs=[prompt, seed, fps, width, height, duration, …
+        inputs=[prompt, seed, fps, width, height, duration, lora_id, lora_weight],
         outputs=[streaming_video, status_display]
     )
+
+    # Connect LoRA upload to both display fields
+    lora_file.change(
+        fn=upload_lora_file,
+        inputs=[lora_file],
+        outputs=[lora_id_output, lora_id]
+    )
 
 
 # --- Launch App ---
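For reference, the upload flow above is content-addressed: the SHA-256 digest of the uploaded file is both the storage filename and the ID the user pastes into the generation tab. A small sketch of computing that ID outside the app (the helper name lora_id_for and the sample filename are illustrative, not part of the commit):

import hashlib
import os
from pathlib import Path

DATA_ROOT = Path(os.path.normpath(os.getenv("DATA_ROOT", ".")))
LORA_PATH = DATA_ROOT / "storage" / "loras"

def lora_id_for(file_path: str) -> str:
    """Compute the hash-based ID that upload_lora_file() returns for this file."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # Hash in 4 KiB chunks, exactly as the app does
        for chunk in iter(lambda: f.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()

lora_id = lora_id_for("my_lora.safetensors")  # hypothetical local file
print(LORA_PATH / f"{lora_id}.safetensors")   # where the app stores it

Re-uploading the same file therefore dedupes to the same ID, which is why upload_lora_file() short-circuits when the destination already exists.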
app_last_working.py
DELETED
@@ -1,460 +0,0 @@
-import subprocess
-# not sure why it works in the original space but says "pip not found" in mine
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
-from huggingface_hub import snapshot_download, hf_hub_download
-
-snapshot_download(
-    repo_id="Wan-AI/Wan2.1-T2V-1.3B",
-    local_dir="wan_models/Wan2.1-T2V-1.3B",
-    local_dir_use_symlinks=False,
-    resume_download=True,
-    repo_type="model"
-)
-
-hf_hub_download(
-    repo_id="gdhe17/Self-Forcing",
-    filename="checkpoints/self_forcing_dmd.pt",
-    local_dir=".",
-    local_dir_use_symlinks=False
-)
-
-import os
-import re
-import random
-import argparse
-import hashlib
-import urllib.request
-import time
-from PIL import Image
-import torch
-import gradio as gr
-from omegaconf import OmegaConf
-from tqdm import tqdm
-import imageio
-import av
-import uuid
-
-from pipeline import CausalInferencePipeline
-from demo_utils.constant import ZERO_VAE_CACHE
-from demo_utils.vae_block3 import VAEDecoderWrapper
-from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
-
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM #, BitsAndBytesConfig
-import numpy as np
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# --- Argument Parsing ---
-parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")
-parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
-parser.add_argument('--host', type=str, default='0.0.0.0', help="Host to bind the Gradio app to.")
-parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt', help="Path to the model checkpoint.")
-parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml', help="Path to the model config.")
-parser.add_argument('--share', action='store_true', help="Create a public Gradio link.")
-parser.add_argument('--trt', action='store_true', help="Use TensorRT optimized VAE decoder.")
-parser.add_argument('--fps', type=float, default=15.0, help="Playback FPS for frame streaming.")
-args = parser.parse_args()
-
-gpu = "cuda"
-
-try:
-    config = OmegaConf.load(args.config_path)
-    default_config = OmegaConf.load("configs/default_config.yaml")
-    config = OmegaConf.merge(default_config, config)
-except FileNotFoundError as e:
-    print(f"Error loading config file: {e}\n. Please ensure config files are in the correct path.")
-    exit(1)
-
-# Initialize Models
-print("Initializing models...")
-text_encoder = WanTextEncoder()
-transformer = WanDiffusionWrapper(is_causal=True)
-
-try:
-    state_dict = torch.load(args.checkpoint_path, map_location="cpu")
-    transformer.load_state_dict(state_dict.get('generator_ema', state_dict.get('generator')))
-except FileNotFoundError as e:
-    print(f"Error loading checkpoint: {e}\nPlease ensure the checkpoint '{args.checkpoint_path}' exists.")
-    exit(1)
-
-text_encoder.eval().to(dtype=torch.float16).requires_grad_(False)
-transformer.eval().to(dtype=torch.float16).requires_grad_(False)
-
-text_encoder.to(gpu)
-transformer.to(gpu)
-
-APP_STATE = {
-    "torch_compile_applied": False,
-    "fp8_applied": False,
-    "current_use_taehv": False,
-    "current_vae_decoder": None,
-}
-
-def frames_to_ts_file(frames, filepath, fps = 15):
-    """
-    Convert frames directly to .ts file using PyAV.
-
-    Args:
-        frames: List of numpy arrays (HWC, RGB, uint8)
-        filepath: Output file path
-        fps: Frames per second
-
-    Returns:
-        The filepath of the created file
-    """
-    if not frames:
-        return filepath
-
-    height, width = frames[0].shape[:2]
-
-    # Create container for MPEG-TS format
-    container = av.open(filepath, mode='w', format='mpegts')
-
-    # Add video stream with optimized settings for streaming
-    stream = container.add_stream('h264', rate=fps)
-    stream.width = width
-    stream.height = height
-    stream.pix_fmt = 'yuv420p'
-
-    # Optimize for low latency streaming
-    stream.options = {
-        'preset': 'ultrafast',
-        'tune': 'zerolatency',
-        'crf': '23',
-        'profile': 'baseline',
-        'level': '3.0'
-    }
-
-    try:
-        for frame_np in frames:
-            frame = av.VideoFrame.from_ndarray(frame_np, format='rgb24')
-            frame = frame.reformat(format=stream.pix_fmt)
-            for packet in stream.encode(frame):
-                container.mux(packet)
-
-        for packet in stream.encode():
-            container.mux(packet)
-
-    finally:
-        container.close()
-
-    return filepath
-
-def initialize_vae_decoder(use_taehv=False, use_trt=False):
-    if use_trt:
-        from demo_utils.vae import VAETRTWrapper
-        print("Initializing TensorRT VAE Decoder...")
-        vae_decoder = VAETRTWrapper()
-        APP_STATE["current_use_taehv"] = False
-    elif use_taehv:
-        print("Initializing TAEHV VAE Decoder...")
-        from demo_utils.taehv import TAEHV
-        taehv_checkpoint_path = "checkpoints/taew2_1.pth"
-        if not os.path.exists(taehv_checkpoint_path):
-            print(f"Downloading TAEHV checkpoint to {taehv_checkpoint_path}...")
-            os.makedirs("checkpoints", exist_ok=True)
-            download_url = "https://github.com/madebyollin/taehv/raw/main/taew2_1.pth"
-            try:
-                urllib.request.urlretrieve(download_url, taehv_checkpoint_path)
-            except Exception as e:
-                raise RuntimeError(f"Failed to download taew2_1.pth: {e}")
-
-        class DotDict(dict): __getattr__ = dict.get
-
-        class TAEHVDiffusersWrapper(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.dtype = torch.float16
-                self.taehv = TAEHV(checkpoint_path=taehv_checkpoint_path).to(self.dtype)
-                self.config = DotDict(scaling_factor=1.0)
-            def decode(self, latents, return_dict=None):
-                return self.taehv.decode_video(latents, parallel=not LOW_MEMORY).mul_(2).sub_(1)
-
-        vae_decoder = TAEHVDiffusersWrapper()
-        APP_STATE["current_use_taehv"] = True
-    else:
-        print("Initializing Default VAE Decoder...")
-        vae_decoder = VAEDecoderWrapper()
-        try:
-            vae_state_dict = torch.load('wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth', map_location="cpu")
-            decoder_state_dict = {k: v for k, v in vae_state_dict.items() if 'decoder.' in k or 'conv2' in k}
-            vae_decoder.load_state_dict(decoder_state_dict)
-        except FileNotFoundError:
-            print("Warning: Default VAE weights not found.")
-        APP_STATE["current_use_taehv"] = False
-
-    vae_decoder.eval().to(dtype=torch.float16).requires_grad_(False).to(gpu)
-    APP_STATE["current_vae_decoder"] = vae_decoder
-    print(f"✅ VAE decoder initialized: {'TAEHV' if use_taehv else 'Default VAE'}")
-
-# Initialize with default VAE
-initialize_vae_decoder(use_taehv=False, use_trt=args.trt)
-
-pipeline = CausalInferencePipeline(
-    config, device=gpu, generator=transformer, text_encoder=text_encoder,
-    vae=APP_STATE["current_vae_decoder"]
-)
-
-pipeline.to(dtype=torch.float16).to(gpu)
-
-@torch.no_grad()
-def video_generation_handler_streaming(prompt, seed=42, fps=15):
-    """
-    Generator function that yields .ts video chunks using PyAV for streaming.
-    Now optimized for block-based processing.
-    """
-    if seed == -1:
-        seed = random.randint(0, 2**32 - 1)
-
-    print(f"🎬 Starting PyAV streaming: '{prompt}', seed: {seed}")
-
-    # Setup
-    conditional_dict = text_encoder(text_prompts=[prompt])
-    for key, value in conditional_dict.items():
-        conditional_dict[key] = value.to(dtype=torch.float16)
-
-    rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
-
-    vae_cache, latents_cache = None, None
-    if not APP_STATE["current_use_taehv"] and not args.trt:
-        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-
-    num_blocks = 7
-    current_start_frame = 0
-    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
-
-    total_frames_yielded = 0
-
-    # Ensure temp directory exists
-    os.makedirs("gradio_tmp", exist_ok=True)
-
-    # Generation loop
-    for idx, current_num_frames in enumerate(all_num_frames):
-        print(f"📦 Processing block {idx+1}/{num_blocks}")
-
-        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
-
-        # Denoising steps
-        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
-            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
-            _, denoised_pred = pipeline.generator(
-                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
-                timestep=timestep, kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length
-            )
-            if step_idx < len(pipeline.denoising_step_list) - 1:
-                next_timestep = pipeline.denoising_step_list[step_idx + 1]
-                noisy_input = pipeline.scheduler.add_noise(
-                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
-                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
-                ).unflatten(0, denoised_pred.shape[:2])
-
-        if idx < len(all_num_frames) - 1:
-            pipeline.generator(
-                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
-                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length,
-            )
-
-        # Decode to pixels
-        if args.trt:
-            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
-        elif APP_STATE["current_use_taehv"]:
-            if latents_cache is None:
-                latents_cache = denoised_pred
-            else:
-                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
-                latents_cache = denoised_pred[:, -3:]
-            pixels = pipeline.vae.decode(denoised_pred)
-        else:
-            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
-
-        # Handle frame skipping
-        if idx == 0 and not args.trt:
-            pixels = pixels[:, 3:]
-        elif APP_STATE["current_use_taehv"] and idx > 0:
-            pixels = pixels[:, 12:]
-
-        print(f"🔍 DEBUG Block {idx}: Pixels shape after skipping: {pixels.shape}")
-
-        # Process all frames from this block at once
-        all_frames_from_block = []
-        for frame_idx in range(pixels.shape[1]):
-            frame_tensor = pixels[0, frame_idx]
-
-            # Convert to numpy (HWC, RGB, uint8)
-            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
-            frame_np = frame_np.to(torch.uint8).cpu().numpy()
-            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
-
-            all_frames_from_block.append(frame_np)
-            total_frames_yielded += 1
-
-            # Yield status update for each frame (cute tracking!)
-            blocks_completed = idx
-            current_block_progress = (frame_idx + 1) / pixels.shape[1]
-            total_progress = (blocks_completed + current_block_progress) / num_blocks * 100
-
-            # Cap at 100% to avoid going over
-            total_progress = min(total_progress, 100.0)
-
-            frame_status_html = (
-                f"<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; font-family: sans-serif;'>"
-                f"  <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>Generating Video...</p>"
-                f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
-                f"    <div style='width: {total_progress:.1f}%; height: 20px; background-color: #0d6efd; transition: width 0.2s;'></div>"
-                f"  </div>"
-                f"  <p style='margin: 8px 0 0 0; color: #555; font-size: 14px; text-align: right;'>"
-                f"    Block {idx+1}/{num_blocks} | Frame {total_frames_yielded} | {total_progress:.1f}%"
-                f"  </p>"
-                f"</div>"
-            )
-
-            # Yield None for video but update status (frame-by-frame tracking)
-            yield None, frame_status_html
-
-        # Encode entire block as one chunk immediately
-        if all_frames_from_block:
-            print(f"📹 Encoding block {idx} with {len(all_frames_from_block)} frames")
-
-            try:
-                chunk_uuid = str(uuid.uuid4())[:8]
-                ts_filename = f"block_{idx:04d}_{chunk_uuid}.ts"
-                ts_path = os.path.join("gradio_tmp", ts_filename)
-
-                frames_to_ts_file(all_frames_from_block, ts_path, fps)
-
-                # Calculate final progress for this block
-                total_progress = (idx + 1) / num_blocks * 100
-
-                # Yield the actual video chunk
-                yield ts_path, gr.update()
-
-            except Exception as e:
-                print(f"⚠️ Error encoding block {idx}: {e}")
-                import traceback
-                traceback.print_exc()
-
-        current_start_frame += current_num_frames
-
-    # Final completion status
-    final_status_html = (
-        f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
-        f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-        f"    <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
-        f"    <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Stream Complete!</h4>"
-        f"  </div>"
-        f"  <div style='background: rgba(255,255,255,0.7); padding: 8px; border-radius: 4px;'>"
-        f"    <p style='margin: 0; color: #0f5132; font-weight: 500;'>"
-        f"      📊 Generated {total_frames_yielded} frames across {num_blocks} blocks"
-        f"    </p>"
-        f"    <p style='margin: 4px 0 0 0; color: #0f5132; font-size: 14px;'>"
-        f"      🎬 Playback: {fps} FPS • 📁 Format: MPEG-TS/H.264"
-        f"    </p>"
-        f"  </div>"
-        f"</div>"
-    )
-    yield None, final_status_html
-    print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
-
-# --- Gradio UI Layout ---
-with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
-    gr.Markdown("# 🚀 Self-Forcing Video Generation")
-    gr.Markdown("Real-time video generation with distilled Wan2-1 1.3B [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
-
-    with gr.Row():
-        with gr.Column(scale=2):
-            with gr.Group():
-                prompt = gr.Textbox(
-                    label="Prompt",
-                    placeholder="A stylish woman walks down a Tokyo street...",
-                    lines=4,
-                    value=""
-                )
-
-            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
-
-            gr.Markdown("### 🎯 Examples")
-            gr.Examples(
-                examples=[
-                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug.",
-                    "A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
-                    "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
-                ],
-                inputs=[prompt],
-            )
-
-            gr.Markdown("### ⚙️ Settings")
-            with gr.Row():
-                seed = gr.Number(
-                    label="Seed",
-                    value=-1,
-                    info="Use -1 for random seed",
-                    precision=0
-                )
-                fps = gr.Slider(
-                    label="Playback FPS",
-                    minimum=1,
-                    maximum=30,
-                    value=args.fps,
-                    step=1,
-                    visible=False,
-                    info="Frames per second for playback"
-                )
-
-        with gr.Column(scale=3):
-            gr.Markdown("### 📺 Video Stream")
-
-            streaming_video = gr.Video(
-                label="Live Stream",
-                streaming=True,
-                loop=True,
-                height=400,
-                autoplay=True,
-                show_label=False
-            )
-
-            status_display = gr.HTML(
-                value=(
-                    "<div style='text-align: center; padding: 20px; color: #666; border: 1px dashed #ddd; border-radius: 8px;'>"
-                    "🎬 Ready to start streaming...<br>"
-                    "<small>Configure your prompt and click 'Start Streaming'</small>"
-                    "</div>"
-                ),
-                label="Generation Status"
-            )
-
-    # Connect the generator to the streaming video
-    start_btn.click(
-        fn=video_generation_handler_streaming,
-        inputs=[prompt, seed, fps],
-        outputs=[streaming_video, status_display]
-    )
-
-
-# --- Launch App ---
-if __name__ == "__main__":
-    if os.path.exists("gradio_tmp"):
-        import shutil
-        shutil.rmtree("gradio_tmp")
-    os.makedirs("gradio_tmp", exist_ok=True)
-
-    print("🚀 Starting Self-Forcing Streaming Demo")
-    print(f"📁 Temporary files will be stored in: gradio_tmp/")
-    print(f"🎯 Chunk encoding: PyAV (MPEG-TS/H.264)")
-    print(f"⚡ GPU acceleration: {gpu}")
-
-    demo.queue().launch(
-        server_name=args.host,
-        server_port=args.port,
-        share=args.share,
-        show_error=True,
-        max_threads=40,
-        mcp_server=True
-    )
demo.py
DELETED
@@ -1,631 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Demo for Self-Forcing.
|
3 |
-
"""
|
4 |
-
|
5 |
-
import os
|
6 |
-
import re
|
7 |
-
import random
|
8 |
-
import time
|
9 |
-
import base64
|
10 |
-
import argparse
|
11 |
-
import hashlib
|
12 |
-
import subprocess
|
13 |
-
import urllib.request
|
14 |
-
from io import BytesIO
|
15 |
-
from PIL import Image
|
16 |
-
import numpy as np
|
17 |
-
import torch
|
18 |
-
from omegaconf import OmegaConf
|
19 |
-
from flask import Flask, render_template, jsonify
|
20 |
-
from flask_socketio import SocketIO, emit
|
21 |
-
import queue
|
22 |
-
from threading import Thread, Event
|
23 |
-
|
24 |
-
from pipeline import CausalInferencePipeline
|
25 |
-
from demo_utils.constant import ZERO_VAE_CACHE
|
26 |
-
from demo_utils.vae_block3 import VAEDecoderWrapper
|
27 |
-
from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
|
28 |
-
from demo_utils.utils import generate_timestamp
|
29 |
-
from demo_utils.memory import gpu, get_cuda_free_memory_gb, DynamicSwapInstaller, move_model_to_device_with_memory_preservation
|
30 |
-
|
31 |
-
# Parse arguments
|
32 |
-
parser = argparse.ArgumentParser()
|
33 |
-
parser.add_argument('--port', type=int, default=5001)
|
34 |
-
parser.add_argument('--host', type=str, default='0.0.0.0')
|
35 |
-
parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt')
|
36 |
-
parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml')
|
37 |
-
parser.add_argument('--trt', action='store_true')
|
38 |
-
args = parser.parse_args()
|
39 |
-
|
40 |
-
print(f'Free VRAM {get_cuda_free_memory_gb(gpu)} GB')
|
41 |
-
low_memory = get_cuda_free_memory_gb(gpu) < 40
|
42 |
-
|
43 |
-
# Load models
|
44 |
-
config = OmegaConf.load(args.config_path)
|
45 |
-
default_config = OmegaConf.load("configs/default_config.yaml")
|
46 |
-
config = OmegaConf.merge(default_config, config)
|
47 |
-
|
48 |
-
text_encoder = WanTextEncoder()
|
49 |
-
|
50 |
-
# Global variables for dynamic model switching
|
51 |
-
current_vae_decoder = None
|
52 |
-
current_use_taehv = False
|
53 |
-
fp8_applied = False
|
54 |
-
torch_compile_applied = False
|
55 |
-
global frame_number
|
56 |
-
frame_number = 0
|
57 |
-
anim_name = ""
|
58 |
-
frame_rate = 6
|
59 |
-
|
60 |
-
def initialize_vae_decoder(use_taehv=False, use_trt=False):
|
61 |
-
"""Initialize VAE decoder based on the selected option"""
|
62 |
-
global current_vae_decoder, current_use_taehv
|
63 |
-
|
64 |
-
if use_trt:
|
65 |
-
from demo_utils.vae import VAETRTWrapper
|
66 |
-
current_vae_decoder = VAETRTWrapper()
|
67 |
-
return current_vae_decoder
|
68 |
-
|
69 |
-
if use_taehv:
|
70 |
-
from demo_utils.taehv import TAEHV
|
71 |
-
# Check if taew2_1.pth exists in checkpoints folder, download if missing
|
72 |
-
taehv_checkpoint_path = "checkpoints/taew2_1.pth"
|
73 |
-
if not os.path.exists(taehv_checkpoint_path):
|
74 |
-
print(f"taew2_1.pth not found in checkpoints folder {taehv_checkpoint_path}. Downloading...")
|
75 |
-
os.makedirs("checkpoints", exist_ok=True)
|
76 |
-
download_url = "https://github.com/madebyollin/taehv/raw/main/taew2_1.pth"
|
77 |
-
try:
|
78 |
-
urllib.request.urlretrieve(download_url, taehv_checkpoint_path)
|
79 |
-
print(f"Successfully downloaded taew2_1.pth to {taehv_checkpoint_path}")
|
80 |
-
except Exception as e:
|
81 |
-
print(f"Failed to download taew2_1.pth: {e}")
|
82 |
-
raise
|
83 |
-
|
84 |
-
class DotDict(dict):
|
85 |
-
__getattr__ = dict.__getitem__
|
86 |
-
__setattr__ = dict.__setitem__
|
87 |
-
|
88 |
-
class TAEHVDiffusersWrapper(torch.nn.Module):
|
89 |
-
def __init__(self):
|
90 |
-
super().__init__()
|
91 |
-
self.dtype = torch.float16
|
92 |
-
self.taehv = TAEHV(checkpoint_path=taehv_checkpoint_path).to(self.dtype)
|
93 |
-
self.config = DotDict(scaling_factor=1.0)
|
94 |
-
|
95 |
-
def decode(self, latents, return_dict=None):
|
96 |
-
# n, c, t, h, w = latents.shape
|
97 |
-
# low-memory, set parallel=True for faster + higher memory
|
98 |
-
return self.taehv.decode_video(latents, parallel=False).mul_(2).sub_(1)
|
99 |
-
|
100 |
-
current_vae_decoder = TAEHVDiffusersWrapper()
|
101 |
-
else:
|
102 |
-
current_vae_decoder = VAEDecoderWrapper()
|
103 |
-
vae_state_dict = torch.load('wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth', map_location="cpu")
|
104 |
-
decoder_state_dict = {}
|
105 |
-
for key, value in vae_state_dict.items():
|
106 |
-
if 'decoder.' in key or 'conv2' in key:
|
107 |
-
decoder_state_dict[key] = value
|
108 |
-
current_vae_decoder.load_state_dict(decoder_state_dict)
|
109 |
-
|
110 |
-
current_vae_decoder.eval()
|
111 |
-
current_vae_decoder.to(dtype=torch.float16)
|
112 |
-
current_vae_decoder.requires_grad_(False)
|
113 |
-
current_vae_decoder.to(gpu)
|
114 |
-
current_use_taehv = use_taehv
|
115 |
-
|
116 |
-
print(f"✅ VAE decoder initialized with {'TAEHV' if use_taehv else 'default VAE'}")
|
117 |
-
return current_vae_decoder
|
118 |
-
|
119 |
-
|
120 |
-
# Initialize with default VAE
|
121 |
-
vae_decoder = initialize_vae_decoder(use_taehv=False, use_trt=args.trt)
|
122 |
-
|
123 |
-
transformer = WanDiffusionWrapper(is_causal=True)
|
124 |
-
state_dict = torch.load(args.checkpoint_path, map_location="cpu")
|
125 |
-
transformer.load_state_dict(state_dict['generator_ema'])
|
126 |
-
|
127 |
-
text_encoder.eval()
|
128 |
-
transformer.eval()
|
129 |
-
|
130 |
-
transformer.to(dtype=torch.float16)
|
131 |
-
text_encoder.to(dtype=torch.bfloat16)
|
132 |
-
|
133 |
-
text_encoder.requires_grad_(False)
|
134 |
-
transformer.requires_grad_(False)
|
135 |
-
|
136 |
-
pipeline = CausalInferencePipeline(
|
137 |
-
config,
|
138 |
-
device=gpu,
|
139 |
-
generator=transformer,
|
140 |
-
text_encoder=text_encoder,
|
141 |
-
vae=vae_decoder
|
142 |
-
)
|
143 |
-
|
144 |
-
if low_memory:
|
145 |
-
DynamicSwapInstaller.install_model(text_encoder, device=gpu)
|
146 |
-
else:
|
147 |
-
text_encoder.to(gpu)
|
148 |
-
transformer.to(gpu)
|
149 |
-
|
150 |
-
# Flask and SocketIO setup
|
151 |
-
app = Flask(__name__)
|
152 |
-
app.config['SECRET_KEY'] = 'frontend_buffered_demo'
|
153 |
-
socketio = SocketIO(app, cors_allowed_origins="*")
|
154 |
-
|
155 |
-
generation_active = False
|
156 |
-
stop_event = Event()
|
157 |
-
frame_send_queue = queue.Queue()
|
158 |
-
sender_thread = None
|
159 |
-
models_compiled = False
|
160 |
-
|
161 |
-
|
162 |
-
def tensor_to_base64_frame(frame_tensor):
|
163 |
-
"""Convert a single frame tensor to base64 image string."""
|
164 |
-
global frame_number, anim_name
|
165 |
-
# Clamp and normalize to 0-255
|
166 |
-
frame = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
|
167 |
-
frame = frame.to(torch.uint8).cpu().numpy()
|
168 |
-
|
169 |
-
# CHW -> HWC
|
170 |
-
if len(frame.shape) == 3:
|
171 |
-
frame = np.transpose(frame, (1, 2, 0))
|
172 |
-
|
173 |
-
# Convert to PIL Image
|
174 |
-
if frame.shape[2] == 3: # RGB
|
175 |
-
image = Image.fromarray(frame, 'RGB')
|
176 |
-
else: # Handle other formats
|
177 |
-
image = Image.fromarray(frame)
|
178 |
-
|
179 |
-
# Convert to base64
|
180 |
-
buffer = BytesIO()
|
181 |
-
image.save(buffer, format='JPEG', quality=100)
|
182 |
-
if not os.path.exists("./images/%s" % anim_name):
|
183 |
-
os.makedirs("./images/%s" % anim_name)
|
184 |
-
frame_number += 1
|
185 |
-
image.save("./images/%s/%s_%03d.jpg" % (anim_name, anim_name, frame_number))
|
186 |
-
img_str = base64.b64encode(buffer.getvalue()).decode()
|
187 |
-
return f"data:image/jpeg;base64,{img_str}"
|
188 |
-
|
189 |
-
|
190 |
-
def frame_sender_worker():
|
191 |
-
"""Background thread that processes frame send queue non-blocking."""
|
192 |
-
global frame_send_queue, generation_active, stop_event
|
193 |
-
|
194 |
-
print("📡 Frame sender thread started")
|
195 |
-
|
196 |
-
while True:
|
197 |
-
frame_data = None
|
198 |
-
try:
|
199 |
-
# Get frame data from queue
|
200 |
-
frame_data = frame_send_queue.get(timeout=1.0)
|
201 |
-
|
202 |
-
if frame_data is None: # Shutdown signal
|
203 |
-
frame_send_queue.task_done() # Mark shutdown signal as done
|
204 |
-
break
|
205 |
-
|
206 |
-
frame_tensor, frame_index, block_index, job_id = frame_data
|
207 |
-
|
208 |
-
# Convert tensor to base64
|
209 |
-
base64_frame = tensor_to_base64_frame(frame_tensor)
|
210 |
-
|
211 |
-
# Send via SocketIO
|
212 |
-
try:
|
213 |
-
socketio.emit('frame_ready', {
|
214 |
-
'data': base64_frame,
|
215 |
-
'frame_index': frame_index,
|
216 |
-
'block_index': block_index,
|
217 |
-
'job_id': job_id
|
218 |
-
})
|
219 |
-
except Exception as e:
|
220 |
-
print(f"⚠️ Failed to send frame {frame_index}: {e}")
|
221 |
-
|
222 |
-
frame_send_queue.task_done()
|
223 |
-
|
224 |
-
except queue.Empty:
|
225 |
-
# Check if we should continue running
|
226 |
-
if not generation_active and frame_send_queue.empty():
|
227 |
-
break
|
228 |
-
except Exception as e:
|
229 |
-
print(f"❌ Frame sender error: {e}")
|
230 |
-
# Make sure to mark task as done even if there's an error
|
231 |
-
if frame_data is not None:
|
232 |
-
try:
|
233 |
-
frame_send_queue.task_done()
|
234 |
-
except Exception as e:
|
235 |
-
print(f"❌ Failed to mark frame task as done: {e}")
|
236 |
-
break
|
237 |
-
|
238 |
-
print("📡 Frame sender thread stopped")
|
239 |
-
|
240 |
-
|
241 |
-
@torch.no_grad()
def generate_video_stream(prompt, seed, enable_torch_compile=False, enable_fp8=False, use_taehv=False):
    """Generate video and push frames immediately to the frontend."""
    global generation_active, stop_event, frame_send_queue, sender_thread, models_compiled, torch_compile_applied, fp8_applied, current_vae_decoder, current_use_taehv, frame_rate, anim_name

    try:
        generation_active = True
        stop_event.clear()
        job_id = generate_timestamp()

        # Start the frame sender thread if not already running
        if sender_thread is None or not sender_thread.is_alive():
            sender_thread = Thread(target=frame_sender_worker, daemon=True)
            sender_thread.start()

        # Emit progress updates
        def emit_progress(message, progress):
            try:
                socketio.emit('progress', {
                    'message': message,
                    'progress': progress,
                    'job_id': job_id
                })
            except Exception as e:
                print(f"❌ Failed to emit progress: {e}")

        emit_progress('Starting generation...', 0)

        # Handle VAE decoder switching
        if use_taehv != current_use_taehv:
            emit_progress('Switching VAE decoder...', 2)
            print(f"🔄 Switching VAE decoder to {'TAEHV' if use_taehv else 'default VAE'}")
            current_vae_decoder = initialize_vae_decoder(use_taehv=use_taehv)
            # Update the pipeline with the new VAE decoder
            pipeline.vae = current_vae_decoder

        # Handle FP8 quantization
        if enable_fp8 and not fp8_applied:
            emit_progress('Applying FP8 quantization...', 3)
            print("🔧 Applying FP8 quantization to transformer")
            from torchao.quantization.quant_api import quantize_, Float8DynamicActivationFloat8WeightConfig, PerTensor
            quantize_(transformer, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()))
            fp8_applied = True

        # Text encoding
        emit_progress('Encoding text prompt...', 8)
        conditional_dict = text_encoder(text_prompts=[prompt])
        for key, value in conditional_dict.items():
            conditional_dict[key] = value.to(dtype=torch.float16)
        if low_memory:
            gpu_memory_preservation = get_cuda_free_memory_gb(gpu) + 5
            move_model_to_device_with_memory_preservation(
                text_encoder, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

        # Handle torch.compile if enabled
        torch_compile_applied = enable_torch_compile
        if enable_torch_compile and not models_compiled:
            # Compile transformer and decoder
            transformer.compile(mode="max-autotune-no-cudagraphs")
            if not current_use_taehv and not low_memory and not args.trt:
                current_vae_decoder.compile(mode="max-autotune-no-cudagraphs")

        # Initialize generation
        emit_progress('Initializing generation...', 12)

        rnd = torch.Generator(gpu).manual_seed(seed)
        # all_latents = torch.zeros([1, 21, 16, 60, 104], device=gpu, dtype=torch.bfloat16)

        pipeline._initialize_kv_cache(batch_size=1, dtype=torch.float16, device=gpu)
        pipeline._initialize_crossattn_cache(batch_size=1, dtype=torch.float16, device=gpu)

        noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)

        # Generation parameters
        num_blocks = 7
        current_start_frame = 0
        num_input_frames = 0
        all_num_frames = [pipeline.num_frame_per_block] * num_blocks
        if current_use_taehv:
            vae_cache = None
        else:
            vae_cache = ZERO_VAE_CACHE
            for i in range(len(vae_cache)):
                vae_cache[i] = vae_cache[i].to(device=gpu, dtype=torch.float16)

        total_frames_sent = 0
        generation_start_time = time.time()

        emit_progress('Generating frames... (frontend handles timing)', 15)

        for idx, current_num_frames in enumerate(all_num_frames):
            if not generation_active or stop_event.is_set():
                break

            progress = int(((idx + 1) / len(all_num_frames)) * 80) + 15

            # Special message for the first block with torch.compile
            if idx == 0 and torch_compile_applied and not models_compiled:
                emit_progress(
                    f'Processing block 1/{len(all_num_frames)} - Compiling models (may take 5-10 minutes)...', progress)
                print(f"🔥 Processing block {idx+1}/{len(all_num_frames)}")
                models_compiled = True
            else:
                emit_progress(f'Processing block {idx+1}/{len(all_num_frames)}...', progress)
                print(f"🔄 Processing block {idx+1}/{len(all_num_frames)}")

            block_start_time = time.time()

            noisy_input = noise[:, current_start_frame - num_input_frames:current_start_frame + current_num_frames - num_input_frames]

            # Denoising loop
            denoising_start = time.time()
            for index, current_timestep in enumerate(pipeline.denoising_step_list):
                if not generation_active or stop_event.is_set():
                    break

                timestep = torch.ones([1, current_num_frames], device=noise.device,
                                      dtype=torch.int64) * current_timestep

                if index < len(pipeline.denoising_step_list) - 1:
                    _, denoised_pred = transformer(
                        noisy_image_or_video=noisy_input,
                        conditional_dict=conditional_dict,
                        timestep=timestep,
                        kv_cache=pipeline.kv_cache1,
                        crossattn_cache=pipeline.crossattn_cache,
                        current_start=current_start_frame * pipeline.frame_seq_length
                    )
                    next_timestep = pipeline.denoising_step_list[index + 1]
                    noisy_input = pipeline.scheduler.add_noise(
                        denoised_pred.flatten(0, 1),
                        torch.randn_like(denoised_pred.flatten(0, 1)),
                        next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
                    ).unflatten(0, denoised_pred.shape[:2])
                else:
                    _, denoised_pred = transformer(
                        noisy_image_or_video=noisy_input,
                        conditional_dict=conditional_dict,
                        timestep=timestep,
                        kv_cache=pipeline.kv_cache1,
                        crossattn_cache=pipeline.crossattn_cache,
                        current_start=current_start_frame * pipeline.frame_seq_length
                    )

            if not generation_active or stop_event.is_set():
                break

            denoising_time = time.time() - denoising_start
            print(f"⚡ Block {idx+1} denoising completed in {denoising_time:.2f}s")

            # Record output
            # all_latents[:, current_start_frame:current_start_frame + current_num_frames] = denoised_pred

            # Update the KV cache for the next block
            if idx != len(all_num_frames) - 1:
                transformer(
                    noisy_image_or_video=denoised_pred,
                    conditional_dict=conditional_dict,
                    timestep=torch.zeros_like(timestep),
                    kv_cache=pipeline.kv_cache1,
                    crossattn_cache=pipeline.crossattn_cache,
                    current_start=current_start_frame * pipeline.frame_seq_length,
                )

            # Decode to pixels and send frames immediately
            print(f"🎨 Decoding block {idx+1} to pixels...")
            decode_start = time.time()
            if args.trt:
                all_current_pixels = []
                for i in range(denoised_pred.shape[1]):
                    is_first_frame = torch.tensor(1.0).cuda().half() if idx == 0 and i == 0 else \
                        torch.tensor(0.0).cuda().half()
                    outputs = vae_decoder.forward(denoised_pred[:, i:i + 1, :, :, :].half(), is_first_frame, *vae_cache)
                    # outputs = vae_decoder.forward(denoised_pred.float(), *vae_cache)
                    current_pixels, vae_cache = outputs[0], outputs[1:]
                    print(current_pixels.max(), current_pixels.min())
                    all_current_pixels.append(current_pixels.clone())
                pixels = torch.cat(all_current_pixels, dim=1)
                if idx == 0:
                    pixels = pixels[:, 3:, :, :, :]  # Skip first 3 frames of first block
            else:
                if current_use_taehv:
                    if vae_cache is None:
                        vae_cache = denoised_pred
                    else:
                        denoised_pred = torch.cat([vae_cache, denoised_pred], dim=1)
                        vae_cache = denoised_pred[:, -3:, :, :, :]
                    pixels = current_vae_decoder.decode(denoised_pred)
                    print(f"denoised_pred shape: {denoised_pred.shape}")
                    print(f"pixels shape: {pixels.shape}")
                    if idx == 0:
                        pixels = pixels[:, 3:, :, :, :]  # Skip first 3 frames of first block
                    else:
                        pixels = pixels[:, 12:, :, :, :]
                else:
                    pixels, vae_cache = current_vae_decoder(denoised_pred.half(), *vae_cache)
                    if idx == 0:
                        pixels = pixels[:, 3:, :, :, :]  # Skip first 3 frames of first block

            decode_time = time.time() - decode_start
            print(f"🎨 Block {idx+1} VAE decoding completed in {decode_time:.2f}s")

            # Queue frames for non-blocking sending
            block_frames = pixels.shape[1]
            print(f"📡 Queueing {block_frames} frames from block {idx+1} for sending...")
            queue_start = time.time()

            for frame_idx in range(block_frames):
                if not generation_active or stop_event.is_set():
                    break

                frame_tensor = pixels[0, frame_idx].cpu()

                # Queue frame data in a non-blocking way
                frame_send_queue.put((frame_tensor, total_frames_sent, idx, job_id))
                total_frames_sent += 1

            queue_time = time.time() - queue_start
            block_time = time.time() - block_start_time
            print(f"✅ Block {idx+1} completed in {block_time:.2f}s ({block_frames} frames queued in {queue_time:.3f}s)")

            current_start_frame += current_num_frames

        generation_time = time.time() - generation_start_time
        print(f"🎉 Generation completed in {generation_time:.2f}s! {total_frames_sent} frames queued for sending")

        # Wait for all frames to be sent before completing
        emit_progress('Waiting for all frames to be sent...', 97)
        print("⏳ Waiting for all frames to be sent...")
        frame_send_queue.join()  # Wait for all queued frames to be processed
        print("✅ All frames sent successfully!")

        generate_mp4_from_images("./images", "./videos/" + anim_name + ".mp4", frame_rate)
        # Final progress update
        emit_progress('Generation complete!', 100)

        try:
            socketio.emit('generation_complete', {
                'message': 'Video generation completed!',
                'total_frames': total_frames_sent,
                'generation_time': f"{generation_time:.2f}s",
                'job_id': job_id
            })
        except Exception as e:
            print(f"❌ Failed to emit generation complete: {e}")

    except Exception as e:
        print(f"❌ Generation failed: {e}")
        try:
            socketio.emit('error', {
                'message': f'Generation failed: {str(e)}',
                'job_id': job_id
            })
        except Exception as e:
            print(f"❌ Failed to emit error: {e}")
    finally:
        generation_active = False
        stop_event.set()

        # Clean up the sender thread
        try:
            frame_send_queue.put(None)
        except Exception as e:
            print(f"❌ Failed to put None in frame_send_queue: {e}")

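As a rough consistency check on the loop above: the noise tensor holds 21 latent frames across 7 blocks, and the first 3 decoded frames are dropped as warm-up. Assuming the Wan VAE's usual 1 + 4·(T−1) latent-to-pixel-frame mapping (an assumption; this file never states it), the totals work out as:

num_latents = 21                          # noise is [1, 21, 16, 60, 104] above
pixel_frames = 1 + 4 * (num_latents - 1)  # assumed temporal upsampling -> 81
sent_frames = pixel_frames - 3            # first 3 frames of block 1 are skipped
print(pixel_frames, sent_frames)          # 81 78
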
def generate_mp4_from_images(image_directory, output_video_path, fps=24):
    """
    Generate an MP4 video from a directory of images ordered alphabetically.

    :param image_directory: Path to the directory containing images.
    :param output_video_path: Path where the output MP4 will be saved.
    :param fps: Frames per second for the output video.
    """
    global anim_name
    # Construct the ffmpeg command
    cmd = [
        'ffmpeg',
        '-framerate', str(fps),
        '-i', os.path.join(image_directory, anim_name + '/' + anim_name + '_%03d.jpg'),  # Adjust the pattern if necessary
        '-c:v', 'libx264',
        '-pix_fmt', 'yuv420p',
        output_video_path
    ]
    try:
        subprocess.run(cmd, check=True)
        print(f"Video saved to {output_video_path}")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")

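For a hypothetical anim_name such as 'sunset_42_ab12cd34ef' and the default fps, the assembled command expands to roughly: ffmpeg -framerate 24 -i ./images/sunset_42_ab12cd34ef/sunset_42_ab12cd34ef_%03d.jpg -c:v libx264 -pix_fmt yuv420p ./videos/sunset_42_ab12cd34ef.mp4. The zero-padded %03d pattern is what ties the per-frame JPEGs on disk to their order in the final MP4.
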
def calculate_sha256(data):
    # Convert data to bytes if it's not already
    if isinstance(data, str):
        data = data.encode()
    # Calculate the SHA-256 hash
    sha256_hash = hashlib.sha256(data).hexdigest()
    return sha256_hash

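As a concrete example, calculate_sha256('hello') returns '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'; the handler below keeps only the first 10 hex characters of such a digest, which is plenty to disambiguate prompts in filenames.
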
# Socket.IO event handlers
@socketio.on('connect')
def handle_connect():
    print('Client connected')
    emit('status', {'message': 'Connected to frontend-buffered demo server'})


@socketio.on('disconnect')
def handle_disconnect():
    print('Client disconnected')


@socketio.on('start_generation')
def handle_start_generation(data):
    global generation_active, frame_number, anim_name, frame_rate

    frame_number = 0
    if generation_active:
        emit('error', {'message': 'Generation already in progress'})
        return

    prompt = data.get('prompt', '')
    if not prompt:
        # Validate the prompt before flipping any generation state
        emit('error', {'message': 'Prompt is required'})
        return

    seed = data.get('seed', -1)
    if seed == -1:
        seed = random.randint(0, 2**32)

    # Extract words up to the first punctuation mark or newline
    words_up_to_punctuation = re.split(r'[^\w\s]', prompt)[0].strip() if prompt else ''
    if not words_up_to_punctuation:
        words_up_to_punctuation = re.split(r'[\n\r]', prompt)[0].strip()

    # Calculate the SHA-256 hash of the entire prompt
    sha256_hash = calculate_sha256(prompt)

    # Build anim_name from the extracted words, the seed, and the first 10 characters of the hash
    anim_name = f"{words_up_to_punctuation[:20]}_{str(seed)}_{sha256_hash[:10]}"

    generation_active = True
    generation_start_time = time.time()
    enable_torch_compile = data.get('enable_torch_compile', False)
    enable_fp8 = data.get('enable_fp8', False)
    use_taehv = data.get('use_taehv', False)
    frame_rate = data.get('fps', 6)

    # Start generation in a background thread
    socketio.start_background_task(generate_video_stream, prompt, seed,
                                   enable_torch_compile, enable_fp8, use_taehv)
    emit('status', {'message': 'Generation started - frames will be sent immediately'})


@socketio.on('stop_generation')
def handle_stop_generation():
    global generation_active, stop_event, frame_send_queue
    generation_active = False
    stop_event.set()

    # Signal the sender thread to stop (processed after any frames already queued)
    try:
        frame_send_queue.put(None)
    except Exception as e:
        print(f"❌ Failed to put None in frame_send_queue: {e}")

    emit('status', {'message': 'Generation stopped'})


# Web routes
@app.route('/')
def index():
    return render_template('demo.html')


@app.route('/api/status')
def api_status():
    return jsonify({
        'generation_active': generation_active,
        'free_vram_gb': get_cuda_free_memory_gb(gpu),
        'fp8_applied': fp8_applied,
        'torch_compile_applied': torch_compile_applied,
        'current_use_taehv': current_use_taehv
    })


if __name__ == '__main__':
    print(f"🚀 Starting demo on http://{args.host}:{args.port}")
    socketio.run(app, host=args.host, port=args.port, debug=False)
utils/wan_wrapper.py
CHANGED
@@ -1,8 +1,14 @@
 import types
 from typing import List, Optional
+import os
 import torch
 from torch import nn
 
+# Configuration for data paths
+DATA_ROOT = os.path.normpath(os.getenv('DATA_ROOT', '.'))
+WAN_MODELS_PATH = os.path.join(DATA_ROOT, 'wan_models')
+OTHER_MODELS_PATH = os.path.join(DATA_ROOT, 'other_models')
+
 from utils.scheduler import SchedulerInterface, FlowMatchScheduler
 from wan.modules.tokenizers import HuggingfaceTokenizer
 from wan.modules.model import WanModel, RegisterTokens, GanAttentionBlock
@@ -22,12 +28,12 @@ class WanTextEncoder(torch.nn.Module):
             device=torch.device('cpu')
         ).eval().requires_grad_(False)
         self.text_encoder.load_state_dict(
-            torch.load("
+            torch.load(os.path.join(WAN_MODELS_PATH, "Wan2.1-T2V-1.3B", "models_t5_umt5-xxl-enc-bf16.pth"),
                        map_location='cpu', weights_only=False)
         )
 
         self.tokenizer = HuggingfaceTokenizer(
-            name="
+            name=os.path.join(WAN_MODELS_PATH, "Wan2.1-T2V-1.3B", "google", "umt5-xxl") + "/", seq_len=512, clean='whitespace')
 
     @property
     def device(self):
@@ -66,7 +72,7 @@ class WanVAEWrapper(torch.nn.Module):
 
         # init model
         self.model = _video_vae(
-            pretrained_path="
+            pretrained_path=os.path.join(WAN_MODELS_PATH, "Wan2.1-T2V-1.3B", "Wan2.1_VAE.pth"),
             z_dim=16,
         ).eval().requires_grad_(False)
 
@@ -125,9 +131,9 @@ class WanDiffusionWrapper(torch.nn.Module):
 
         if is_causal:
             self.model = CausalWanModel.from_pretrained(
-
+                os.path.join(WAN_MODELS_PATH, model_name) + "/", local_attn_size=local_attn_size, sink_size=sink_size)
         else:
-            self.model = WanModel.from_pretrained(
+            self.model = WanModel.from_pretrained(os.path.join(WAN_MODELS_PATH, model_name) + "/")
         self.model.eval()
 
         # For non-causal diffusion, all frames share the same timestep
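The net effect of the utils/wan_wrapper.py change is that every hard-coded model path now resolves under a configurable DATA_ROOT. A quick sanity-check sketch of the new resolution (the '/data' value is only an example; nothing in this commit sets it):

import os

os.environ['DATA_ROOT'] = '/data'  # example value; defaults to '.' when unset
DATA_ROOT = os.path.normpath(os.getenv('DATA_ROOT', '.'))
WAN_MODELS_PATH = os.path.join(DATA_ROOT, 'wan_models')

# Mirrors the path built for WanVAEWrapper above
print(os.path.join(WAN_MODELS_PATH, 'Wan2.1-T2V-1.3B', 'Wan2.1_VAE.pth'))
# -> /data/wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth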