Spaces:

multimodalart
/

self-forcing

Running on Zero

App Files Community

multimodalart HF Staff committed on Jun 19

Commit

b433294

verified ·

1 Parent(s): 26ef40e

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -177

app.py CHANGED Viewed

@@ -255,170 +255,6 @@ pipeline = CausalInferencePipeline(
 pipeline.to(dtype=torch.float16).to(gpu)
-@torch.no_grad()
-@spaces.GPU
-@torch.no_grad()
-@spaces.GPU
-def video_generation_handler_streaming(prompt, seed=42, fps=15):
-    """
-    Generator function that yields .ts video chunks using PyAV for streaming.
-    Now optimized for block-based processing.
-    """
-    if seed == -1:
-        seed = random.randint(0, 2**32 - 1)
-    print(f"🎬 Starting PyAV streaming: '{prompt}', seed: {seed}")
-    # Setup
-    conditional_dict = text_encoder(text_prompts=[prompt])
-    for key, value in conditional_dict.items():
-        conditional_dict[key] = value.to(dtype=torch.float16)
-    rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
-    vae_cache, latents_cache = None, None
-    if not APP_STATE["current_use_taehv"] and not args.trt:
-        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-    num_blocks = 7
-    current_start_frame = 0
-    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
-    total_frames_yielded = 0
-    # Ensure temp directory exists
-    os.makedirs("gradio_tmp", exist_ok=True)
-    # Generation loop
-    for idx, current_num_frames in enumerate(all_num_frames):
-        print(f"📦 Processing block {idx+1}/{num_blocks}")
-        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
-        # Denoising steps
-        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
-            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
-            _, denoised_pred = pipeline.generator(
-                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
-                timestep=timestep, kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length
-            )
-            if step_idx < len(pipeline.denoising_step_list) - 1:
-                next_timestep = pipeline.denoising_step_list[step_idx + 1]
-                noisy_input = pipeline.scheduler.add_noise(
-                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
-                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
-                ).unflatten(0, denoised_pred.shape[:2])
-        if idx < len(all_num_frames) - 1:
-            pipeline.generator(
-                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
-                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length,
-            )
-        # Decode to pixels
-        if args.trt:
-            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
-        elif APP_STATE["current_use_taehv"]:
-            if latents_cache is None:
-                latents_cache = denoised_pred
-            else:
-                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
-                latents_cache = denoised_pred[:, -3:]
-            pixels = pipeline.vae.decode(denoised_pred)
-        else:
-            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
-        # Handle frame skipping
-        if idx == 0 and not args.trt:
-            pixels = pixels[:, 3:]
-        elif APP_STATE["current_use_taehv"] and idx > 0:
-            pixels = pixels[:, 12:]
-        print(f"🔍 DEBUG Block {idx}: Pixels shape after skipping: {pixels.shape}")
-        # Process all frames from this block at once
-        all_frames_from_block = []
-        for frame_idx in range(pixels.shape[1]):
-            frame_tensor = pixels[0, frame_idx]
-            # Convert to numpy (HWC, RGB, uint8)
-            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
-            frame_np = frame_np.to(torch.uint8).cpu().numpy()
-            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
-            all_frames_from_block.append(frame_np)
-        # Encode entire block as one chunk immediately
-        if all_frames_from_block:
-            print(f"📹 Encoding block {idx} with {len(all_frames_from_block)} frames")
-            try:
-                chunk_uuid = str(uuid.uuid4())[:8]
-                ts_filename = f"block_{idx:04d}_{chunk_uuid}.ts"
-                ts_path = os.path.join("gradio_tmp", ts_filename)
-                frames_to_ts_file(all_frames_from_block, ts_path, fps)
-                total_frames_yielded += len(all_frames_from_block)
-                # Calculate progress
-                total_progress = (idx + 1) / num_blocks * 100
-                status_html = (
-                    f"<div style='padding: 12px; border: 1px solid #0d6efd; border-radius: 8px; background: linear-gradient(135deg, #f8f9fa, #e3f2fd);'>"
-                    f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-                    f"    <span style='color: #dc3545; font-size: 16px; margin-right: 8px;'>🔴</span>"
-                    f"    <span style='font-weight: bold; color: #0d6efd;'>Live Streaming</span>"
-                    f"  </div>"
-                    f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden; margin: 8px 0;'>"
-                    f"    <div style='width: {total_progress:.1f}%; height: 20px; background: linear-gradient(90deg, #0d6efd, #6610f2); transition: width 0.3s; display: flex; align-items: center; justify-content: center; color: white; font-size: 12px; font-weight: bold;'>"
-                    f"      {total_progress:.1f}%"
-                    f"    </div>"
-                    f"  </div>"
-                    f"  <div style='display: flex; justify-content: space-between; font-size: 14px; color: #666;'>"
-                    f"    <span>Block {idx+1}/{num_blocks}</span>"
-                    f"    <span>{len(all_frames_from_block)} frames</span>"
-                    f"    <span>Total: {total_frames_yielded}</span>"
-                    f"  </div>"
-                    f"</div>"
-                )
-                yield ts_path, status_html
-            except Exception as e:
-                print(f"⚠️ Error encoding block {idx}: {e}")
-                import traceback
-                traceback.print_exc()
-        current_start_frame += current_num_frames
-    # Final completion status
-    final_status_html = (
-        f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
-        f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-        f"    <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
-        f"    <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Stream Complete!</h4>"
-        f"  </div>"
-        f"  <div style='background: rgba(255,255,255,0.7); padding: 8px; border-radius: 4px;'>"
-        f"    <p style='margin: 0; color: #0f5132; font-weight: 500;'>"
-        f"      📊 Generated {total_frames_yielded} frames across {num_blocks} blocks"
-        f"    </p>"
-        f"    <p style='margin: 4px 0 0 0; color: #0f5132; font-size: 14px;'>"
-        f"      🎬 Playback: {fps} FPS • 📁 Format: MPEG-TS/H.264"
-        f"    </p>"
-        f"  </div>"
-        f"</div>"
-    )
-    print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
 @torch.no_grad()
 @spaces.GPU
 def video_generation_handler_streaming(prompt, seed=42, fps=15):
@@ -695,24 +531,25 @@ def video_generation_handler_example(prompt, seed=42, fps=15):
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation with Streaming")
-    gr.Markdown("Real-time video generation with frame-by-frame streaming using PyAV encoding. [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
     with gr.Row():
         with gr.Column(scale=2):
-            gr.Markdown("### 📝 Configure Generation")
-            prompt = gr.Textbox(
-                label="Prompt",
-                placeholder="A stylish woman walks down a Tokyo street...",
-                lines=4,
-                value="A close-up shot of a ceramic teacup slowly pouring water into a glass mug."
-            )
-            enhance_button = gr.Button("✨ Enhance Prompt", variant="secondary")
             gr.Markdown("### 🎯 Examples")
             gr.Examples(
                 examples=[
-                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
                     "A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
                     "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
                 ],
@@ -740,8 +577,6 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
                     visible=False,
                     info="Frames per second for playback"
                 )
-            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
         with gr.Column(scale=3):
             gr.Markdown("### 📺 Live Video Stream")
@@ -750,6 +585,7 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
             streaming_video = gr.Video(
                 label="Live Stream",
                 streaming=True,
                 height=400,
                 autoplay=True,
                 show_label=False

 pipeline.to(dtype=torch.float16).to(gpu)
 @torch.no_grad()
 @spaces.GPU
 def video_generation_handler_streaming(prompt, seed=42, fps=15):
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation with Streaming")
+    gr.Markdown("Real-time video generation [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
     with gr.Row():
         with gr.Column(scale=2):
+            with gr.Group():
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    placeholder="A stylish woman walks down a Tokyo street...",
+                    lines=4,
+                    value=""
+                )
+                enhance_button = gr.Button("✨ Enhance Prompt", variant="secondary")
+            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
             gr.Markdown("### 🎯 Examples")
             gr.Examples(
                 examples=[
+                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug.",
                     "A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
                     "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
                 ],
                     visible=False,
                     info="Frames per second for playback"
                 )
         with gr.Column(scale=3):
             gr.Markdown("### 📺 Live Video Stream")
             streaming_video = gr.Video(
                 label="Live Stream",
                 streaming=True,
+                loop=True,
                 height=400,
                 autoplay=True,
                 show_label=False