multimodalart (HF Staff) committed
Commit 64c9783 · verified · 1 Parent(s): 5609307

Update app.py

Files changed (1): app.py (+129 −22)
app.py CHANGED
@@ -319,25 +319,133 @@ def video_generation_handler(prompt, seed=42, fps=15):
         )
         yield None, None, error_status_html
 
+@torch.no_grad()
+@spaces.GPU
+def video_generation_handler_example(prompt, seed=42, fps=15):
+    """
+    Simplified video generation function that returns the final video path.
+    """
+    if seed == -1:
+        seed = random.randint(0, 2**32 - 1)
+
+    print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
+
+    # Encode text prompt
+    print("🔤 Encoding text prompt...")
+    conditional_dict = text_encoder(text_prompts=[prompt])
+    for key, value in conditional_dict.items():
+        conditional_dict[key] = value.to(dtype=torch.float16)
+
+    # Initialize generation
+    rnd = torch.Generator(gpu).manual_seed(int(seed))
+    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
+    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
+    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
+
+    vae_cache, latents_cache = None, None
+    if not APP_STATE["current_use_taehv"] and not args.trt:
+        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
+
+    num_blocks = 7
+    current_start_frame = 0
+    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
+    all_frames_for_video = []
+
+    # Generation loop
+    for idx, current_num_frames in enumerate(all_num_frames):
+        print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
+
+        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
+
+        # Denoising steps
+        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
+            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
+            _, denoised_pred = pipeline.generator(
+                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
+                timestep=timestep, kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length
+            )
+            if step_idx < len(pipeline.denoising_step_list) - 1:
+                next_timestep = pipeline.denoising_step_list[step_idx + 1]
+                noisy_input = pipeline.scheduler.add_noise(
+                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
+                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
+                ).unflatten(0, denoised_pred.shape[:2])
+
+        if idx < len(all_num_frames) - 1:
+            pipeline.generator(
+                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
+                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length,
+            )
+
+        # Decode to pixels
+        if args.trt:
+            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
+        elif APP_STATE["current_use_taehv"]:
+            if latents_cache is None:
+                latents_cache = denoised_pred
+            else:
+                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
+                latents_cache = denoised_pred[:, -3:]
+            pixels = pipeline.vae.decode(denoised_pred)
+        else:
+            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
+
+        # Handle frame skipping for first block
+        if idx == 0 and not args.trt:
+            pixels = pixels[:, 3:]
+        elif APP_STATE["current_use_taehv"] and idx > 0:
+            pixels = pixels[:, 12:]
+
+        print(f"📹 Decoded pixels shape: {pixels.shape}")
+
+        # Collect all frames from this block
+        for frame_idx in range(pixels.shape[1]):
+            frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
+
+            # Normalize from [-1, 1] to [0, 255]
+            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
+            frame_np = frame_np.to(torch.uint8).cpu().numpy()
+
+            # Convert from CHW to HWC format (RGB)
+            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
+
+            all_frames_for_video.append(frame_np)
+
+        current_start_frame += current_num_frames
+
+    print(f"✅ Video generation completed! Total frames: {len(all_frames_for_video)}")
+
+    # Save final video
+    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
+    imageio.mimwrite(video_path, all_frames_for_video, fps=fps, quality=8)
+    print(f"✅ Video saved to {video_path}")
+
+    return video_path
+
 # --- Gradio UI Layout ---
 frame_display = gr.Image(
     label="Generated Frames",
     height=480,
     width=832,
     show_label=True,
-    container=True
+    container=True,
+    visible=False
 )
 final_video = gr.Video(
     label="Final Rendered Video",
-    visible=False,
+    visible=True,
     interactive=False,
     height=400,
     autoplay=True
 )
 status_html = gr.HTML(
-    value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
-    label="Generation Status"
-)
+    value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
+    label="Generation Status"
+)
 with gr.Blocks(title="Self-Forcing Frame Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation with Frame Streaming")
     gr.Markdown("Real-time video generation with frame-by-frame display. [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
 
@@ -345,23 +453,22 @@ with gr.Blocks(title="Self-Forcing Frame Streaming Demo") as demo:
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### 📝 Configure Generation")
-            with gr.Group():
-                prompt = gr.Textbox(
-                    label="Prompt",
-                    placeholder="A stylish woman walks down a Tokyo street...",
-                    lines=4,
-                )
-                gr.Examples(
-                    examples=[
-                        "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
-                        "A playful capybara is seen playing an electronic guitar, strumming the strings with its front paws. The raccoon has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The raccoon's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the raccoon's face and hands interacting with the guitar.",
-                        "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged man with a neatly trimmed beard and focused expression, deftly arranges ingredients on a pristine white plate. His hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
-                    ],
-                    inputs=[prompt],
-                    fn=video_generation_handler,
-                    outputs=[frame_display, final_video, status_html],
-                    cache_examples="lazy"
-                )
+            prompt = gr.Textbox(
+                label="Prompt",
+                placeholder="A stylish woman walks down a Tokyo street...",
+                lines=4,
+            )
+            gr.Examples(
+                examples=[
+                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
+                    "A playful capybara is seen playing an electronic guitar, strumming the strings with its front paws. The raccoon has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The raccoon's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the raccoon's face and hands interacting with the guitar.",
+                    "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged man with a neatly trimmed beard and focused expression, deftly arranges ingredients on a pristine white plate. His hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
+                ],
+                inputs=[prompt],
+                fn=video_generation_handler_example,
+                outputs=[frame_display, final_video, status_html],
+                cache_examples="lazy"
+            )
 
     with gr.Row():
         seed = gr.Number(label="Seed", value=-1, info="Use -1 for a random seed.")
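The hunk above also switches gr.Examples from video_generation_handler (a streaming generator that yields frames) to the new video_generation_handler_example, which returns a single video path. That matters for cache_examples="lazy": in recent Gradio releases this caches an example's outputs the first time it is requested, which fits a plain single-return function better than a frame-by-frame stream. A hedged sketch of that wiring pattern in isolation; generate_video is a hypothetical stand-in, not the app's real pipeline.

import gradio as gr

def generate_video(prompt: str) -> str:
    # Hypothetical stand-in for video_generation_handler_example:
    # render the clip and return the saved file path.
    return "gradio_tmp/example.mp4"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    video = gr.Video(label="Final Rendered Video")
    gr.Examples(
        examples=["A stylish woman walks down a Tokyo street..."],
        inputs=[prompt],
        fn=generate_video,        # plain function, so outputs are cacheable
        outputs=[video],
        cache_examples="lazy",    # compute and cache on first request
    )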