multimodalart (HF Staff) committed
Commit 374f68b · verified · 1 parent: 4fcc110

Update app.py

Files changed (1):
    app.py (+8, -120)
app.py CHANGED
@@ -421,121 +421,6 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
     yield None, final_status_html
     print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
 
-@torch.no_grad()
-@spaces.GPU
-def video_generation_handler_example(prompt, seed=42, fps=15):
-    """
-    Simplified video generation function that returns the final video path.
-    """
-    if seed == -1:
-        seed = random.randint(0, 2**32 - 1)
-
-    print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
-
-    # Encode text prompt
-    print("🔤 Encoding text prompt...")
-    conditional_dict = text_encoder(text_prompts=[prompt])
-    for key, value in conditional_dict.items():
-        conditional_dict[key] = value.to(dtype=torch.float16)
-
-    # Initialize generation
-    rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
-
-    vae_cache, latents_cache = None, None
-    if not APP_STATE["current_use_taehv"] and not args.trt:
-        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-
-    num_blocks = 7
-    current_start_frame = 0
-    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
-    all_frames_for_video = []
-
-    # Generation loop
-    for idx, current_num_frames in enumerate(all_num_frames):
-        print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
-
-        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
-
-        # Denoising steps
-        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
-            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
-            _, denoised_pred = pipeline.generator(
-                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
-                timestep=timestep, kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length
-            )
-            if step_idx < len(pipeline.denoising_step_list) - 1:
-                next_timestep = pipeline.denoising_step_list[step_idx + 1]
-                noisy_input = pipeline.scheduler.add_noise(
-                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
-                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
-                ).unflatten(0, denoised_pred.shape[:2])
-
-        if idx < len(all_num_frames) - 1:
-            pipeline.generator(
-                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
-                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length,
-            )
-
-        # Decode to pixels
-        if args.trt:
-            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
-        elif APP_STATE["current_use_taehv"]:
-            if latents_cache is None:
-                latents_cache = denoised_pred
-            else:
-                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
-                latents_cache = denoised_pred[:, -3:]
-            pixels = pipeline.vae.decode(denoised_pred)
-        else:
-            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
-
-        # Handle frame skipping for first block
-        if idx == 0 and not args.trt:
-            pixels = pixels[:, 3:]
-        elif APP_STATE["current_use_taehv"] and idx > 0:
-            pixels = pixels[:, 12:]
-
-        print(f"📹 Decoded pixels shape: {pixels.shape}")
-
-        # Collect all frames from this block
-        for frame_idx in range(pixels.shape[1]):
-            frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
-
-            # Normalize from [-1, 1] to [0, 255]
-            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
-            frame_np = frame_np.to(torch.uint8).cpu().numpy()
-
-            # Convert from CHW to HWC format (RGB)
-            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
-
-            all_frames_for_video.append(frame_np)
-
-        current_start_frame += current_num_frames
-
-    print(f"✅ Video generation completed! Total frames: {len(all_frames_for_video)}")
-
-    # Save final video
-    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
-    imageio.mimwrite(video_path, all_frames_for_video, fps=fps, quality=8)
-    print(f"✅ Video saved to {video_path}")
-
-    return gr.update(value=video_path)
-
-streaming_video = gr.Video(
-    label="Live Stream",
-    streaming=True,
-    loop=True,
-    height=400,
-    autoplay=True,
-    show_label=False
-)
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation")
@@ -562,10 +447,6 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
                     "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing her skill and dedication.",
                 ],
                 inputs=[prompt],
-                fn=video_generation_handler_example,
-                outputs=[streaming_video],
-                cache_examples="lazy",
-                label="Click any example to generate"
             )
 
             gr.Markdown("### ⚙️ Settings")
@@ -589,7 +470,14 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
         with gr.Column(scale=3):
             gr.Markdown("### 📺 Video Stream")
 
-            streaming_video.render()
+            streaming_video = gr.Video(
+                label="Live Stream",
+                streaming=True,
+                loop=True,
+                height=400,
+                autoplay=True,
+                show_label=False
+            )
 
             status_display = gr.HTML(
                 value=(
 
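
For orientation, here is a minimal, self-contained sketch (not part of the commit) of the pattern this diff moves toward: a gr.Video component with streaming=True created directly inside the layout and fed by a generator handler such as video_generation_handler_streaming, which yields (video_chunk, status_html) pairs. The stub handler body, the extra input components, and the start_btn trigger below are assumptions for illustration only; the real app.py defines a much larger UI and a PyAV-based streaming handler.

import gradio as gr

def video_generation_handler_streaming(prompt, seed=42, fps=15):
    # Stub (assumed) standing in for the real generator, which yields
    # (video_chunk, status_html) pairs as each block of frames is encoded.
    yield None, f"<p>Generating for: {prompt} (seed={seed}, fps={fps})</p>"

with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
    prompt = gr.Textbox(label="Prompt")
    seed = gr.Number(label="Seed", value=-1, precision=0)
    fps = gr.Slider(label="Playback FPS", minimum=1, maximum=30, value=15)
    start_btn = gr.Button("Start Streaming")  # assumed trigger name

    with gr.Column(scale=3):
        gr.Markdown("### 📺 Video Stream")
        # Component created directly in the layout, as in this commit,
        # rather than defined earlier and .render()-ed here.
        streaming_video = gr.Video(
            label="Live Stream",
            streaming=True,
            loop=True,
            height=400,
            autoplay=True,
            show_label=False,
        )
        status_display = gr.HTML()

    # Wiring a generator function as the event handler streams each yielded
    # chunk into the streaming gr.Video and updates the status HTML.
    start_btn.click(
        fn=video_generation_handler_streaming,
        inputs=[prompt, seed, fps],
        outputs=[streaming_video, status_display],
    )

if __name__ == "__main__":
    demo.launch()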