Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -255,170 +255,6 @@ pipeline = CausalInferencePipeline(
|
|
255 |
|
256 |
pipeline.to(dtype=torch.float16).to(gpu)
|
257 |
|
258 |
-
@torch.no_grad()
|
259 |
-
@spaces.GPU
|
260 |
-
@torch.no_grad()
|
261 |
-
@spaces.GPU
|
262 |
-
def video_generation_handler_streaming(prompt, seed=42, fps=15):
|
263 |
-
"""
|
264 |
-
Generator function that yields .ts video chunks using PyAV for streaming.
|
265 |
-
Now optimized for block-based processing.
|
266 |
-
"""
|
267 |
-
if seed == -1:
|
268 |
-
seed = random.randint(0, 2**32 - 1)
|
269 |
-
|
270 |
-
print(f"🎬 Starting PyAV streaming: '{prompt}', seed: {seed}")
|
271 |
-
|
272 |
-
# Setup
|
273 |
-
conditional_dict = text_encoder(text_prompts=[prompt])
|
274 |
-
for key, value in conditional_dict.items():
|
275 |
-
conditional_dict[key] = value.to(dtype=torch.float16)
|
276 |
-
|
277 |
-
rnd = torch.Generator(gpu).manual_seed(int(seed))
|
278 |
-
pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
|
279 |
-
pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
|
280 |
-
noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
|
281 |
-
|
282 |
-
vae_cache, latents_cache = None, None
|
283 |
-
if not APP_STATE["current_use_taehv"] and not args.trt:
|
284 |
-
vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
|
285 |
-
|
286 |
-
num_blocks = 7
|
287 |
-
current_start_frame = 0
|
288 |
-
all_num_frames = [pipeline.num_frame_per_block] * num_blocks
|
289 |
-
|
290 |
-
total_frames_yielded = 0
|
291 |
-
|
292 |
-
# Ensure temp directory exists
|
293 |
-
os.makedirs("gradio_tmp", exist_ok=True)
|
294 |
-
|
295 |
-
# Generation loop
|
296 |
-
for idx, current_num_frames in enumerate(all_num_frames):
|
297 |
-
print(f"📦 Processing block {idx+1}/{num_blocks}")
|
298 |
-
|
299 |
-
noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
|
300 |
-
|
301 |
-
# Denoising steps
|
302 |
-
for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
|
303 |
-
timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
|
304 |
-
_, denoised_pred = pipeline.generator(
|
305 |
-
noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
|
306 |
-
timestep=timestep, kv_cache=pipeline.kv_cache1,
|
307 |
-
crossattn_cache=pipeline.crossattn_cache,
|
308 |
-
current_start=current_start_frame * pipeline.frame_seq_length
|
309 |
-
)
|
310 |
-
if step_idx < len(pipeline.denoising_step_list) - 1:
|
311 |
-
next_timestep = pipeline.denoising_step_list[step_idx + 1]
|
312 |
-
noisy_input = pipeline.scheduler.add_noise(
|
313 |
-
denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
|
314 |
-
next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
|
315 |
-
).unflatten(0, denoised_pred.shape[:2])
|
316 |
-
|
317 |
-
if idx < len(all_num_frames) - 1:
|
318 |
-
pipeline.generator(
|
319 |
-
noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
|
320 |
-
timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
|
321 |
-
crossattn_cache=pipeline.crossattn_cache,
|
322 |
-
current_start=current_start_frame * pipeline.frame_seq_length,
|
323 |
-
)
|
324 |
-
|
325 |
-
# Decode to pixels
|
326 |
-
if args.trt:
|
327 |
-
pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
|
328 |
-
elif APP_STATE["current_use_taehv"]:
|
329 |
-
if latents_cache is None:
|
330 |
-
latents_cache = denoised_pred
|
331 |
-
else:
|
332 |
-
denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
|
333 |
-
latents_cache = denoised_pred[:, -3:]
|
334 |
-
pixels = pipeline.vae.decode(denoised_pred)
|
335 |
-
else:
|
336 |
-
pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
|
337 |
-
|
338 |
-
# Handle frame skipping
|
339 |
-
if idx == 0 and not args.trt:
|
340 |
-
pixels = pixels[:, 3:]
|
341 |
-
elif APP_STATE["current_use_taehv"] and idx > 0:
|
342 |
-
pixels = pixels[:, 12:]
|
343 |
-
|
344 |
-
print(f"🔍 DEBUG Block {idx}: Pixels shape after skipping: {pixels.shape}")
|
345 |
-
|
346 |
-
# Process all frames from this block at once
|
347 |
-
all_frames_from_block = []
|
348 |
-
for frame_idx in range(pixels.shape[1]):
|
349 |
-
frame_tensor = pixels[0, frame_idx]
|
350 |
-
|
351 |
-
# Convert to numpy (HWC, RGB, uint8)
|
352 |
-
frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
|
353 |
-
frame_np = frame_np.to(torch.uint8).cpu().numpy()
|
354 |
-
frame_np = np.transpose(frame_np, (1, 2, 0)) # CHW -> HWC
|
355 |
-
|
356 |
-
all_frames_from_block.append(frame_np)
|
357 |
-
|
358 |
-
# Encode entire block as one chunk immediately
|
359 |
-
if all_frames_from_block:
|
360 |
-
print(f"📹 Encoding block {idx} with {len(all_frames_from_block)} frames")
|
361 |
-
|
362 |
-
try:
|
363 |
-
chunk_uuid = str(uuid.uuid4())[:8]
|
364 |
-
ts_filename = f"block_{idx:04d}_{chunk_uuid}.ts"
|
365 |
-
ts_path = os.path.join("gradio_tmp", ts_filename)
|
366 |
-
|
367 |
-
frames_to_ts_file(all_frames_from_block, ts_path, fps)
|
368 |
-
|
369 |
-
total_frames_yielded += len(all_frames_from_block)
|
370 |
-
|
371 |
-
# Calculate progress
|
372 |
-
total_progress = (idx + 1) / num_blocks * 100
|
373 |
-
|
374 |
-
status_html = (
|
375 |
-
f"<div style='padding: 12px; border: 1px solid #0d6efd; border-radius: 8px; background: linear-gradient(135deg, #f8f9fa, #e3f2fd);'>"
|
376 |
-
f" <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
|
377 |
-
f" <span style='color: #dc3545; font-size: 16px; margin-right: 8px;'>🔴</span>"
|
378 |
-
f" <span style='font-weight: bold; color: #0d6efd;'>Live Streaming</span>"
|
379 |
-
f" </div>"
|
380 |
-
f" <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden; margin: 8px 0;'>"
|
381 |
-
f" <div style='width: {total_progress:.1f}%; height: 20px; background: linear-gradient(90deg, #0d6efd, #6610f2); transition: width 0.3s; display: flex; align-items: center; justify-content: center; color: white; font-size: 12px; font-weight: bold;'>"
|
382 |
-
f" {total_progress:.1f}%"
|
383 |
-
f" </div>"
|
384 |
-
f" </div>"
|
385 |
-
f" <div style='display: flex; justify-content: space-between; font-size: 14px; color: #666;'>"
|
386 |
-
f" <span>Block {idx+1}/{num_blocks}</span>"
|
387 |
-
f" <span>{len(all_frames_from_block)} frames</span>"
|
388 |
-
f" <span>Total: {total_frames_yielded}</span>"
|
389 |
-
f" </div>"
|
390 |
-
f"</div>"
|
391 |
-
)
|
392 |
-
|
393 |
-
yield ts_path, status_html
|
394 |
-
|
395 |
-
except Exception as e:
|
396 |
-
print(f"⚠️ Error encoding block {idx}: {e}")
|
397 |
-
import traceback
|
398 |
-
traceback.print_exc()
|
399 |
-
|
400 |
-
current_start_frame += current_num_frames
|
401 |
-
|
402 |
-
# Final completion status
|
403 |
-
final_status_html = (
|
404 |
-
f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
|
405 |
-
f" <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
|
406 |
-
f" <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
|
407 |
-
f" <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Stream Complete!</h4>"
|
408 |
-
f" </div>"
|
409 |
-
f" <div style='background: rgba(255,255,255,0.7); padding: 8px; border-radius: 4px;'>"
|
410 |
-
f" <p style='margin: 0; color: #0f5132; font-weight: 500;'>"
|
411 |
-
f" 📊 Generated {total_frames_yielded} frames across {num_blocks} blocks"
|
412 |
-
f" </p>"
|
413 |
-
f" <p style='margin: 4px 0 0 0; color: #0f5132; font-size: 14px;'>"
|
414 |
-
f" 🎬 Playback: {fps} FPS • 📁 Format: MPEG-TS/H.264"
|
415 |
-
f" </p>"
|
416 |
-
f" </div>"
|
417 |
-
f"</div>"
|
418 |
-
)
|
419 |
-
|
420 |
-
print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
|
421 |
-
|
422 |
@torch.no_grad()
|
423 |
@spaces.GPU
|
424 |
def video_generation_handler_streaming(prompt, seed=42, fps=15):
|
@@ -695,24 +531,25 @@ def video_generation_handler_example(prompt, seed=42, fps=15):
|
|
695 |
# --- Gradio UI Layout ---
|
696 |
with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
|
697 |
gr.Markdown("# 🚀 Self-Forcing Video Generation with Streaming")
|
698 |
-
gr.Markdown("Real-time video generation
|
699 |
|
700 |
with gr.Row():
|
701 |
with gr.Column(scale=2):
|
702 |
-
gr.
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
|
|
711 |
|
712 |
gr.Markdown("### 🎯 Examples")
|
713 |
gr.Examples(
|
714 |
examples=[
|
715 |
-
"A close-up shot of a ceramic teacup slowly pouring water into a glass mug.
|
716 |
"A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
|
717 |
"A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
|
718 |
],
|
@@ -740,8 +577,6 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
|
|
740 |
visible=False,
|
741 |
info="Frames per second for playback"
|
742 |
)
|
743 |
-
|
744 |
-
start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
|
745 |
|
746 |
with gr.Column(scale=3):
|
747 |
gr.Markdown("### 📺 Live Video Stream")
|
@@ -750,6 +585,7 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
|
|
750 |
streaming_video = gr.Video(
|
751 |
label="Live Stream",
|
752 |
streaming=True,
|
|
|
753 |
height=400,
|
754 |
autoplay=True,
|
755 |
show_label=False
|
|
|
255 |
|
256 |
pipeline.to(dtype=torch.float16).to(gpu)
|
257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
@torch.no_grad()
|
259 |
@spaces.GPU
|
260 |
def video_generation_handler_streaming(prompt, seed=42, fps=15):
|
|
|
531 |
# --- Gradio UI Layout ---
|
532 |
with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
|
533 |
gr.Markdown("# 🚀 Self-Forcing Video Generation with Streaming")
|
534 |
+
gr.Markdown("Real-time video generation [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
|
535 |
|
536 |
with gr.Row():
|
537 |
with gr.Column(scale=2):
|
538 |
+
with gr.Group():
|
539 |
+
prompt = gr.Textbox(
|
540 |
+
label="Prompt",
|
541 |
+
placeholder="A stylish woman walks down a Tokyo street...",
|
542 |
+
lines=4,
|
543 |
+
value=""
|
544 |
+
)
|
545 |
+
enhance_button = gr.Button("✨ Enhance Prompt", variant="secondary")
|
546 |
+
|
547 |
+
start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
|
548 |
|
549 |
gr.Markdown("### 🎯 Examples")
|
550 |
gr.Examples(
|
551 |
examples=[
|
552 |
+
"A close-up shot of a ceramic teacup slowly pouring water into a glass mug.",
|
553 |
"A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
|
554 |
"A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
|
555 |
],
|
|
|
577 |
visible=False,
|
578 |
info="Frames per second for playback"
|
579 |
)
|
|
|
|
|
580 |
|
581 |
with gr.Column(scale=3):
|
582 |
gr.Markdown("### 📺 Live Video Stream")
|
|
|
585 |
streaming_video = gr.Video(
|
586 |
label="Live Stream",
|
587 |
streaming=True,
|
588 |
+
loop=True,
|
589 |
height=400,
|
590 |
autoplay=True,
|
591 |
show_label=False
|