multimodalart (HF Staff) committed
Commit 374f68b · verified · 1 parent: 4fcc110

Update app.py

Files changed (1):
    app.py (+8, -120)
app.py CHANGED
@@ -421,121 +421,6 @@ def video_generation_handler_streaming(prompt, seed=42, fps=15):
     yield None, final_status_html
     print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
 
-@torch.no_grad()
-@spaces.GPU
-def video_generation_handler_example(prompt, seed=42, fps=15):
-    """
-    Simplified video generation function that returns the final video path.
-    """
-    if seed == -1:
-        seed = random.randint(0, 2**32 - 1)
-
-    print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
-
-    # Encode text prompt
-    print("🔤 Encoding text prompt...")
-    conditional_dict = text_encoder(text_prompts=[prompt])
-    for key, value in conditional_dict.items():
-        conditional_dict[key] = value.to(dtype=torch.float16)
-
-    # Initialize generation
-    rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
-
-    vae_cache, latents_cache = None, None
-    if not APP_STATE["current_use_taehv"] and not args.trt:
-        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-
-    num_blocks = 7
-    current_start_frame = 0
-    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
-    all_frames_for_video = []
-
-    # Generation loop
-    for idx, current_num_frames in enumerate(all_num_frames):
-        print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
-
-        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
-
-        # Denoising steps
-        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
-            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
-            _, denoised_pred = pipeline.generator(
-                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
-                timestep=timestep, kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length
-            )
-            if step_idx < len(pipeline.denoising_step_list) - 1:
-                next_timestep = pipeline.denoising_step_list[step_idx + 1]
-                noisy_input = pipeline.scheduler.add_noise(
-                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
-                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
-                ).unflatten(0, denoised_pred.shape[:2])
-
-        if idx < len(all_num_frames) - 1:
-            pipeline.generator(
-                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
-                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length,
-            )
-
-        # Decode to pixels
-        if args.trt:
-            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
-        elif APP_STATE["current_use_taehv"]:
-            if latents_cache is None:
-                latents_cache = denoised_pred
-            else:
-                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
-                latents_cache = denoised_pred[:, -3:]
-            pixels = pipeline.vae.decode(denoised_pred)
-        else:
-            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
-
-        # Handle frame skipping for first block
-        if idx == 0 and not args.trt:
-            pixels = pixels[:, 3:]
-        elif APP_STATE["current_use_taehv"] and idx > 0:
-            pixels = pixels[:, 12:]
-
-        print(f"📹 Decoded pixels shape: {pixels.shape}")
-
-        # Collect all frames from this block
-        for frame_idx in range(pixels.shape[1]):
-            frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
-
-            # Normalize from [-1, 1] to [0, 255]
-            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
-            frame_np = frame_np.to(torch.uint8).cpu().numpy()
-
-            # Convert from CHW to HWC format (RGB)
-            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
-
-            all_frames_for_video.append(frame_np)
-
-        current_start_frame += current_num_frames
-
-    print(f"✅ Video generation completed! Total frames: {len(all_frames_for_video)}")
-
-    # Save final video
-    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
-    imageio.mimwrite(video_path, all_frames_for_video, fps=fps, quality=8)
-    print(f"✅ Video saved to {video_path}")
-
-    return gr.update(value=video_path)
-
-streaming_video = gr.Video(
-    label="Live Stream",
-    streaming=True,
-    loop=True,
-    height=400,
-    autoplay=True,
-    show_label=False
-)
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation")
@@ -562,10 +447,6 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
                     "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing her skill and dedication.",
                 ],
                 inputs=[prompt],
-                fn=video_generation_handler_example,
-                outputs=[streaming_video],
-                cache_examples="lazy",
-                label="Click any example to generate"
             )
 
             gr.Markdown("### ⚙️ Settings")
@@ -589,7 +470,14 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
         with gr.Column(scale=3):
             gr.Markdown("### 📺 Video Stream")
 
-            streaming_video.render()
+            streaming_video = gr.Video(
+                label="Live Stream",
+                streaming=True,
+                loop=True,
+                height=400,
+                autoplay=True,
+                show_label=False
+            )
 
             status_display = gr.HTML(
                 value=(
 
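
For orientation, here is a minimal, self-contained sketch (not part of the commit) of the pattern this diff moves toward: a gr.Video component with streaming=True created directly inside the layout and fed by a generator handler such as video_generation_handler_streaming, which yields (video_chunk, status_html) pairs. The stub handler body, the extra input components, and the start_btn trigger below are assumptions for illustration only; the real app.py defines a much larger UI and a PyAV-based streaming handler.

import gradio as gr

def video_generation_handler_streaming(prompt, seed=42, fps=15):
    # Stub (assumed) standing in for the real generator, which yields
    # (video_chunk, status_html) pairs as each block of frames is encoded.
    yield None, f"<p>Generating for: {prompt} (seed={seed}, fps={fps})</p>"

with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
    prompt = gr.Textbox(label="Prompt")
    seed = gr.Number(label="Seed", value=-1, precision=0)
    fps = gr.Slider(label="Playback FPS", minimum=1, maximum=30, value=15)
    start_btn = gr.Button("Start Streaming")  # assumed trigger name

    with gr.Column(scale=3):
        gr.Markdown("### 📺 Video Stream")
        # Component created directly in the layout, as in this commit,
        # rather than defined earlier and .render()-ed here.
        streaming_video = gr.Video(
            label="Live Stream",
            streaming=True,
            loop=True,
            height=400,
            autoplay=True,
            show_label=False,
        )
        status_display = gr.HTML()

    # Wiring a generator function as the event handler streams each yielded
    # chunk into the streaming gr.Video and updates the status HTML.
    start_btn.click(
        fn=video_generation_handler_streaming,
        inputs=[prompt, seed, fps],
        outputs=[streaming_video, status_display],
    )

if __name__ == "__main__":
    demo.launch()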