multimodalart (HF Staff) committed · verified
Commit b433294 · 1 Parent(s): 26ef40e

Update app.py

Files changed (1)
  1. app.py +13 -177
app.py CHANGED
@@ -255,170 +255,6 @@ pipeline = CausalInferencePipeline(
 
 pipeline.to(dtype=torch.float16).to(gpu)
 
-@torch.no_grad()
-@spaces.GPU
-@torch.no_grad()
-@spaces.GPU
-def video_generation_handler_streaming(prompt, seed=42, fps=15):
-    """
-    Generator function that yields .ts video chunks using PyAV for streaming.
-    Now optimized for block-based processing.
-    """
-    if seed == -1:
-        seed = random.randint(0, 2**32 - 1)
-
-    print(f"🎬 Starting PyAV streaming: '{prompt}', seed: {seed}")
-
-    # Setup
-    conditional_dict = text_encoder(text_prompts=[prompt])
-    for key, value in conditional_dict.items():
-        conditional_dict[key] = value.to(dtype=torch.float16)
-
-    rnd = torch.Generator(gpu).manual_seed(int(seed))
-    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
-    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
-    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
-
-    vae_cache, latents_cache = None, None
-    if not APP_STATE["current_use_taehv"] and not args.trt:
-        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
-
-    num_blocks = 7
-    current_start_frame = 0
-    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
-
-    total_frames_yielded = 0
-
-    # Ensure temp directory exists
-    os.makedirs("gradio_tmp", exist_ok=True)
-
-    # Generation loop
-    for idx, current_num_frames in enumerate(all_num_frames):
-        print(f"📦 Processing block {idx+1}/{num_blocks}")
-
-        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
-
-        # Denoising steps
-        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
-            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
-            _, denoised_pred = pipeline.generator(
-                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
-                timestep=timestep, kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length
-            )
-            if step_idx < len(pipeline.denoising_step_list) - 1:
-                next_timestep = pipeline.denoising_step_list[step_idx + 1]
-                noisy_input = pipeline.scheduler.add_noise(
-                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
-                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
-                ).unflatten(0, denoised_pred.shape[:2])
-
-        if idx < len(all_num_frames) - 1:
-            pipeline.generator(
-                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
-                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
-                crossattn_cache=pipeline.crossattn_cache,
-                current_start=current_start_frame * pipeline.frame_seq_length,
-            )
-
-        # Decode to pixels
-        if args.trt:
-            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
-        elif APP_STATE["current_use_taehv"]:
-            if latents_cache is None:
-                latents_cache = denoised_pred
-            else:
-                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
-                latents_cache = denoised_pred[:, -3:]
-            pixels = pipeline.vae.decode(denoised_pred)
-        else:
-            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
-
-        # Handle frame skipping
-        if idx == 0 and not args.trt:
-            pixels = pixels[:, 3:]
-        elif APP_STATE["current_use_taehv"] and idx > 0:
-            pixels = pixels[:, 12:]
-
-        print(f"🔍 DEBUG Block {idx}: Pixels shape after skipping: {pixels.shape}")
-
-        # Process all frames from this block at once
-        all_frames_from_block = []
-        for frame_idx in range(pixels.shape[1]):
-            frame_tensor = pixels[0, frame_idx]
-
-            # Convert to numpy (HWC, RGB, uint8)
-            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
-            frame_np = frame_np.to(torch.uint8).cpu().numpy()
-            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
-
-            all_frames_from_block.append(frame_np)
-
-        # Encode entire block as one chunk immediately
-        if all_frames_from_block:
-            print(f"📹 Encoding block {idx} with {len(all_frames_from_block)} frames")
-
-            try:
-                chunk_uuid = str(uuid.uuid4())[:8]
-                ts_filename = f"block_{idx:04d}_{chunk_uuid}.ts"
-                ts_path = os.path.join("gradio_tmp", ts_filename)
-
-                frames_to_ts_file(all_frames_from_block, ts_path, fps)
-
-                total_frames_yielded += len(all_frames_from_block)
-
-                # Calculate progress
-                total_progress = (idx + 1) / num_blocks * 100
-
-                status_html = (
-                    f"<div style='padding: 12px; border: 1px solid #0d6efd; border-radius: 8px; background: linear-gradient(135deg, #f8f9fa, #e3f2fd);'>"
-                    f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-                    f"    <span style='color: #dc3545; font-size: 16px; margin-right: 8px;'>🔴</span>"
-                    f"    <span style='font-weight: bold; color: #0d6efd;'>Live Streaming</span>"
-                    f"  </div>"
-                    f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden; margin: 8px 0;'>"
-                    f"    <div style='width: {total_progress:.1f}%; height: 20px; background: linear-gradient(90deg, #0d6efd, #6610f2); transition: width 0.3s; display: flex; align-items: center; justify-content: center; color: white; font-size: 12px; font-weight: bold;'>"
-                    f"      {total_progress:.1f}%"
-                    f"    </div>"
-                    f"  </div>"
-                    f"  <div style='display: flex; justify-content: space-between; font-size: 14px; color: #666;'>"
-                    f"    <span>Block {idx+1}/{num_blocks}</span>"
-                    f"    <span>{len(all_frames_from_block)} frames</span>"
-                    f"    <span>Total: {total_frames_yielded}</span>"
-                    f"  </div>"
-                    f"</div>"
-                )
-
-                yield ts_path, status_html
-
-            except Exception as e:
-                print(f"⚠️ Error encoding block {idx}: {e}")
-                import traceback
-                traceback.print_exc()
-
-        current_start_frame += current_num_frames
-
-    # Final completion status
-    final_status_html = (
-        f"<div style='padding: 16px; border: 1px solid #198754; background: linear-gradient(135deg, #d1e7dd, #f8f9fa); border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>"
-        f"  <div style='display: flex; align-items: center; margin-bottom: 8px;'>"
-        f"    <span style='font-size: 24px; margin-right: 12px;'>🎉</span>"
-        f"    <h4 style='margin: 0; color: #0f5132; font-size: 18px;'>Stream Complete!</h4>"
-        f"  </div>"
-        f"  <div style='background: rgba(255,255,255,0.7); padding: 8px; border-radius: 4px;'>"
-        f"    <p style='margin: 0; color: #0f5132; font-weight: 500;'>"
-        f"      📊 Generated {total_frames_yielded} frames across {num_blocks} blocks"
-        f"    </p>"
-        f"    <p style='margin: 4px 0 0 0; color: #0f5132; font-size: 14px;'>"
-        f"      🎬 Playback: {fps} FPS • 📁 Format: MPEG-TS/H.264"
-        f"    </p>"
-        f"  </div>"
-        f"</div>"
-    )
-
-    print(f"✅ PyAV streaming complete! {total_frames_yielded} frames across {num_blocks} blocks")
-
 @torch.no_grad()
 @spaces.GPU
 def video_generation_handler_streaming(prompt, seed=42, fps=15):
@@ -695,24 +531,25 @@ def video_generation_handler_example(prompt, seed=42, fps=15):
 # --- Gradio UI Layout ---
 with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation with Streaming")
-    gr.Markdown("Real-time video generation with frame-by-frame streaming using PyAV encoding. [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
+    gr.Markdown("Real-time video generation [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
 
     with gr.Row():
         with gr.Column(scale=2):
-            gr.Markdown("### 📝 Configure Generation")
-            prompt = gr.Textbox(
-                label="Prompt",
-                placeholder="A stylish woman walks down a Tokyo street...",
-                lines=4,
-                value="A close-up shot of a ceramic teacup slowly pouring water into a glass mug."
-            )
-
-            enhance_button = gr.Button("✨ Enhance Prompt", variant="secondary")
+            with gr.Group():
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    placeholder="A stylish woman walks down a Tokyo street...",
+                    lines=4,
+                    value=""
+                )
+                enhance_button = gr.Button("✨ Enhance Prompt", variant="secondary")
+
+            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
 
             gr.Markdown("### 🎯 Examples")
             gr.Examples(
                 examples=[
-                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
+                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug.",
                     "A playful cat is seen playing an electronic guitar, strumming the strings with its front paws. The cat has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The cat's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the cat's face and hands interacting with the guitar.",
                     "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged woman, deftly arranges ingredients on a pristine white plate. Her hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
                 ],
@@ -740,8 +577,6 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
                 visible=False,
                 info="Frames per second for playback"
             )
-
-            start_btn = gr.Button("🎬 Start Streaming", variant="primary", size="lg")
 
         with gr.Column(scale=3):
             gr.Markdown("### 📺 Live Video Stream")
@@ -750,6 +585,7 @@ with gr.Blocks(title="Self-Forcing Streaming Demo") as demo:
             streaming_video = gr.Video(
                 label="Live Stream",
                 streaming=True,
+                loop=True,
                 height=400,
                 autoplay=True,
                 show_label=False
 