multimodalart (HF Staff) committed
Commit 64c9783 · verified · 1 Parent(s): 5609307

Update app.py

Files changed (1): app.py (+129 −22)
app.py CHANGED
@@ -319,25 +319,133 @@ def video_generation_handler(prompt, seed=42, fps=15):
         )
         yield None, None, error_status_html
 
+@torch.no_grad()
+@spaces.GPU
+def video_generation_handler_example(prompt, seed=42, fps=15):
+    """
+    Simplified video generation function that returns the final video path.
+    """
+    if seed == -1:
+        seed = random.randint(0, 2**32 - 1)
+
+    print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
+
+    # Encode text prompt
+    print("🔤 Encoding text prompt...")
+    conditional_dict = text_encoder(text_prompts=[prompt])
+    for key, value in conditional_dict.items():
+        conditional_dict[key] = value.to(dtype=torch.float16)
+
+    # Initialize generation
+    rnd = torch.Generator(gpu).manual_seed(int(seed))
+    pipeline._initialize_kv_cache(1, torch.float16, device=gpu)
+    pipeline._initialize_crossattn_cache(1, torch.float16, device=gpu)
+    noise = torch.randn([1, 21, 16, 60, 104], device=gpu, dtype=torch.float16, generator=rnd)
+
+    vae_cache, latents_cache = None, None
+    if not APP_STATE["current_use_taehv"] and not args.trt:
+        vae_cache = [c.to(device=gpu, dtype=torch.float16) for c in ZERO_VAE_CACHE]
+
+    num_blocks = 7
+    current_start_frame = 0
+    all_num_frames = [pipeline.num_frame_per_block] * num_blocks
+    all_frames_for_video = []
+
+    # Generation loop
+    for idx, current_num_frames in enumerate(all_num_frames):
+        print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
+
+        noisy_input = noise[:, current_start_frame : current_start_frame + current_num_frames]
+
+        # Denoising steps
+        for step_idx, current_timestep in enumerate(pipeline.denoising_step_list):
+            timestep = torch.ones([1, current_num_frames], device=noise.device, dtype=torch.int64) * current_timestep
+            _, denoised_pred = pipeline.generator(
+                noisy_image_or_video=noisy_input, conditional_dict=conditional_dict,
+                timestep=timestep, kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length
+            )
+            if step_idx < len(pipeline.denoising_step_list) - 1:
+                next_timestep = pipeline.denoising_step_list[step_idx + 1]
+                noisy_input = pipeline.scheduler.add_noise(
+                    denoised_pred.flatten(0, 1), torch.randn_like(denoised_pred.flatten(0, 1)),
+                    next_timestep * torch.ones([1 * current_num_frames], device=noise.device, dtype=torch.long)
+                ).unflatten(0, denoised_pred.shape[:2])
+
+        if idx < len(all_num_frames) - 1:
+            pipeline.generator(
+                noisy_image_or_video=denoised_pred, conditional_dict=conditional_dict,
+                timestep=torch.zeros_like(timestep), kv_cache=pipeline.kv_cache1,
+                crossattn_cache=pipeline.crossattn_cache,
+                current_start=current_start_frame * pipeline.frame_seq_length,
+            )
+
+        # Decode to pixels
+        if args.trt:
+            pixels, vae_cache = pipeline.vae.forward(denoised_pred.half(), *vae_cache)
+        elif APP_STATE["current_use_taehv"]:
+            if latents_cache is None:
+                latents_cache = denoised_pred
+            else:
+                denoised_pred = torch.cat([latents_cache, denoised_pred], dim=1)
+                latents_cache = denoised_pred[:, -3:]
+            pixels = pipeline.vae.decode(denoised_pred)
+        else:
+            pixels, vae_cache = pipeline.vae(denoised_pred.half(), *vae_cache)
+
+        # Handle frame skipping for first block
+        if idx == 0 and not args.trt:
+            pixels = pixels[:, 3:]
+        elif APP_STATE["current_use_taehv"] and idx > 0:
+            pixels = pixels[:, 12:]
+
+        print(f"📹 Decoded pixels shape: {pixels.shape}")
+
+        # Collect all frames from this block
+        for frame_idx in range(pixels.shape[1]):
+            frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
+
+            # Normalize from [-1, 1] to [0, 255]
+            frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
+            frame_np = frame_np.to(torch.uint8).cpu().numpy()
+
+            # Convert from CHW to HWC format (RGB)
+            frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
+
+            all_frames_for_video.append(frame_np)
+
+        current_start_frame += current_num_frames
+
+    print(f"✅ Video generation completed! Total frames: {len(all_frames_for_video)}")
+
+    # Save final video
+    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
+    imageio.mimwrite(video_path, all_frames_for_video, fps=fps, quality=8)
+    print(f"✅ Video saved to {video_path}")
+
+    return video_path
+
 # --- Gradio UI Layout ---
 frame_display = gr.Image(
     label="Generated Frames",
     height=480,
     width=832,
     show_label=True,
-    container=True
+    container=True,
+    visible=False
 )
 final_video = gr.Video(
     label="Final Rendered Video",
-    visible=False,
+    visible=True,
     interactive=False,
     height=400,
     autoplay=True
 )
 status_html = gr.HTML(
-    value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
-    label="Generation Status"
-)
+    value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
+    label="Generation Status"
+)
 with gr.Blocks(title="Self-Forcing Frame Streaming Demo") as demo:
     gr.Markdown("# 🚀 Self-Forcing Video Generation with Frame Streaming")
     gr.Markdown("Real-time video generation with frame-by-frame display. [[Model]](https://huggingface.co/gdhe17/Self-Forcing), [[Project page]](https://self-forcing.github.io), [[Paper]](https://huggingface.co/papers/2506.08009)")
 
@@ -345,23 +453,22 @@ with gr.Blocks(title="Self-Forcing Frame Streaming Demo") as demo:
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("### 📝 Configure Generation")
-            with gr.Group():
-                prompt = gr.Textbox(
-                    label="Prompt",
-                    placeholder="A stylish woman walks down a Tokyo street...",
-                    lines=4,
-                )
-                gr.Examples(
-                    examples=[
-                        "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
-                        "A playful capybara is seen playing an electronic guitar, strumming the strings with its front paws. The raccoon has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The raccoon's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the raccoon's face and hands interacting with the guitar.",
-                        "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged man with a neatly trimmed beard and focused expression, deftly arranges ingredients on a pristine white plate. His hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
-                    ],
-                    inputs=[prompt],
-                    fn=video_generation_handler,
-                    outputs=[frame_display, final_video, status_html],
-                    cache_examples="lazy"
-                )
+            prompt = gr.Textbox(
+                label="Prompt",
+                placeholder="A stylish woman walks down a Tokyo street...",
+                lines=4,
+            )
+            gr.Examples(
+                examples=[
+                    "A close-up shot of a ceramic teacup slowly pouring water into a glass mug. The water flows smoothly from the spout of the teacup into the mug, creating gentle ripples as it fills up. Both cups have detailed textures, with the teacup having a matte finish and the glass mug showcasing clear transparency. The background is a blurred kitchen countertop, adding context without distracting from the central action. The pouring motion is fluid and natural, emphasizing the interaction between the two cups.",
+                    "A playful capybara is seen playing an electronic guitar, strumming the strings with its front paws. The raccoon has distinctive black facial markings and a bushy tail. It sits comfortably on a small stool, its body slightly tilted as it focuses intently on the instrument. The setting is a cozy, dimly lit room with vintage posters on the walls, adding a retro vibe. The raccoon's expressive eyes convey a sense of joy and concentration. Medium close-up shot, focusing on the raccoon's face and hands interacting with the guitar.",
+                    "A dynamic over-the-shoulder perspective of a chef meticulously plating a dish in a bustling kitchen. The chef, a middle-aged man with a neatly trimmed beard and focused expression, deftly arranges ingredients on a pristine white plate. His hands move with precision, each gesture deliberate and practiced. The background shows a crowded kitchen with steaming pots, whirring blenders, and the clatter of utensils. Bright lights highlight the scene, casting shadows across the busy workspace. The camera angle captures the chef's detailed work from behind, emphasizing his skill and dedication.",
+                ],
+                inputs=[prompt],
+                fn=video_generation_handler_example,
+                outputs=[frame_display, final_video, status_html],
+                cache_examples="lazy"
+            )
 
     with gr.Row():
         seed = gr.Number(label="Seed", value=-1, info="Use -1 for a random seed.")
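The hunk above also switches gr.Examples from video_generation_handler (a streaming generator that yields frames) to the new video_generation_handler_example, which returns a single video path. That matters for cache_examples="lazy": in recent Gradio releases this caches an example's outputs the first time it is requested, which fits a plain single-return function better than a frame-by-frame stream. A hedged sketch of that wiring pattern in isolation; generate_video is a hypothetical stand-in, not the app's real pipeline.

import gradio as gr

def generate_video(prompt: str) -> str:
    # Hypothetical stand-in for video_generation_handler_example:
    # render the clip and return the saved file path.
    return "gradio_tmp/example.mp4"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    video = gr.Video(label="Final Rendered Video")
    gr.Examples(
        examples=["A stylish woman walks down a Tokyo street..."],
        inputs=[prompt],
        fn=generate_video,        # plain function, so outputs are cacheable
        outputs=[video],
        cache_examples="lazy",    # compute and cache on first request
    )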