kiwhansong commited on
Commit
badacd0
·
1 Parent(s): 006ab10
Files changed (1) hide show
  1. app.py +573 -541
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
  from pathlib import Path
3
  from functools import partial
4
  import spaces
@@ -137,7 +137,7 @@ def any_images_to_short_video(
137
  pbar = CustomProgressBar(
138
  gr.Progress(track_tqdm=True).tqdm(
139
  iterable=None,
140
- desc="Sampling",
141
  total=dfot.sampling_timesteps,
142
  )
143
  )
@@ -200,7 +200,7 @@ def navigate_video(
200
  pbar = CustomProgressBar(
201
  gr.Progress(track_tqdm=True).tqdm(
202
  iterable=None,
203
- desc=f"Predicting next {n_prediction_frames} frames",
204
  total=dfot.sampling_timesteps,
205
  )
206
  )
@@ -408,363 +408,237 @@ def smooth_navigation(
408
  [(image, f"t={i}") for i, image in enumerate(images)],
409
  )
410
 
411
-
412
- # Create the Gradio Blocks
413
- with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
414
- gr.HTML(
415
- """
416
- <style>
417
- [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
418
- font-size: 16px !important;
419
- font-weight: bold;
420
- }
421
- #header-button .button-icon {
422
- margin-right: 8px;
423
- }
424
- #basic-controls {
425
- column-gap: 0px;
426
- }
427
- #basic-controls button {
428
- border: 1px solid #e4e4e7;
429
- }
430
- #basic-controls-tab {
431
- padding: 0px;
432
- }
433
- #advanced-controls-tab {
434
- padding: 0px;
435
- }
436
- </style>
437
- """
438
- )
439
-
440
- gr.Markdown("# Diffusion Forcing Transformer with History Guidance")
441
  gr.Markdown(
442
- "### Official Interactive Demo for [_History-guided Video Diffusion_](https://arxiv.org/abs/2502.06764)"
 
 
 
 
443
  )
444
- with gr.Row():
445
- gr.Button(
446
- value="Website",
447
- link="https://boyuan.space/history-guidance",
448
- icon="https://simpleicons.org/icons/googlechrome.svg",
449
- elem_id="header-button",
450
- )
451
- gr.Button(
452
- value="Paper",
453
- link="https://arxiv.org/abs/2502.06764",
454
- icon="https://simpleicons.org/icons/arxiv.svg",
455
- elem_id="header-button",
456
- )
457
- gr.Button(
458
- value="Code",
459
- link="https://github.com/kwsong0113/diffusion-forcing-transformer",
460
- icon="https://simpleicons.org/icons/github.svg",
461
- elem_id="header-button",
462
- )
463
- gr.Button(
464
- value="Pretrained Models",
465
- link="https://huggingface.co/kiwhansong/DFoT",
466
- icon="https://simpleicons.org/icons/huggingface.svg",
467
- elem_id="header-button",
468
- )
469
-
470
- with gr.Accordion("Troubleshooting: Not Working or Too Slow?", open=False):
471
- gr.Markdown(
472
- """
473
- - Error or Unexpected Results? _Please try again after refreshing the page and ensure you do not click the same button multiple times._
474
- - Performance Issues or No GPU Allocation? _Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota._
475
- """
476
- )
477
-
478
- with gr.Tab("Any # of Images → Short Video", id="task-1"):
479
- gr.Markdown(
480
- """
481
- ## Demo 1: Any Number of Images → Short 2-second Video
482
- > #### _Diffusion Forcing Transformer is a flexible model that can generate videos given variable number of context frames._
483
- """
484
- )
485
-
486
- demo1_stage = gr.State(value="Scene")
487
- demo1_selected_scene_index = gr.State(value=None)
488
- demo1_selected_image_indices = gr.State(value=[])
489
-
490
- @gr.render(
491
- inputs=[
492
- demo1_stage,
493
- demo1_selected_scene_index,
494
- demo1_selected_image_indices,
495
- ]
496
- )
497
- def render_stage(s, scene_idx, image_indices):
498
- match s:
499
- case "Scene":
500
- with gr.Group():
501
- demo1_scene_gallery = gr.Gallery(
502
- height=300,
503
- value=gif_paths,
504
- label="Select a Scene to Generate Video",
505
- columns=[8],
506
- selected_index=scene_idx,
507
- )
508
-
509
- @demo1_scene_gallery.select(
510
- inputs=None, outputs=demo1_selected_scene_index
511
- )
512
- def update_selection(selection: gr.SelectData):
513
- return selection.index
514
-
515
- demo1_scene_select_button = gr.Button(
516
- "Select Scene", variant="primary"
517
- )
518
-
519
- @demo1_scene_select_button.click(
520
- inputs=demo1_selected_scene_index, outputs=demo1_stage
521
- )
522
- def move_to_image_selection(scene_idx: int):
523
- if scene_idx is None:
524
- gr.Warning("Scene not selected!")
525
- return "Scene"
526
- else:
527
- return "Image"
528
-
529
- case "Image":
530
- with gr.Group():
531
- demo1_image_gallery = gr.Gallery(
532
- height=150,
533
- value=[
534
- (image, f"t={i}")
535
- for i, image in enumerate(
536
- prepare_short_gt_video(scene_idx)
537
- )
538
- ],
539
- label="Select Input Images",
540
- columns=[8],
541
- )
542
-
543
- demo1_selector = gr.CheckboxGroup(
544
- label="Select Any Number of Input Images",
545
- info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
546
- choices=[(f"t={i}", i) for i in range(8)],
547
- value=[],
548
- )
549
- demo1_image_select_button = gr.Button(
550
- "Select Input Images", variant="primary"
551
- )
552
-
553
- @demo1_image_select_button.click(
554
- inputs=[demo1_selector],
555
- outputs=[demo1_stage, demo1_selected_image_indices],
556
- )
557
- def generate_video(selected_indices):
558
- if len(selected_indices) == 0:
559
- gr.Warning("Select at least one image!")
560
- return "Image", []
561
- else:
562
- gr.Info('Click "Generate Video" on the left to start generating now!')
563
- return "Generation", selected_indices
564
-
565
- case "Generation":
566
- with gr.Group():
567
- gt_video = prepare_short_gt_video(scene_idx)
568
-
569
- demo1_input_image_gallery = gr.Gallery(
570
- height=150,
571
- value=video_to_gif_and_images(gt_video, image_indices),
572
- label="Input Images",
573
- columns=[9],
574
- )
575
- demo1_generated_gallery = gr.Gallery(
576
- height=150,
577
- value=[],
578
- label="Generated Video",
579
- columns=[9],
580
- )
581
-
582
- demo1_ground_truth_gallery = gr.Gallery(
583
- height=150,
584
- value=video_to_gif_and_images(gt_video, list(range(8))),
585
- label="Ground Truth Video",
586
- columns=[9],
587
- )
588
- with gr.Sidebar():
589
- gr.Markdown("### Sampling Parameters")
590
- demo1_guidance_scale = gr.Slider(
591
- minimum=1,
592
- maximum=6,
593
- value=4,
594
- step=0.5,
595
- label="History Guidance Scale",
596
- info="Without history guidance: 1.0; Recommended: 4.0",
597
- interactive=True,
598
- )
599
- gr.Button("Generate Video", variant="primary").click(
600
- fn=any_images_to_short_video,
601
- inputs=[
602
- demo1_selected_scene_index,
603
- demo1_selected_image_indices,
604
- demo1_guidance_scale,
605
- ],
606
- outputs=demo1_generated_gallery,
607
- )
608
-
609
- with gr.Tab("Single Image → Long Video", id="task-2"):
610
- gr.Markdown(
611
- f"""
612
- ## Demo 2: Single Image → Long {LONG_LENGTH}-second Video
613
- > #### _Diffusion Forcing Transformer, with History Guidance, can generate long videos via sliding window rollouts and temporal super-resolution._
614
  """
615
- )
616
-
617
- demo2_stage = gr.State(value="Selection")
618
- demo2_selected_index = gr.State(value=None)
619
-
620
- @gr.render(inputs=[demo2_stage, demo2_selected_index])
621
- def render_stage(s, idx):
622
- match s:
623
- case "Selection":
624
- with gr.Group():
625
- demo2_image_gallery = gr.Gallery(
626
- height=300,
627
- value=first_frame_list,
628
- label="Select an Image to Animate",
629
- columns=[8],
630
- selected_index=idx,
631
- )
632
-
633
- @demo2_image_gallery.select(
634
- inputs=None, outputs=demo2_selected_index
635
- )
636
- def update_selection(selection: gr.SelectData):
637
- return selection.index
638
-
639
- demo2_select_button = gr.Button(
640
- "Select Input Image", variant="primary"
641
- )
642
-
643
- @demo2_select_button.click(
644
- inputs=demo2_selected_index, outputs=demo2_stage
645
- )
646
- def move_to_generation(idx: int):
647
- if idx is None:
648
- gr.Warning("Image not selected!")
649
- return "Selection"
650
- else:
651
- gr.Info('Click "Generate Video" on the left to start generating now!')
652
- return "Generation"
653
-
654
- case "Generation":
655
- with gr.Row():
656
- gr.Image(
657
- value=first_frame_list[idx],
658
- label="Input Image",
659
- width=256,
660
- height=256,
661
- )
662
- gr.Video(
663
- value=prepare_long_gt_video(idx),
664
- label="Ground Truth Video",
665
- width=256,
666
- height=256,
667
- autoplay=True,
668
- loop=True,
669
- )
670
- demo2_video = gr.Video(
671
- label="Generated Video",
672
- width=256,
673
- height=256,
674
- autoplay=True,
675
- loop=True,
676
- show_share_button=True,
677
- show_download_button=True,
678
- )
679
-
680
- with gr.Sidebar():
681
- gr.Markdown("### Sampling Parameters")
682
 
683
- demo2_guidance_scale = gr.Slider(
684
- minimum=1,
685
- maximum=6,
686
- value=4,
687
- step=0.5,
688
- label="History Guidance Scale",
689
- info="Without history guidance: 1.0; Recommended: 4.0",
690
- interactive=True,
691
- )
692
- demo2_fps = gr.Slider(
693
- minimum=4,
694
- maximum=20,
695
- value=4,
696
- step=1,
697
- label="FPS",
698
- info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
699
- interactive=True,
700
- )
701
- gr.Button("Generate Video", variant="primary").click(
702
- fn=single_image_to_long_video,
703
- inputs=[
704
- demo2_selected_index,
705
- demo2_guidance_scale,
706
- demo2_fps,
707
- ],
708
- outputs=demo2_video,
 
709
  )
710
-
711
- with gr.Tab("Single Image → Endless Video Navigation", id="task-3"):
712
- gr.Markdown(
713
- """
714
- ## Demo 3: Single Image → Extremely Long Video _(Navigate with Your Camera Movements!)_
715
- > #### _History Guidance significantly improves quality and temporal consistency, enabling stable rollouts for extremely long videos._
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
  """
717
- )
718
-
719
- demo3_stage = gr.State(value="Selection")
720
- demo3_selected_index = gr.State(value=None)
721
- demo3_current_video = gr.State(value=None)
722
- demo3_current_poses = gr.State(value=None)
723
-
724
- @gr.render(inputs=[demo3_stage, demo3_selected_index])
725
- def render_stage(s, idx):
726
- match s:
727
- case "Selection":
728
- with gr.Group():
729
- demo3_image_gallery = gr.Gallery(
730
- height=300,
731
- value=first_frame_list,
732
- label="Select an Image to Start Navigation",
733
- columns=[8],
734
- selected_index=idx,
735
- )
736
-
737
- @demo3_image_gallery.select(
738
- inputs=None, outputs=demo3_selected_index
739
- )
740
- def update_selection(selection: gr.SelectData):
741
- return selection.index
742
-
743
- demo3_select_button = gr.Button(
744
- "Select Input Image", variant="primary"
745
- )
746
-
747
- @demo3_select_button.click(
748
- inputs=demo3_selected_index,
749
- outputs=[
750
- demo3_stage,
751
- demo3_current_video,
752
- demo3_current_poses,
753
- ],
754
- )
755
- def move_to_generation(idx: int):
756
- if idx is None:
757
- gr.Warning("Image not selected!")
758
- return "Selection", None, None
759
- else:
760
- gr.Info('Start navigating with the "Let\'s Navigate!" sidebar on the left now!')
761
- return (
762
- "Generation",
763
- video_list[idx][:1],
764
- poses_list[idx][:1],
765
- )
766
 
767
- case "Generation":
 
 
768
  with gr.Row():
769
  demo3_current_view = gr.Image(
770
  value=first_frame_list[idx],
@@ -785,183 +659,86 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
785
  demo3_generated_gallery = gr.Gallery(
786
  value=[],
787
  label="Generated Frames",
788
- columns=[8],
789
  )
790
 
791
- with gr.Sidebar():
792
- gr.Markdown(
793
- """
794
- ### Let's Navigate!
795
  - **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
796
  - **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
797
- - The most suitable history guidance scheme will be automatically selected based on your camera movements.
798
- """
799
- )
800
- with gr.Tab("Basic", elem_id="basic-controls-tab"):
801
- with gr.Group():
802
- gr.Markdown("_**Select a direction to move:**_")
803
- with gr.Row(elem_id="basic-controls"):
804
- gr.Button(
805
- "↰-60°\nTurn",
806
- size="sm",
807
- min_width=0,
808
- variant="primary",
809
- ).click(
810
- fn=partial(
811
- navigate_video,
812
- x_angle=0,
813
- y_angle=-60,
814
- distance=0,
815
- ),
816
- inputs=[
817
- demo3_current_video,
818
- demo3_current_poses,
819
- ],
820
- outputs=[
821
- demo3_current_video,
822
- demo3_current_poses,
823
- demo3_current_view,
824
- demo3_video,
825
- demo3_generated_gallery,
826
- ],
827
- )
828
-
829
- gr.Button(
830
- "↖-30°\nVeer",
831
- size="sm",
832
- min_width=0,
833
- variant="primary",
834
- ).click(
835
- fn=partial(
836
- navigate_video,
837
- x_angle=0,
838
- y_angle=-30,
839
- distance=50,
840
- ),
841
- inputs=[
842
- demo3_current_video,
843
- demo3_current_poses,
844
- ],
845
- outputs=[
846
- demo3_current_video,
847
- demo3_current_poses,
848
- demo3_current_view,
849
- demo3_video,
850
- demo3_generated_gallery,
851
- ],
852
- )
853
-
854
- gr.Button(
855
- "↑0°\nAhead",
856
- size="sm",
857
- min_width=0,
858
- variant="primary",
859
- ).click(
860
- fn=partial(
861
- navigate_video,
862
- x_angle=0,
863
- y_angle=0,
864
- distance=100,
865
- ),
866
- inputs=[
867
- demo3_current_video,
868
- demo3_current_poses,
869
- ],
870
- outputs=[
871
- demo3_current_video,
872
- demo3_current_poses,
873
- demo3_current_view,
874
- demo3_video,
875
- demo3_generated_gallery,
876
- ],
877
- )
878
- gr.Button(
879
- "↗30°\nVeer",
880
- size="sm",
881
- min_width=0,
882
- variant="primary",
883
- ).click(
884
- fn=partial(
885
- navigate_video,
886
- x_angle=0,
887
- y_angle=30,
888
- distance=50,
889
- ),
890
- inputs=[
891
- demo3_current_video,
892
- demo3_current_poses,
893
- ],
894
- outputs=[
895
- demo3_current_video,
896
- demo3_current_poses,
897
- demo3_current_view,
898
- demo3_video,
899
- demo3_generated_gallery,
900
- ],
901
- )
902
- gr.Button(
903
- "↱\n60° Turn",
904
- size="sm",
905
- min_width=0,
906
- variant="primary",
907
- ).click(
908
- fn=partial(
909
- navigate_video,
910
- x_angle=0,
911
- y_angle=60,
912
- distance=0,
913
- ),
914
- inputs=[
915
- demo3_current_video,
916
- demo3_current_poses,
917
- ],
918
- outputs=[
919
- demo3_current_video,
920
- demo3_current_poses,
921
- demo3_current_view,
922
- demo3_video,
923
- demo3_generated_gallery,
924
- ],
925
- )
926
- with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
927
- with gr.Group():
928
- gr.Markdown("_**Select angles and distance:**_")
929
-
930
- demo3_y_angle = gr.Slider(
931
- minimum=-90,
932
- maximum=90,
933
- value=0,
934
- step=10,
935
- label="Horizontal Angle",
936
- interactive=True,
937
- )
938
- demo3_x_angle = gr.Slider(
939
- minimum=-40,
940
- maximum=40,
941
- value=0,
942
- step=10,
943
- label="Vertical Angle",
944
- interactive=True,
945
  )
946
- demo3_distance = gr.Slider(
947
- minimum=0,
948
- maximum=200,
949
- value=100,
950
- step=10,
951
- label="Distance",
952
- interactive=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953
  )
954
 
955
  gr.Button(
956
- "Generate Next Move", variant="primary"
 
 
 
957
  ).click(
958
- fn=navigate_video,
 
 
 
 
 
959
  inputs=[
960
  demo3_current_video,
961
  demo3_current_poses,
962
- demo3_x_angle,
963
- demo3_y_angle,
964
- demo3_distance,
965
  ],
966
  outputs=[
967
  demo3_current_video,
@@ -971,37 +748,93 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
971
  demo3_generated_gallery,
972
  ],
973
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
  with gr.Group():
975
- gr.Markdown("_You can always undo your last move:_")
976
- gr.Button("Undo Last Move", variant="huggingface").click(
977
- fn=undo_navigation,
978
- inputs=[demo3_current_video, demo3_current_poses],
979
- outputs=[
980
- demo3_current_video,
981
- demo3_current_poses,
982
- demo3_current_view,
983
- demo3_video,
984
- demo3_generated_gallery,
985
- ],
986
  )
987
- with gr.Group():
988
- gr.Markdown(
989
- "_At the end, apply temporal super-resolution to obtain a smoother video:_"
 
 
 
 
990
  )
991
- demo3_interpolation_factor = gr.Slider(
992
- minimum=2,
993
- maximum=10,
994
- value=2,
995
- step=1,
996
- label="By a Factor of",
997
  interactive=True,
998
  )
999
- gr.Button("Smooth Out Video", variant="huggingface").click(
1000
- fn=smooth_navigation,
 
 
 
1001
  inputs=[
1002
  demo3_current_video,
1003
  demo3_current_poses,
1004
- demo3_interpolation_factor,
 
 
1005
  ],
1006
  outputs=[
1007
  demo3_current_video,
@@ -1011,7 +844,206 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
1011
  demo3_generated_gallery,
1012
  ],
1013
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1014
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1015
 
1016
  if __name__ == "__main__":
1017
  demo.launch()
 
1
+ from typing import List, Literal
2
  from pathlib import Path
3
  from functools import partial
4
  import spaces
 
137
  pbar = CustomProgressBar(
138
  gr.Progress(track_tqdm=True).tqdm(
139
  iterable=None,
140
+ desc="Sampling with DFoT",
141
  total=dfot.sampling_timesteps,
142
  )
143
  )
 
200
  pbar = CustomProgressBar(
201
  gr.Progress(track_tqdm=True).tqdm(
202
  iterable=None,
203
+ desc=f"Predicting next {n_prediction_frames} frames with DFoT",
204
  total=dfot.sampling_timesteps,
205
  )
206
  )
 
408
  [(image, f"t={i}") for i, image in enumerate(images)],
409
  )
410
 
411
+ def render_demo1(s: Literal["Selection", "Generation"], idx: int, demo1_stage: gr.State, demo1_selected_index: gr.State):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  gr.Markdown(
413
+ f"""
414
+ ## Demo 1: Single Image → Long {LONG_LENGTH}-second Video
415
+ > #### _Diffusion Forcing Transformer can generate long videos via sliding window rollouts and temporal super-resolution._
416
+ """,
417
+ elem_classes=["task-title"]
418
  )
419
+ match s:
420
+ case "Selection":
421
+ with gr.Group():
422
+ demo1_image_gallery = gr.Gallery(
423
+ height=300,
424
+ value=first_frame_list,
425
+ label="Select an Image to Animate",
426
+ columns=[8],
427
+ selected_index=idx,
428
+ allow_preview=False,
429
+ preview=False,
430
+ )
431
+
432
+ @demo1_image_gallery.select(
433
+ inputs=None, outputs=[demo1_stage, demo1_selected_index]
434
+ )
435
+ def move_to_generation(selection: gr.SelectData):
436
+ return "Generation", selection.index
437
+
438
+ case "Generation":
439
+ with gr.Row():
440
+ gr.Image(
441
+ value=first_frame_list[idx],
442
+ label="Input Image",
443
+ width=256,
444
+ height=256,
445
+ )
446
+ gr.Video(
447
+ value=prepare_long_gt_video(idx),
448
+ label="Ground Truth Video",
449
+ width=256,
450
+ height=256,
451
+ autoplay=True,
452
+ loop=True,
453
+ )
454
+ demo1_video = gr.Video(
455
+ label="Generated Video",
456
+ width=256,
457
+ height=256,
458
+ autoplay=True,
459
+ loop=True,
460
+ show_share_button=True,
461
+ show_download_button=True,
462
+ )
463
+
464
+ gr.Markdown("### Generation Controls ↓")
465
+ demo1_guidance_scale = gr.Slider(
466
+ minimum=1,
467
+ maximum=6,
468
+ value=4,
469
+ step=0.5,
470
+ label="History Guidance Scale",
471
+ info="Without history guidance: 1.0; Recommended: 4.0",
472
+ interactive=True,
473
+ )
474
+ demo1_fps = gr.Slider(
475
+ minimum=4,
476
+ maximum=20,
477
+ value=4,
478
+ step=1,
479
+ label="FPS",
480
+ info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
481
+ interactive=True,
482
+ )
483
+ gr.Button("Generate Video", variant="primary").click(
484
+ fn=single_image_to_long_video,
485
+ inputs=[
486
+ demo1_selected_index,
487
+ demo1_guidance_scale,
488
+ demo1_fps,
489
+ ],
490
+ outputs=demo1_video,
491
+ )
492
+
493
+ def render_demo2(s: Literal["Scene", "Image", "Generation"], scene_idx: int, image_indices: List[int], demo2_stage: gr.State, demo2_selected_scene_index: gr.State, demo2_selected_image_indices: gr.State):
494
+ gr.Markdown(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  """
496
+ ## Demo 2: Any Number of Images → Short 2-second Video
497
+ > #### _Diffusion Forcing Transformer is a flexible model that can generate videos given variable number of context frames._
498
+ """,
499
+ elem_classes=["task-title"]
500
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
+ match s:
503
+ case "Scene":
504
+ with gr.Group():
505
+ demo2_scene_gallery = gr.Gallery(
506
+ height=300,
507
+ value=gif_paths,
508
+ label="Select a Scene to Generate Video",
509
+ columns=[8],
510
+ selected_index=scene_idx,
511
+ allow_preview=False,
512
+ preview=False,
513
+ )
514
+
515
+ @demo2_scene_gallery.select(
516
+ inputs=None, outputs=[demo2_stage, demo2_selected_scene_index]
517
+ )
518
+ def move_to_image_selection(selection: gr.SelectData):
519
+ return "Image", selection.index
520
+
521
+ case "Image":
522
+ with gr.Group():
523
+ demo2_image_gallery = gr.Gallery(
524
+ height=150,
525
+ value=[
526
+ (image, f"t={i}")
527
+ for i, image in enumerate(
528
+ prepare_short_gt_video(scene_idx)
529
  )
530
+ ],
531
+ label="Select Input Images",
532
+ columns=[8],
533
+ )
534
+
535
+ demo2_selector = gr.CheckboxGroup(
536
+ label="Select Any Number of Input Images",
537
+ info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
538
+ choices=[(f"t={i}", i) for i in range(8)],
539
+ value=[],
540
+ )
541
+ demo2_image_select_button = gr.Button(
542
+ "Next Step", variant="primary"
543
+ )
544
+
545
+ @demo2_image_select_button.click(
546
+ inputs=[demo2_selector],
547
+ outputs=[demo2_stage, demo2_selected_image_indices],
548
+ )
549
+ def generate_video(selected_indices):
550
+ if len(selected_indices) == 0:
551
+ gr.Warning("Select at least one image!")
552
+ return "Image", []
553
+ else:
554
+ return "Generation", selected_indices
555
+
556
+ case "Generation":
557
+ with gr.Group():
558
+ gt_video = prepare_short_gt_video(scene_idx)
559
+
560
+ demo2_input_image_gallery = gr.Gallery(
561
+ height=150,
562
+ value=video_to_gif_and_images(gt_video, image_indices),
563
+ label="Input Images",
564
+ columns=[9],
565
+ )
566
+ demo2_generated_gallery = gr.Gallery(
567
+ height=150,
568
+ value=[],
569
+ label="Generated Video",
570
+ columns=[9],
571
+ )
572
+
573
+ demo2_ground_truth_gallery = gr.Gallery(
574
+ height=150,
575
+ value=video_to_gif_and_images(gt_video, list(range(8))),
576
+ label="Ground Truth Video",
577
+ columns=[9],
578
+ )
579
+ gr.Markdown("### Generation Controls ↓")
580
+ demo2_guidance_scale = gr.Slider(
581
+ minimum=1,
582
+ maximum=6,
583
+ value=4,
584
+ step=0.5,
585
+ label="History Guidance Scale",
586
+ info="Without history guidance: 1.0; Recommended: 4.0",
587
+ interactive=True,
588
+ )
589
+ gr.Button("Generate Video", variant="primary").click(
590
+ fn=any_images_to_short_video,
591
+ inputs=[
592
+ demo2_selected_scene_index,
593
+ demo2_selected_image_indices,
594
+ demo2_guidance_scale,
595
+ ],
596
+ outputs=demo2_generated_gallery,
597
+ )
598
+
599
+ def render_demo3(
600
+ s: Literal["Selection", "Generation"],
601
+ idx: int,
602
+ demo3_stage: gr.State,
603
+ demo3_selected_index: gr.State,
604
+ demo3_current_video: gr.State,
605
+ demo3_current_poses: gr.State
606
+ ):
607
+ gr.Markdown(
608
  """
609
+ ## Demo 3: Single Image → Extremely Long Video _(Navigate with Your Camera Movements!)_
610
+ > #### _History Guidance significantly improves quality and temporal consistency, enabling stable rollouts for extremely long videos._
611
+ """,
612
+ elem_classes=["task-title"]
613
+ )
614
+ match s:
615
+ case "Selection":
616
+ with gr.Group():
617
+ demo3_image_gallery = gr.Gallery(
618
+ height=300,
619
+ value=first_frame_list,
620
+ label="Select an Image to Start Navigation",
621
+ columns=[8],
622
+ selected_index=idx,
623
+ allow_preview=False,
624
+ preview=False,
625
+ )
626
+
627
+ @demo3_image_gallery.select(
628
+ inputs=None, outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
629
+ )
630
+ def move_to_generation(selection: gr.SelectData):
631
+ idx = selection.index
632
+ return (
633
+ "Generation",
634
+ idx,
635
+ video_list[idx][:1],
636
+ poses_list[idx][:1],
637
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
 
639
+ case "Generation":
640
+ with gr.Row():
641
+ with gr.Column(scale=3):
642
  with gr.Row():
643
  demo3_current_view = gr.Image(
644
  value=first_frame_list[idx],
 
659
  demo3_generated_gallery = gr.Gallery(
660
  value=[],
661
  label="Generated Frames",
662
+ columns=[6],
663
  )
664
 
665
+ with gr.Column():
666
+ gr.Markdown("### Navigation Controls ↓")
667
+ with gr.Accordion("Instructions", open=False):
668
+ gr.Markdown("""
669
  - **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
670
  - **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
671
+ - The most suitable history guidance scheme will be automatically selected based on your camera movements.
672
+ """)
673
+ with gr.Tab("Basic", elem_id="basic-controls-tab"):
674
+ with gr.Group():
675
+ gr.Markdown("_**Select a direction to move:**_")
676
+ with gr.Row(elem_id="basic-controls"):
677
+ gr.Button(
678
+ "↰-60°\nVeer",
679
+ size="sm",
680
+ min_width=0,
681
+ variant="primary",
682
+ ).click(
683
+ fn=partial(
684
+ navigate_video,
685
+ x_angle=0,
686
+ y_angle=-60,
687
+ distance=0,
688
+ ),
689
+ inputs=[
690
+ demo3_current_video,
691
+ demo3_current_poses,
692
+ ],
693
+ outputs=[
694
+ demo3_current_video,
695
+ demo3_current_poses,
696
+ demo3_current_view,
697
+ demo3_video,
698
+ demo3_generated_gallery,
699
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  )
701
+
702
+ gr.Button(
703
+ "↖-30°\nTurn",
704
+ size="sm",
705
+ min_width=0,
706
+ variant="primary",
707
+ ).click(
708
+ fn=partial(
709
+ navigate_video,
710
+ x_angle=0,
711
+ y_angle=-30,
712
+ distance=50,
713
+ ),
714
+ inputs=[
715
+ demo3_current_video,
716
+ demo3_current_poses,
717
+ ],
718
+ outputs=[
719
+ demo3_current_video,
720
+ demo3_current_poses,
721
+ demo3_current_view,
722
+ demo3_video,
723
+ demo3_generated_gallery,
724
+ ],
725
  )
726
 
727
  gr.Button(
728
+ "↑0°\nAhead",
729
+ size="sm",
730
+ min_width=0,
731
+ variant="primary",
732
  ).click(
733
+ fn=partial(
734
+ navigate_video,
735
+ x_angle=0,
736
+ y_angle=0,
737
+ distance=100,
738
+ ),
739
  inputs=[
740
  demo3_current_video,
741
  demo3_current_poses,
 
 
 
742
  ],
743
  outputs=[
744
  demo3_current_video,
 
748
  demo3_generated_gallery,
749
  ],
750
  )
751
+ gr.Button(
752
+ "↗30°\nTurn",
753
+ size="sm",
754
+ min_width=0,
755
+ variant="primary",
756
+ ).click(
757
+ fn=partial(
758
+ navigate_video,
759
+ x_angle=0,
760
+ y_angle=30,
761
+ distance=50,
762
+ ),
763
+ inputs=[
764
+ demo3_current_video,
765
+ demo3_current_poses,
766
+ ],
767
+ outputs=[
768
+ demo3_current_video,
769
+ demo3_current_poses,
770
+ demo3_current_view,
771
+ demo3_video,
772
+ demo3_generated_gallery,
773
+ ],
774
+ )
775
+ gr.Button(
776
+ "↱\n60° Veer",
777
+ size="sm",
778
+ min_width=0,
779
+ variant="primary",
780
+ ).click(
781
+ fn=partial(
782
+ navigate_video,
783
+ x_angle=0,
784
+ y_angle=60,
785
+ distance=0,
786
+ ),
787
+ inputs=[
788
+ demo3_current_video,
789
+ demo3_current_poses,
790
+ ],
791
+ outputs=[
792
+ demo3_current_video,
793
+ demo3_current_poses,
794
+ demo3_current_view,
795
+ demo3_video,
796
+ demo3_generated_gallery,
797
+ ],
798
+ )
799
+ with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
800
  with gr.Group():
801
+ gr.Markdown("_**Select angles and distance:**_")
802
+
803
+ demo3_y_angle = gr.Slider(
804
+ minimum=-90,
805
+ maximum=90,
806
+ value=0,
807
+ step=10,
808
+ label="Horizontal Angle",
809
+ interactive=True,
 
 
810
  )
811
+ demo3_x_angle = gr.Slider(
812
+ minimum=-40,
813
+ maximum=40,
814
+ value=0,
815
+ step=10,
816
+ label="Vertical Angle",
817
+ interactive=True,
818
  )
819
+ demo3_distance = gr.Slider(
820
+ minimum=0,
821
+ maximum=200,
822
+ value=100,
823
+ step=10,
824
+ label="Distance",
825
  interactive=True,
826
  )
827
+
828
+ gr.Button(
829
+ "Generate Next Move", variant="primary"
830
+ ).click(
831
+ fn=navigate_video,
832
  inputs=[
833
  demo3_current_video,
834
  demo3_current_poses,
835
+ demo3_x_angle,
836
+ demo3_y_angle,
837
+ demo3_distance,
838
  ],
839
  outputs=[
840
  demo3_current_video,
 
844
  demo3_generated_gallery,
845
  ],
846
  )
847
+ gr.Markdown("---")
848
+ with gr.Group():
849
+ gr.Markdown("_You can always undo your last move:_")
850
+ gr.Button("Undo Last Move", variant="huggingface").click(
851
+ fn=undo_navigation,
852
+ inputs=[demo3_current_video, demo3_current_poses],
853
+ outputs=[
854
+ demo3_current_video,
855
+ demo3_current_poses,
856
+ demo3_current_view,
857
+ demo3_video,
858
+ demo3_generated_gallery,
859
+ ],
860
+ )
861
+ with gr.Group():
862
+ gr.Markdown(
863
+ "_At the end, apply temporal super-resolution to obtain a smoother video:_"
864
+ )
865
+ demo3_interpolation_factor = gr.Slider(
866
+ minimum=2,
867
+ maximum=10,
868
+ value=2,
869
+ step=1,
870
+ label="By a Factor of",
871
+ interactive=True,
872
+ )
873
+ gr.Button("Smooth Out Video", variant="huggingface").click(
874
+ fn=smooth_navigation,
875
+ inputs=[
876
+ demo3_current_video,
877
+ demo3_current_poses,
878
+ demo3_interpolation_factor,
879
+ ],
880
+ outputs=[
881
+ demo3_current_video,
882
+ demo3_current_poses,
883
+ demo3_current_view,
884
+ demo3_video,
885
+ demo3_generated_gallery,
886
+ ],
887
+ )
888
+
889
+
890
 
891
+ # Create the Gradio Blocks
892
+ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
893
+ gr.HTML(
894
+ """
895
+ <style>
896
+ [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
897
+ font-size: 16px !important;
898
+ font-weight: bold;
899
+ }
900
+ #page-title h1 {
901
+ color: #0D9488 !important;
902
+ }
903
+ .task-title h2 {
904
+ color: #F59E0C !important;
905
+ }
906
+ .header-button-row {
907
+ gap: 4px !important;
908
+ }
909
+ .header-button-row div {
910
+ width: 131.0px !important;
911
+ }
912
+
913
+ .header-button-column {
914
+ width: 131.0px !important;
915
+ gap: 5px !important;
916
+ }
917
+ .header-button a {
918
+ border: 1px solid #e4e4e7;
919
+ }
920
+ .header-button .button-icon {
921
+ margin-right: 8px;
922
+ }
923
+ #basic-controls {
924
+ column-gap: 0px;
925
+ }
926
+ #basic-controls-tab {
927
+ padding: 0px;
928
+ }
929
+ #advanced-controls-tab {
930
+ padding: 0px;
931
+ }
932
+ #selected-demo-button {
933
+ color: #F59E0C;
934
+ text-decoration: underline;
935
+ }
936
+ .demo-button {
937
+ text-align: left !important;
938
+ display: block !important;
939
+ }
940
+ </style>
941
+ """
942
+ )
943
+
944
+ demo_idx = gr.State(value=1)
945
+
946
+ with gr.Sidebar():
947
+ gr.Markdown("# Diffusion Forcing Transformer with History Guidance", elem_id="page-title")
948
+ gr.Markdown(
949
+ "### Official Interactive Demo for [_History-Guided Video Diffusion_](https://arxiv.org/abs/2502.06764)"
950
+ )
951
+ gr.Markdown("---")
952
+ gr.Markdown("#### Links ↓")
953
+ with gr.Row(elem_classes=["header-button-row"]):
954
+ with gr.Column(elem_classes=["header-button-column"], min_width=0):
955
+ gr.Button(
956
+ value="Website",
957
+ link="https://boyuan.space/history-guidance",
958
+ icon="https://simpleicons.org/icons/googlechrome.svg",
959
+ elem_classes=["header-button"],
960
+ size="md",
961
+ min_width=0,
962
+ )
963
+ gr.Button(
964
+ value="Paper",
965
+ link="https://arxiv.org/abs/2502.06764",
966
+ icon="https://simpleicons.org/icons/arxiv.svg",
967
+ elem_classes=["header-button"],
968
+ size="md",
969
+ min_width=0,
970
+ )
971
+ with gr.Column(elem_classes=["header-button-column"], min_width=0):
972
+ gr.Button(
973
+ value="Code",
974
+ link="https://github.com/kwsong0113/diffusion-forcing-transformer",
975
+ icon="https://simpleicons.org/icons/github.svg",
976
+ elem_classes=["header-button"],
977
+ size="md",
978
+ min_width=0,
979
+ )
980
+ gr.Button(
981
+ value="Weights",
982
+ link="https://huggingface.co/kiwhansong/DFoT",
983
+ icon="https://simpleicons.org/icons/huggingface.svg",
984
+ elem_classes=["header-button"],
985
+ size="md",
986
+ min_width=0,
987
+ )
988
+ gr.Markdown("---")
989
+ gr.Markdown("#### Choose a Demo ↓")
990
+ with gr.Group():
991
+ @gr.render(inputs=[demo_idx])
992
+ def render_demo_tabs(idx):
993
+ demo_tab_button1 = gr.Button(
994
+ "1: Image → Long Video",
995
+ size="md", elem_classes=["demo-button"], **{"elem_id": "selected-demo-button"} if idx == 1 else {}
996
+ ).click(
997
+ fn=lambda: 1,
998
+ outputs=demo_idx
999
+ )
1000
+ demo_tab_button2 = gr.Button(
1001
+ "2: Any # of Images → Short Video",
1002
+ size="md", elem_classes=["demo-button"], **{"elem_id": "selected-demo-button"} if idx == 2 else {}
1003
+ ).click(
1004
+ fn=lambda: 2,
1005
+ outputs=demo_idx
1006
+ )
1007
+ demo_tab_button3 = gr.Button(
1008
+ "3: Image → Extremely Long Video",
1009
+ size="md", elem_classes=["demo-button"], **{"elem_id": "selected-demo-button"} if idx == 3 else {}
1010
+ ).click(
1011
+ fn=lambda: 3,
1012
+ outputs=demo_idx
1013
+ )
1014
+ gr.Markdown("---")
1015
+ gr.Markdown("#### Troubleshooting ↓")
1016
+ with gr.Group():
1017
+ with gr.Accordion("Error or Unexpected Results?", open=False):
1018
+ gr.Markdown("Please try again after refreshing the page and ensure you do not click the same button multiple times.")
1019
+ with gr.Accordion("Too Slow or No GPU Allocation?", open=False):
1020
+ gr.Markdown(
1021
+ "Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota."
1022
+ )
1023
+
1024
+ demo1_stage = gr.State(value="Selection")
1025
+ demo1_selected_index = gr.State(value=None)
1026
+ demo2_stage = gr.State(value="Scene")
1027
+ demo2_selected_scene_index = gr.State(value=None)
1028
+ demo2_selected_image_indices = gr.State(value=[])
1029
+ demo3_stage = gr.State(value="Selection")
1030
+ demo3_selected_index = gr.State(value=None)
1031
+ demo3_current_video = gr.State(value=None)
1032
+ demo3_current_poses = gr.State(value=None)
1033
+
1034
+ @gr.render(inputs=[demo_idx, demo1_stage, demo1_selected_index, demo2_stage, demo2_selected_scene_index, demo2_selected_image_indices, demo3_stage, demo3_selected_index])
1035
+ def render_demo(
1036
+ _demo_idx, _demo1_stage, _demo1_selected_index, _demo2_stage, _demo2_selected_scene_index, _demo2_selected_image_indices, _demo3_stage, _demo3_selected_index
1037
+ ):
1038
+ match _demo_idx:
1039
+ case 1:
1040
+ render_demo1(_demo1_stage, _demo1_selected_index, demo1_stage, demo1_selected_index)
1041
+ case 2:
1042
+ render_demo2(_demo2_stage, _demo2_selected_scene_index, _demo2_selected_image_indices,
1043
+ demo2_stage, demo2_selected_scene_index, demo2_selected_image_indices)
1044
+ case 3:
1045
+ render_demo3(_demo3_stage, _demo3_selected_index, demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses)
1046
+
1047
 
# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()