Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
badacd0
1
Parent(s):
006ab10
new ui
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from typing import List
|
2 |
from pathlib import Path
|
3 |
from functools import partial
|
4 |
import spaces
|
@@ -137,7 +137,7 @@ def any_images_to_short_video(
|
|
137 |
pbar = CustomProgressBar(
|
138 |
gr.Progress(track_tqdm=True).tqdm(
|
139 |
iterable=None,
|
140 |
-
desc="Sampling",
|
141 |
total=dfot.sampling_timesteps,
|
142 |
)
|
143 |
)
|
@@ -200,7 +200,7 @@ def navigate_video(
|
|
200 |
pbar = CustomProgressBar(
|
201 |
gr.Progress(track_tqdm=True).tqdm(
|
202 |
iterable=None,
|
203 |
-
desc=f"Predicting next {n_prediction_frames} frames",
|
204 |
total=dfot.sampling_timesteps,
|
205 |
)
|
206 |
)
|
@@ -408,363 +408,237 @@ def smooth_navigation(
|
|
408 |
[(image, f"t={i}") for i, image in enumerate(images)],
|
409 |
)
|
410 |
|
411 |
-
|
412 |
-
# Create the Gradio Blocks
|
413 |
-
with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
414 |
-
gr.HTML(
|
415 |
-
"""
|
416 |
-
<style>
|
417 |
-
[data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
|
418 |
-
font-size: 16px !important;
|
419 |
-
font-weight: bold;
|
420 |
-
}
|
421 |
-
#header-button .button-icon {
|
422 |
-
margin-right: 8px;
|
423 |
-
}
|
424 |
-
#basic-controls {
|
425 |
-
column-gap: 0px;
|
426 |
-
}
|
427 |
-
#basic-controls button {
|
428 |
-
border: 1px solid #e4e4e7;
|
429 |
-
}
|
430 |
-
#basic-controls-tab {
|
431 |
-
padding: 0px;
|
432 |
-
}
|
433 |
-
#advanced-controls-tab {
|
434 |
-
padding: 0px;
|
435 |
-
}
|
436 |
-
</style>
|
437 |
-
"""
|
438 |
-
)
|
439 |
-
|
440 |
-
gr.Markdown("# Diffusion Forcing Transformer with History Guidance")
|
441 |
gr.Markdown(
|
442 |
-
"
|
|
|
|
|
|
|
|
|
443 |
)
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
inputs=demo1_selected_scene_index, outputs=demo1_stage
|
521 |
-
)
|
522 |
-
def move_to_image_selection(scene_idx: int):
|
523 |
-
if scene_idx is None:
|
524 |
-
gr.Warning("Scene not selected!")
|
525 |
-
return "Scene"
|
526 |
-
else:
|
527 |
-
return "Image"
|
528 |
-
|
529 |
-
case "Image":
|
530 |
-
with gr.Group():
|
531 |
-
demo1_image_gallery = gr.Gallery(
|
532 |
-
height=150,
|
533 |
-
value=[
|
534 |
-
(image, f"t={i}")
|
535 |
-
for i, image in enumerate(
|
536 |
-
prepare_short_gt_video(scene_idx)
|
537 |
-
)
|
538 |
-
],
|
539 |
-
label="Select Input Images",
|
540 |
-
columns=[8],
|
541 |
-
)
|
542 |
-
|
543 |
-
demo1_selector = gr.CheckboxGroup(
|
544 |
-
label="Select Any Number of Input Images",
|
545 |
-
info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
|
546 |
-
choices=[(f"t={i}", i) for i in range(8)],
|
547 |
-
value=[],
|
548 |
-
)
|
549 |
-
demo1_image_select_button = gr.Button(
|
550 |
-
"Select Input Images", variant="primary"
|
551 |
-
)
|
552 |
-
|
553 |
-
@demo1_image_select_button.click(
|
554 |
-
inputs=[demo1_selector],
|
555 |
-
outputs=[demo1_stage, demo1_selected_image_indices],
|
556 |
-
)
|
557 |
-
def generate_video(selected_indices):
|
558 |
-
if len(selected_indices) == 0:
|
559 |
-
gr.Warning("Select at least one image!")
|
560 |
-
return "Image", []
|
561 |
-
else:
|
562 |
-
gr.Info('Click "Generate Video" on the left to start generating now!')
|
563 |
-
return "Generation", selected_indices
|
564 |
-
|
565 |
-
case "Generation":
|
566 |
-
with gr.Group():
|
567 |
-
gt_video = prepare_short_gt_video(scene_idx)
|
568 |
-
|
569 |
-
demo1_input_image_gallery = gr.Gallery(
|
570 |
-
height=150,
|
571 |
-
value=video_to_gif_and_images(gt_video, image_indices),
|
572 |
-
label="Input Images",
|
573 |
-
columns=[9],
|
574 |
-
)
|
575 |
-
demo1_generated_gallery = gr.Gallery(
|
576 |
-
height=150,
|
577 |
-
value=[],
|
578 |
-
label="Generated Video",
|
579 |
-
columns=[9],
|
580 |
-
)
|
581 |
-
|
582 |
-
demo1_ground_truth_gallery = gr.Gallery(
|
583 |
-
height=150,
|
584 |
-
value=video_to_gif_and_images(gt_video, list(range(8))),
|
585 |
-
label="Ground Truth Video",
|
586 |
-
columns=[9],
|
587 |
-
)
|
588 |
-
with gr.Sidebar():
|
589 |
-
gr.Markdown("### Sampling Parameters")
|
590 |
-
demo1_guidance_scale = gr.Slider(
|
591 |
-
minimum=1,
|
592 |
-
maximum=6,
|
593 |
-
value=4,
|
594 |
-
step=0.5,
|
595 |
-
label="History Guidance Scale",
|
596 |
-
info="Without history guidance: 1.0; Recommended: 4.0",
|
597 |
-
interactive=True,
|
598 |
-
)
|
599 |
-
gr.Button("Generate Video", variant="primary").click(
|
600 |
-
fn=any_images_to_short_video,
|
601 |
-
inputs=[
|
602 |
-
demo1_selected_scene_index,
|
603 |
-
demo1_selected_image_indices,
|
604 |
-
demo1_guidance_scale,
|
605 |
-
],
|
606 |
-
outputs=demo1_generated_gallery,
|
607 |
-
)
|
608 |
-
|
609 |
-
with gr.Tab("Single Image → Long Video", id="task-2"):
|
610 |
-
gr.Markdown(
|
611 |
-
f"""
|
612 |
-
## Demo 2: Single Image → Long {LONG_LENGTH}-second Video
|
613 |
-
> #### _Diffusion Forcing Transformer, with History Guidance, can generate long videos via sliding window rollouts and temporal super-resolution._
|
614 |
"""
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
@gr.render(inputs=[demo2_stage, demo2_selected_index])
|
621 |
-
def render_stage(s, idx):
|
622 |
-
match s:
|
623 |
-
case "Selection":
|
624 |
-
with gr.Group():
|
625 |
-
demo2_image_gallery = gr.Gallery(
|
626 |
-
height=300,
|
627 |
-
value=first_frame_list,
|
628 |
-
label="Select an Image to Animate",
|
629 |
-
columns=[8],
|
630 |
-
selected_index=idx,
|
631 |
-
)
|
632 |
-
|
633 |
-
@demo2_image_gallery.select(
|
634 |
-
inputs=None, outputs=demo2_selected_index
|
635 |
-
)
|
636 |
-
def update_selection(selection: gr.SelectData):
|
637 |
-
return selection.index
|
638 |
-
|
639 |
-
demo2_select_button = gr.Button(
|
640 |
-
"Select Input Image", variant="primary"
|
641 |
-
)
|
642 |
-
|
643 |
-
@demo2_select_button.click(
|
644 |
-
inputs=demo2_selected_index, outputs=demo2_stage
|
645 |
-
)
|
646 |
-
def move_to_generation(idx: int):
|
647 |
-
if idx is None:
|
648 |
-
gr.Warning("Image not selected!")
|
649 |
-
return "Selection"
|
650 |
-
else:
|
651 |
-
gr.Info('Click "Generate Video" on the left to start generating now!')
|
652 |
-
return "Generation"
|
653 |
-
|
654 |
-
case "Generation":
|
655 |
-
with gr.Row():
|
656 |
-
gr.Image(
|
657 |
-
value=first_frame_list[idx],
|
658 |
-
label="Input Image",
|
659 |
-
width=256,
|
660 |
-
height=256,
|
661 |
-
)
|
662 |
-
gr.Video(
|
663 |
-
value=prepare_long_gt_video(idx),
|
664 |
-
label="Ground Truth Video",
|
665 |
-
width=256,
|
666 |
-
height=256,
|
667 |
-
autoplay=True,
|
668 |
-
loop=True,
|
669 |
-
)
|
670 |
-
demo2_video = gr.Video(
|
671 |
-
label="Generated Video",
|
672 |
-
width=256,
|
673 |
-
height=256,
|
674 |
-
autoplay=True,
|
675 |
-
loop=True,
|
676 |
-
show_share_button=True,
|
677 |
-
show_download_button=True,
|
678 |
-
)
|
679 |
-
|
680 |
-
with gr.Sidebar():
|
681 |
-
gr.Markdown("### Sampling Parameters")
|
682 |
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
|
|
709 |
)
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
716 |
"""
|
717 |
-
)
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
730 |
-
|
731 |
-
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
-
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
@demo3_select_button.click(
|
748 |
-
inputs=demo3_selected_index,
|
749 |
-
outputs=[
|
750 |
-
demo3_stage,
|
751 |
-
demo3_current_video,
|
752 |
-
demo3_current_poses,
|
753 |
-
],
|
754 |
-
)
|
755 |
-
def move_to_generation(idx: int):
|
756 |
-
if idx is None:
|
757 |
-
gr.Warning("Image not selected!")
|
758 |
-
return "Selection", None, None
|
759 |
-
else:
|
760 |
-
gr.Info('Start navigating with the "Let\'s Navigate!" sidebar on the left now!')
|
761 |
-
return (
|
762 |
-
"Generation",
|
763 |
-
video_list[idx][:1],
|
764 |
-
poses_list[idx][:1],
|
765 |
-
)
|
766 |
|
767 |
-
|
|
|
|
|
768 |
with gr.Row():
|
769 |
demo3_current_view = gr.Image(
|
770 |
value=first_frame_list[idx],
|
@@ -785,183 +659,86 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
785 |
demo3_generated_gallery = gr.Gallery(
|
786 |
value=[],
|
787 |
label="Generated Frames",
|
788 |
-
columns=[
|
789 |
)
|
790 |
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
- **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
|
796 |
- **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
|
797 |
-
- The most suitable history guidance scheme will be automatically selected based on your camera movements.
|
798 |
-
"""
|
799 |
-
|
800 |
-
with gr.
|
801 |
-
|
802 |
-
|
803 |
-
|
804 |
-
|
805 |
-
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
],
|
827 |
-
)
|
828 |
-
|
829 |
-
gr.Button(
|
830 |
-
"↖-30°\nVeer",
|
831 |
-
size="sm",
|
832 |
-
min_width=0,
|
833 |
-
variant="primary",
|
834 |
-
).click(
|
835 |
-
fn=partial(
|
836 |
-
navigate_video,
|
837 |
-
x_angle=0,
|
838 |
-
y_angle=-30,
|
839 |
-
distance=50,
|
840 |
-
),
|
841 |
-
inputs=[
|
842 |
-
demo3_current_video,
|
843 |
-
demo3_current_poses,
|
844 |
-
],
|
845 |
-
outputs=[
|
846 |
-
demo3_current_video,
|
847 |
-
demo3_current_poses,
|
848 |
-
demo3_current_view,
|
849 |
-
demo3_video,
|
850 |
-
demo3_generated_gallery,
|
851 |
-
],
|
852 |
-
)
|
853 |
-
|
854 |
-
gr.Button(
|
855 |
-
"↑0°\nAhead",
|
856 |
-
size="sm",
|
857 |
-
min_width=0,
|
858 |
-
variant="primary",
|
859 |
-
).click(
|
860 |
-
fn=partial(
|
861 |
-
navigate_video,
|
862 |
-
x_angle=0,
|
863 |
-
y_angle=0,
|
864 |
-
distance=100,
|
865 |
-
),
|
866 |
-
inputs=[
|
867 |
-
demo3_current_video,
|
868 |
-
demo3_current_poses,
|
869 |
-
],
|
870 |
-
outputs=[
|
871 |
-
demo3_current_video,
|
872 |
-
demo3_current_poses,
|
873 |
-
demo3_current_view,
|
874 |
-
demo3_video,
|
875 |
-
demo3_generated_gallery,
|
876 |
-
],
|
877 |
-
)
|
878 |
-
gr.Button(
|
879 |
-
"↗30°\nVeer",
|
880 |
-
size="sm",
|
881 |
-
min_width=0,
|
882 |
-
variant="primary",
|
883 |
-
).click(
|
884 |
-
fn=partial(
|
885 |
-
navigate_video,
|
886 |
-
x_angle=0,
|
887 |
-
y_angle=30,
|
888 |
-
distance=50,
|
889 |
-
),
|
890 |
-
inputs=[
|
891 |
-
demo3_current_video,
|
892 |
-
demo3_current_poses,
|
893 |
-
],
|
894 |
-
outputs=[
|
895 |
-
demo3_current_video,
|
896 |
-
demo3_current_poses,
|
897 |
-
demo3_current_view,
|
898 |
-
demo3_video,
|
899 |
-
demo3_generated_gallery,
|
900 |
-
],
|
901 |
-
)
|
902 |
-
gr.Button(
|
903 |
-
"↱\n60° Turn",
|
904 |
-
size="sm",
|
905 |
-
min_width=0,
|
906 |
-
variant="primary",
|
907 |
-
).click(
|
908 |
-
fn=partial(
|
909 |
-
navigate_video,
|
910 |
-
x_angle=0,
|
911 |
-
y_angle=60,
|
912 |
-
distance=0,
|
913 |
-
),
|
914 |
-
inputs=[
|
915 |
-
demo3_current_video,
|
916 |
-
demo3_current_poses,
|
917 |
-
],
|
918 |
-
outputs=[
|
919 |
-
demo3_current_video,
|
920 |
-
demo3_current_poses,
|
921 |
-
demo3_current_view,
|
922 |
-
demo3_video,
|
923 |
-
demo3_generated_gallery,
|
924 |
-
],
|
925 |
-
)
|
926 |
-
with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
|
927 |
-
with gr.Group():
|
928 |
-
gr.Markdown("_**Select angles and distance:**_")
|
929 |
-
|
930 |
-
demo3_y_angle = gr.Slider(
|
931 |
-
minimum=-90,
|
932 |
-
maximum=90,
|
933 |
-
value=0,
|
934 |
-
step=10,
|
935 |
-
label="Horizontal Angle",
|
936 |
-
interactive=True,
|
937 |
-
)
|
938 |
-
demo3_x_angle = gr.Slider(
|
939 |
-
minimum=-40,
|
940 |
-
maximum=40,
|
941 |
-
value=0,
|
942 |
-
step=10,
|
943 |
-
label="Vertical Angle",
|
944 |
-
interactive=True,
|
945 |
)
|
946 |
-
|
947 |
-
|
948 |
-
|
949 |
-
|
950 |
-
|
951 |
-
|
952 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
953 |
)
|
954 |
|
955 |
gr.Button(
|
956 |
-
"
|
|
|
|
|
|
|
957 |
).click(
|
958 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
959 |
inputs=[
|
960 |
demo3_current_video,
|
961 |
demo3_current_poses,
|
962 |
-
demo3_x_angle,
|
963 |
-
demo3_y_angle,
|
964 |
-
demo3_distance,
|
965 |
],
|
966 |
outputs=[
|
967 |
demo3_current_video,
|
@@ -971,37 +748,93 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
971 |
demo3_generated_gallery,
|
972 |
],
|
973 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
974 |
with gr.Group():
|
975 |
-
gr.Markdown("
|
976 |
-
|
977 |
-
|
978 |
-
|
979 |
-
|
980 |
-
|
981 |
-
|
982 |
-
|
983 |
-
|
984 |
-
demo3_generated_gallery,
|
985 |
-
],
|
986 |
)
|
987 |
-
|
988 |
-
|
989 |
-
|
|
|
|
|
|
|
|
|
990 |
)
|
991 |
-
|
992 |
-
minimum=
|
993 |
-
maximum=
|
994 |
-
value=
|
995 |
-
step=
|
996 |
-
label="
|
997 |
interactive=True,
|
998 |
)
|
999 |
-
|
1000 |
-
|
|
|
|
|
|
|
1001 |
inputs=[
|
1002 |
demo3_current_video,
|
1003 |
demo3_current_poses,
|
1004 |
-
|
|
|
|
|
1005 |
],
|
1006 |
outputs=[
|
1007 |
demo3_current_video,
|
@@ -1011,7 +844,206 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
1011 |
demo3_generated_gallery,
|
1012 |
],
|
1013 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1014 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1015 |
|
1016 |
if __name__ == "__main__":
|
1017 |
demo.launch()
|
|
|
1 |
+
from typing import List, Literal
|
2 |
from pathlib import Path
|
3 |
from functools import partial
|
4 |
import spaces
|
|
|
137 |
pbar = CustomProgressBar(
|
138 |
gr.Progress(track_tqdm=True).tqdm(
|
139 |
iterable=None,
|
140 |
+
desc="Sampling with DFoT",
|
141 |
total=dfot.sampling_timesteps,
|
142 |
)
|
143 |
)
|
|
|
200 |
pbar = CustomProgressBar(
|
201 |
gr.Progress(track_tqdm=True).tqdm(
|
202 |
iterable=None,
|
203 |
+
desc=f"Predicting next {n_prediction_frames} frames with DFoT",
|
204 |
total=dfot.sampling_timesteps,
|
205 |
)
|
206 |
)
|
|
|
408 |
[(image, f"t={i}") for i, image in enumerate(images)],
|
409 |
)
|
410 |
|
411 |
+
def render_demo1(s: Literal["Selection", "Generation"], idx: int, demo1_stage: gr.State, demo1_selected_index: gr.State):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
gr.Markdown(
|
413 |
+
f"""
|
414 |
+
## Demo 1: Single Image → Long {LONG_LENGTH}-second Video
|
415 |
+
> #### _Diffusion Forcing Transformer can generate long videos via sliding window rollouts and temporal super-resolution._
|
416 |
+
""",
|
417 |
+
elem_classes=["task-title"]
|
418 |
)
|
419 |
+
match s:
|
420 |
+
case "Selection":
|
421 |
+
with gr.Group():
|
422 |
+
demo1_image_gallery = gr.Gallery(
|
423 |
+
height=300,
|
424 |
+
value=first_frame_list,
|
425 |
+
label="Select an Image to Animate",
|
426 |
+
columns=[8],
|
427 |
+
selected_index=idx,
|
428 |
+
allow_preview=False,
|
429 |
+
preview=False,
|
430 |
+
)
|
431 |
+
|
432 |
+
@demo1_image_gallery.select(
|
433 |
+
inputs=None, outputs=[demo1_stage, demo1_selected_index]
|
434 |
+
)
|
435 |
+
def move_to_generation(selection: gr.SelectData):
|
436 |
+
return "Generation", selection.index
|
437 |
+
|
438 |
+
case "Generation":
|
439 |
+
with gr.Row():
|
440 |
+
gr.Image(
|
441 |
+
value=first_frame_list[idx],
|
442 |
+
label="Input Image",
|
443 |
+
width=256,
|
444 |
+
height=256,
|
445 |
+
)
|
446 |
+
gr.Video(
|
447 |
+
value=prepare_long_gt_video(idx),
|
448 |
+
label="Ground Truth Video",
|
449 |
+
width=256,
|
450 |
+
height=256,
|
451 |
+
autoplay=True,
|
452 |
+
loop=True,
|
453 |
+
)
|
454 |
+
demo1_video = gr.Video(
|
455 |
+
label="Generated Video",
|
456 |
+
width=256,
|
457 |
+
height=256,
|
458 |
+
autoplay=True,
|
459 |
+
loop=True,
|
460 |
+
show_share_button=True,
|
461 |
+
show_download_button=True,
|
462 |
+
)
|
463 |
+
|
464 |
+
gr.Markdown("### Generation Controls ↓")
|
465 |
+
demo1_guidance_scale = gr.Slider(
|
466 |
+
minimum=1,
|
467 |
+
maximum=6,
|
468 |
+
value=4,
|
469 |
+
step=0.5,
|
470 |
+
label="History Guidance Scale",
|
471 |
+
info="Without history guidance: 1.0; Recommended: 4.0",
|
472 |
+
interactive=True,
|
473 |
+
)
|
474 |
+
demo1_fps = gr.Slider(
|
475 |
+
minimum=4,
|
476 |
+
maximum=20,
|
477 |
+
value=4,
|
478 |
+
step=1,
|
479 |
+
label="FPS",
|
480 |
+
info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
|
481 |
+
interactive=True,
|
482 |
+
)
|
483 |
+
gr.Button("Generate Video", variant="primary").click(
|
484 |
+
fn=single_image_to_long_video,
|
485 |
+
inputs=[
|
486 |
+
demo1_selected_index,
|
487 |
+
demo1_guidance_scale,
|
488 |
+
demo1_fps,
|
489 |
+
],
|
490 |
+
outputs=demo1_video,
|
491 |
+
)
|
492 |
+
|
493 |
+
def render_demo2(s: Literal["Scene", "Image", "Generation"], scene_idx: int, image_indices: List[int], demo2_stage: gr.State, demo2_selected_scene_index: gr.State, demo2_selected_image_indices: gr.State):
|
494 |
+
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
495 |
"""
|
496 |
+
## Demo 2: Any Number of Images → Short 2-second Video
|
497 |
+
> #### _Diffusion Forcing Transformer is a flexible model that can generate videos given variable number of context frames._
|
498 |
+
""",
|
499 |
+
elem_classes=["task-title"]
|
500 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
|
502 |
+
match s:
|
503 |
+
case "Scene":
|
504 |
+
with gr.Group():
|
505 |
+
demo2_scene_gallery = gr.Gallery(
|
506 |
+
height=300,
|
507 |
+
value=gif_paths,
|
508 |
+
label="Select a Scene to Generate Video",
|
509 |
+
columns=[8],
|
510 |
+
selected_index=scene_idx,
|
511 |
+
allow_preview=False,
|
512 |
+
preview=False,
|
513 |
+
)
|
514 |
+
|
515 |
+
@demo2_scene_gallery.select(
|
516 |
+
inputs=None, outputs=[demo2_stage, demo2_selected_scene_index]
|
517 |
+
)
|
518 |
+
def move_to_image_selection(selection: gr.SelectData):
|
519 |
+
return "Image", selection.index
|
520 |
+
|
521 |
+
case "Image":
|
522 |
+
with gr.Group():
|
523 |
+
demo2_image_gallery = gr.Gallery(
|
524 |
+
height=150,
|
525 |
+
value=[
|
526 |
+
(image, f"t={i}")
|
527 |
+
for i, image in enumerate(
|
528 |
+
prepare_short_gt_video(scene_idx)
|
529 |
)
|
530 |
+
],
|
531 |
+
label="Select Input Images",
|
532 |
+
columns=[8],
|
533 |
+
)
|
534 |
+
|
535 |
+
demo2_selector = gr.CheckboxGroup(
|
536 |
+
label="Select Any Number of Input Images",
|
537 |
+
info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
|
538 |
+
choices=[(f"t={i}", i) for i in range(8)],
|
539 |
+
value=[],
|
540 |
+
)
|
541 |
+
demo2_image_select_button = gr.Button(
|
542 |
+
"Next Step", variant="primary"
|
543 |
+
)
|
544 |
+
|
545 |
+
@demo2_image_select_button.click(
|
546 |
+
inputs=[demo2_selector],
|
547 |
+
outputs=[demo2_stage, demo2_selected_image_indices],
|
548 |
+
)
|
549 |
+
def generate_video(selected_indices):
|
550 |
+
if len(selected_indices) == 0:
|
551 |
+
gr.Warning("Select at least one image!")
|
552 |
+
return "Image", []
|
553 |
+
else:
|
554 |
+
return "Generation", selected_indices
|
555 |
+
|
556 |
+
case "Generation":
|
557 |
+
with gr.Group():
|
558 |
+
gt_video = prepare_short_gt_video(scene_idx)
|
559 |
+
|
560 |
+
demo2_input_image_gallery = gr.Gallery(
|
561 |
+
height=150,
|
562 |
+
value=video_to_gif_and_images(gt_video, image_indices),
|
563 |
+
label="Input Images",
|
564 |
+
columns=[9],
|
565 |
+
)
|
566 |
+
demo2_generated_gallery = gr.Gallery(
|
567 |
+
height=150,
|
568 |
+
value=[],
|
569 |
+
label="Generated Video",
|
570 |
+
columns=[9],
|
571 |
+
)
|
572 |
+
|
573 |
+
demo2_ground_truth_gallery = gr.Gallery(
|
574 |
+
height=150,
|
575 |
+
value=video_to_gif_and_images(gt_video, list(range(8))),
|
576 |
+
label="Ground Truth Video",
|
577 |
+
columns=[9],
|
578 |
+
)
|
579 |
+
gr.Markdown("### Generation Controls ↓")
|
580 |
+
demo2_guidance_scale = gr.Slider(
|
581 |
+
minimum=1,
|
582 |
+
maximum=6,
|
583 |
+
value=4,
|
584 |
+
step=0.5,
|
585 |
+
label="History Guidance Scale",
|
586 |
+
info="Without history guidance: 1.0; Recommended: 4.0",
|
587 |
+
interactive=True,
|
588 |
+
)
|
589 |
+
gr.Button("Generate Video", variant="primary").click(
|
590 |
+
fn=any_images_to_short_video,
|
591 |
+
inputs=[
|
592 |
+
demo2_selected_scene_index,
|
593 |
+
demo2_selected_image_indices,
|
594 |
+
demo2_guidance_scale,
|
595 |
+
],
|
596 |
+
outputs=demo2_generated_gallery,
|
597 |
+
)
|
598 |
+
|
599 |
+
def render_demo3(
|
600 |
+
s: Literal["Selection", "Generation"],
|
601 |
+
idx: int,
|
602 |
+
demo3_stage: gr.State,
|
603 |
+
demo3_selected_index: gr.State,
|
604 |
+
demo3_current_video: gr.State,
|
605 |
+
demo3_current_poses: gr.State
|
606 |
+
):
|
607 |
+
gr.Markdown(
|
608 |
"""
|
609 |
+
## Demo 3: Single Image → Extremely Long Video _(Navigate with Your Camera Movements!)_
|
610 |
+
> #### _History Guidance significantly improves quality and temporal consistency, enabling stable rollouts for extremely long videos._
|
611 |
+
""",
|
612 |
+
elem_classes=["task-title"]
|
613 |
+
)
|
614 |
+
match s:
|
615 |
+
case "Selection":
|
616 |
+
with gr.Group():
|
617 |
+
demo3_image_gallery = gr.Gallery(
|
618 |
+
height=300,
|
619 |
+
value=first_frame_list,
|
620 |
+
label="Select an Image to Start Navigation",
|
621 |
+
columns=[8],
|
622 |
+
selected_index=idx,
|
623 |
+
allow_preview=False,
|
624 |
+
preview=False,
|
625 |
+
)
|
626 |
+
|
627 |
+
@demo3_image_gallery.select(
|
628 |
+
inputs=None, outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
|
629 |
+
)
|
630 |
+
def move_to_generation(selection: gr.SelectData):
|
631 |
+
idx = selection.index
|
632 |
+
return (
|
633 |
+
"Generation",
|
634 |
+
idx,
|
635 |
+
video_list[idx][:1],
|
636 |
+
poses_list[idx][:1],
|
637 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
638 |
|
639 |
+
case "Generation":
|
640 |
+
with gr.Row():
|
641 |
+
with gr.Column(scale=3):
|
642 |
with gr.Row():
|
643 |
demo3_current_view = gr.Image(
|
644 |
value=first_frame_list[idx],
|
|
|
659 |
demo3_generated_gallery = gr.Gallery(
|
660 |
value=[],
|
661 |
label="Generated Frames",
|
662 |
+
columns=[6],
|
663 |
)
|
664 |
|
665 |
+
with gr.Column():
|
666 |
+
gr.Markdown("### Navigation Controls ↓")
|
667 |
+
with gr.Accordion("Instructions", open=False):
|
668 |
+
gr.Markdown("""
|
669 |
- **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
|
670 |
- **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
|
671 |
+
- The most suitable history guidance scheme will be automatically selected based on your camera movements.
|
672 |
+
""")
|
673 |
+
with gr.Tab("Basic", elem_id="basic-controls-tab"):
|
674 |
+
with gr.Group():
|
675 |
+
gr.Markdown("_**Select a direction to move:**_")
|
676 |
+
with gr.Row(elem_id="basic-controls"):
|
677 |
+
gr.Button(
|
678 |
+
"↰-60°\nVeer",
|
679 |
+
size="sm",
|
680 |
+
min_width=0,
|
681 |
+
variant="primary",
|
682 |
+
).click(
|
683 |
+
fn=partial(
|
684 |
+
navigate_video,
|
685 |
+
x_angle=0,
|
686 |
+
y_angle=-60,
|
687 |
+
distance=0,
|
688 |
+
),
|
689 |
+
inputs=[
|
690 |
+
demo3_current_video,
|
691 |
+
demo3_current_poses,
|
692 |
+
],
|
693 |
+
outputs=[
|
694 |
+
demo3_current_video,
|
695 |
+
demo3_current_poses,
|
696 |
+
demo3_current_view,
|
697 |
+
demo3_video,
|
698 |
+
demo3_generated_gallery,
|
699 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
700 |
)
|
701 |
+
|
702 |
+
gr.Button(
|
703 |
+
"↖-30°\nTurn",
|
704 |
+
size="sm",
|
705 |
+
min_width=0,
|
706 |
+
variant="primary",
|
707 |
+
).click(
|
708 |
+
fn=partial(
|
709 |
+
navigate_video,
|
710 |
+
x_angle=0,
|
711 |
+
y_angle=-30,
|
712 |
+
distance=50,
|
713 |
+
),
|
714 |
+
inputs=[
|
715 |
+
demo3_current_video,
|
716 |
+
demo3_current_poses,
|
717 |
+
],
|
718 |
+
outputs=[
|
719 |
+
demo3_current_video,
|
720 |
+
demo3_current_poses,
|
721 |
+
demo3_current_view,
|
722 |
+
demo3_video,
|
723 |
+
demo3_generated_gallery,
|
724 |
+
],
|
725 |
)
|
726 |
|
727 |
gr.Button(
|
728 |
+
"↑0°\nAhead",
|
729 |
+
size="sm",
|
730 |
+
min_width=0,
|
731 |
+
variant="primary",
|
732 |
).click(
|
733 |
+
fn=partial(
|
734 |
+
navigate_video,
|
735 |
+
x_angle=0,
|
736 |
+
y_angle=0,
|
737 |
+
distance=100,
|
738 |
+
),
|
739 |
inputs=[
|
740 |
demo3_current_video,
|
741 |
demo3_current_poses,
|
|
|
|
|
|
|
742 |
],
|
743 |
outputs=[
|
744 |
demo3_current_video,
|
|
|
748 |
demo3_generated_gallery,
|
749 |
],
|
750 |
)
|
751 |
+
gr.Button(
|
752 |
+
"↗30°\nTurn",
|
753 |
+
size="sm",
|
754 |
+
min_width=0,
|
755 |
+
variant="primary",
|
756 |
+
).click(
|
757 |
+
fn=partial(
|
758 |
+
navigate_video,
|
759 |
+
x_angle=0,
|
760 |
+
y_angle=30,
|
761 |
+
distance=50,
|
762 |
+
),
|
763 |
+
inputs=[
|
764 |
+
demo3_current_video,
|
765 |
+
demo3_current_poses,
|
766 |
+
],
|
767 |
+
outputs=[
|
768 |
+
demo3_current_video,
|
769 |
+
demo3_current_poses,
|
770 |
+
demo3_current_view,
|
771 |
+
demo3_video,
|
772 |
+
demo3_generated_gallery,
|
773 |
+
],
|
774 |
+
)
|
775 |
+
gr.Button(
|
776 |
+
"↱\n60° Veer",
|
777 |
+
size="sm",
|
778 |
+
min_width=0,
|
779 |
+
variant="primary",
|
780 |
+
).click(
|
781 |
+
fn=partial(
|
782 |
+
navigate_video,
|
783 |
+
x_angle=0,
|
784 |
+
y_angle=60,
|
785 |
+
distance=0,
|
786 |
+
),
|
787 |
+
inputs=[
|
788 |
+
demo3_current_video,
|
789 |
+
demo3_current_poses,
|
790 |
+
],
|
791 |
+
outputs=[
|
792 |
+
demo3_current_video,
|
793 |
+
demo3_current_poses,
|
794 |
+
demo3_current_view,
|
795 |
+
demo3_video,
|
796 |
+
demo3_generated_gallery,
|
797 |
+
],
|
798 |
+
)
|
799 |
+
with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
|
800 |
with gr.Group():
|
801 |
+
gr.Markdown("_**Select angles and distance:**_")
|
802 |
+
|
803 |
+
demo3_y_angle = gr.Slider(
|
804 |
+
minimum=-90,
|
805 |
+
maximum=90,
|
806 |
+
value=0,
|
807 |
+
step=10,
|
808 |
+
label="Horizontal Angle",
|
809 |
+
interactive=True,
|
|
|
|
|
810 |
)
|
811 |
+
demo3_x_angle = gr.Slider(
|
812 |
+
minimum=-40,
|
813 |
+
maximum=40,
|
814 |
+
value=0,
|
815 |
+
step=10,
|
816 |
+
label="Vertical Angle",
|
817 |
+
interactive=True,
|
818 |
)
|
819 |
+
demo3_distance = gr.Slider(
|
820 |
+
minimum=0,
|
821 |
+
maximum=200,
|
822 |
+
value=100,
|
823 |
+
step=10,
|
824 |
+
label="Distance",
|
825 |
interactive=True,
|
826 |
)
|
827 |
+
|
828 |
+
gr.Button(
|
829 |
+
"Generate Next Move", variant="primary"
|
830 |
+
).click(
|
831 |
+
fn=navigate_video,
|
832 |
inputs=[
|
833 |
demo3_current_video,
|
834 |
demo3_current_poses,
|
835 |
+
demo3_x_angle,
|
836 |
+
demo3_y_angle,
|
837 |
+
demo3_distance,
|
838 |
],
|
839 |
outputs=[
|
840 |
demo3_current_video,
|
|
|
844 |
demo3_generated_gallery,
|
845 |
],
|
846 |
)
|
847 |
+
gr.Markdown("---")
|
848 |
+
with gr.Group():
|
849 |
+
gr.Markdown("_You can always undo your last move:_")
|
850 |
+
gr.Button("Undo Last Move", variant="huggingface").click(
|
851 |
+
fn=undo_navigation,
|
852 |
+
inputs=[demo3_current_video, demo3_current_poses],
|
853 |
+
outputs=[
|
854 |
+
demo3_current_video,
|
855 |
+
demo3_current_poses,
|
856 |
+
demo3_current_view,
|
857 |
+
demo3_video,
|
858 |
+
demo3_generated_gallery,
|
859 |
+
],
|
860 |
+
)
|
861 |
+
with gr.Group():
|
862 |
+
gr.Markdown(
|
863 |
+
"_At the end, apply temporal super-resolution to obtain a smoother video:_"
|
864 |
+
)
|
865 |
+
demo3_interpolation_factor = gr.Slider(
|
866 |
+
minimum=2,
|
867 |
+
maximum=10,
|
868 |
+
value=2,
|
869 |
+
step=1,
|
870 |
+
label="By a Factor of",
|
871 |
+
interactive=True,
|
872 |
+
)
|
873 |
+
gr.Button("Smooth Out Video", variant="huggingface").click(
|
874 |
+
fn=smooth_navigation,
|
875 |
+
inputs=[
|
876 |
+
demo3_current_video,
|
877 |
+
demo3_current_poses,
|
878 |
+
demo3_interpolation_factor,
|
879 |
+
],
|
880 |
+
outputs=[
|
881 |
+
demo3_current_video,
|
882 |
+
demo3_current_poses,
|
883 |
+
demo3_current_view,
|
884 |
+
demo3_video,
|
885 |
+
demo3_generated_gallery,
|
886 |
+
],
|
887 |
+
)
|
888 |
+
|
889 |
+
|
890 |
|
891 |
+
# Create the Gradio Blocks
#
# Layout overview:
#   * an HTML component carrying page-wide CSS overrides,
#   * a sidebar with the title, external links, the demo selector and
#     troubleshooting notes,
#   * the per-demo state objects,
#   * a dynamically rendered main area that shows the selected demo.
with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
    # Page-wide CSS overrides, injected through an HTML component.
    gr.HTML(
        """
    <style>
    [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
        font-size: 16px !important;
        font-weight: bold;
    }
    #page-title h1 {
        color: #0D9488 !important;
    }
    .task-title h2 {
        color: #F59E0C !important;
    }
    .header-button-row {
        gap: 4px !important;
    }
    .header-button-row div {
        width: 131.0px !important;
    }

    .header-button-column {
        width: 131.0px !important;
        gap: 5px !important;
    }
    .header-button a {
        border: 1px solid #e4e4e7;
    }
    .header-button .button-icon {
        margin-right: 8px;
    }
    #basic-controls {
        column-gap: 0px;
    }
    #basic-controls-tab {
        padding: 0px;
    }
    #advanced-controls-tab {
        padding: 0px;
    }
    #selected-demo-button {
        color: #F59E0C;
        text-decoration: underline;
    }
    .demo-button {
        text-align: left !important;
        display: block !important;
    }
    </style>
    """
    )

    # Index (1, 2 or 3) of the demo currently shown in the main area.
    demo_idx = gr.State(value=1)

    # External link buttons, grouped by the sidebar column they appear in.
    # Each entry is (button text, target URL, icon URL).
    _header_link_columns = (
        (
            (
                "Website",
                "https://boyuan.space/history-guidance",
                "https://simpleicons.org/icons/googlechrome.svg",
            ),
            (
                "Paper",
                "https://arxiv.org/abs/2502.06764",
                "https://simpleicons.org/icons/arxiv.svg",
            ),
        ),
        (
            (
                "Code",
                "https://github.com/kwsong0113/diffusion-forcing-transformer",
                "https://simpleicons.org/icons/github.svg",
            ),
            (
                "Weights",
                "https://huggingface.co/kiwhansong/DFoT",
                "https://simpleicons.org/icons/huggingface.svg",
            ),
        ),
    )

    # Labels of the demo selector buttons, keyed by the demo index they select.
    _demo_tab_labels = {
        1: "1: Image → Long Video",
        2: "2: Any # of Images → Short Video",
        3: "3: Image → Extremely Long Video",
    }

    with gr.Sidebar():
        gr.Markdown(
            "# Diffusion Forcing Transformer with History Guidance",
            elem_id="page-title",
        )
        gr.Markdown(
            "### Official Interactive Demo for [_History-Guided Video Diffusion_](https://arxiv.org/abs/2502.06764)"
        )
        gr.Markdown("---")
        gr.Markdown("#### Links ↓")
        with gr.Row(elem_classes=["header-button-row"]):
            for _column_links in _header_link_columns:
                with gr.Column(elem_classes=["header-button-column"], min_width=0):
                    for _text, _url, _icon in _column_links:
                        gr.Button(
                            value=_text,
                            link=_url,
                            icon=_icon,
                            elem_classes=["header-button"],
                            size="md",
                            min_width=0,
                        )
        gr.Markdown("---")
        gr.Markdown("#### Choose a Demo ↓")
        with gr.Group():

            @gr.render(inputs=[demo_idx])
            def render_demo_tabs(idx):
                """Render one selector button per demo.

                The button whose index equals ``idx`` gets the
                ``selected-demo-button`` element id so the CSS above can
                highlight it. Clicking a button writes that button's index
                into ``demo_idx``.
                """

                def _select(index):
                    # A factory gives every button its own zero-argument
                    # callback bound to its own index (avoids the
                    # late-binding closure pitfall inside the loop).
                    return lambda: index

                for tab_idx, label in _demo_tab_labels.items():
                    selected = (
                        {"elem_id": "selected-demo-button"} if idx == tab_idx else {}
                    )
                    gr.Button(
                        label,
                        size="md",
                        elem_classes=["demo-button"],
                        **selected,
                    ).click(fn=_select(tab_idx), outputs=demo_idx)

        gr.Markdown("---")
        gr.Markdown("#### Troubleshooting ↓")
        with gr.Group():
            with gr.Accordion("Error or Unexpected Results?", open=False):
                gr.Markdown(
                    "Please try again after refreshing the page and ensure you do not click the same button multiple times."
                )
            with gr.Accordion("Too Slow or No GPU Allocation?", open=False):
                gr.Markdown(
                    "Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota."
                )

    # Per-demo state. The stage / selection states are listed as inputs of
    # ``render_demo`` below; the demo 3 video / poses states are only handed
    # to ``render_demo3`` as state objects.
    demo1_stage = gr.State(value="Selection")
    demo1_selected_index = gr.State(value=None)
    demo2_stage = gr.State(value="Scene")
    demo2_selected_scene_index = gr.State(value=None)
    demo2_selected_image_indices = gr.State(value=[])
    demo3_stage = gr.State(value="Selection")
    demo3_selected_index = gr.State(value=None)
    demo3_current_video = gr.State(value=None)
    demo3_current_poses = gr.State(value=None)

    @gr.render(
        inputs=[
            demo_idx,
            demo1_stage,
            demo1_selected_index,
            demo2_stage,
            demo2_selected_scene_index,
            demo2_selected_image_indices,
            demo3_stage,
            demo3_selected_index,
        ]
    )
    def render_demo(
        _demo_idx,
        _demo1_stage,
        _demo1_selected_index,
        _demo2_stage,
        _demo2_selected_scene_index,
        _demo2_selected_image_indices,
        _demo3_stage,
        _demo3_selected_index,
    ):
        """Render the UI of the demo selected by ``_demo_idx``.

        The underscore-prefixed parameters are the current values of the
        corresponding states. The matching state objects themselves are
        forwarded alongside those values to the per-demo render function.
        Nothing is rendered for an index other than 1, 2 or 3.
        """
        if _demo_idx == 1:
            render_demo1(
                _demo1_stage,
                _demo1_selected_index,
                demo1_stage,
                demo1_selected_index,
            )
        elif _demo_idx == 2:
            render_demo2(
                _demo2_stage,
                _demo2_selected_scene_index,
                _demo2_selected_image_indices,
                demo2_stage,
                demo2_selected_scene_index,
                demo2_selected_image_indices,
            )
        elif _demo_idx == 3:
            render_demo3(
                _demo3_stage,
                _demo3_selected_index,
                demo3_stage,
                demo3_selected_index,
                demo3_current_video,
                demo3_current_poses,
            )
|
1046 |
+
|
1047 |
|
1048 |
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|