File size: 7,692 Bytes
d2dd1cd
 
 
 
 
 
 
 
 
6dcf9e0
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1820b1
 
 
d2dd1cd
 
 
 
 
 
 
d1820b1
 
 
 
 
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65516a6
d2dd1cd
65516a6
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1820b1
d2dd1cd
 
 
6dcf9e0
 
 
 
 
 
 
 
 
 
 
 
 
 
d2dd1cd
 
 
 
 
d1820b1
d2dd1cd
 
 
 
 
3a8c535
f4c8778
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python

from __future__ import annotations

import os
import sys

import gradio as gr

# Put the local 'FollowYourPose' directory on the module search path BEFORE
# importing the inference wrapper. Previously the path was appended only after
# that import had already run, so it could not help the import resolve
# anything living under that directory.
# NOTE(review): presumably `inference_followyourpose` (or code it loads)
# imports modules from 'FollowYourPose' — confirm. `append` gives the entry the
# lowest priority, so modules that already resolved keep resolving the same way.
sys.path.append('FollowYourPose')

from inference_followyourpose import merge_config_then_run  # noqa: E402

# Hugging Face access token taken from the environment (None when unset).
# It is not referenced anywhere else in the visible part of this file.
HF_TOKEN = os.getenv('HF_TOKEN')

# Created once at import time; its `run` method is the callback behind the
# example gallery, the prompt textbox submit, and the "Generate" button.
pipe = merge_config_then_run()

# Build the whole demo UI. Components are created inside the `demo` Blocks
# context; the wiring at the bottom sends the same ordered input list to
# `pipe.run` for the example gallery, the prompt submit and the button click.
with gr.Blocks(css='style.css') as demo:
    # Page header: title, authors, and arXiv / code / homepage links.
    # The <h1> used to contain stray `</font></center>` closers (no matching
    # openers) and an unclosed `<center>`; they are dropped because the
    # enclosing <div> already sets `text-align: center`.
    gr.HTML(
    """
    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
    <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
        🕺🕺🕺 Follow Your Pose 💃💃💃 <br> Pose-Guided Text-to-Video Generation using Pose-Free Videos
    </h1>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://mayuelala.github.io/">Yue Ma*</a>,
            <a href="https://github.com/YingqingHe">Yingqing He*</a> , <a href="http://vinthony.github.io/">Xiaodong Cun</a>, 
            <a href="https://xinntao.github.io/"> Xintao Wang </a>,
            <a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=zh-CN">Ying Shan</a>,
            <a href="https://scholar.google.com/citations?user=Xrh1OIUAAAAJ&hl=zh-CN">Xiu Li</a>,
            <a href="http://cqf.io">Qifeng Chen</a>
    </h2>

    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                  <span class="link-block">
                    [<a href="https://arxiv.org/abs/2304.01186" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>]
                </span>

                  <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://github.com/mayuelala/FollowYourPose" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>]
                </span>

                <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://follow-your-pose.github.io/" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                  </a>]
                </span>
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        TL;DR: We tune 2D stable-diffusion to generate the character videos from pose and text description.
    </h2>
    </div>
    """)


    # Short usage note shown under the header.
    gr.HTML("""
    <p>In order to run the demo successfully, we recommend the length of video is about <b>3~5 seconds</b>.
    The temporal crop offset and sampling stride are used to adjust the starting point and interval of video samples.
    Alternatively, try our GitHub <a href=https://github.com/mayuelala/FollowYourPose> code  </a> on your GPU.
    </p>""")

    with gr.Row():
        # Left column: source video, sampling / cropping options, text prompt.
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # NOTE(review): `type` does not appear among gr.Video's
                # documented constructor parameters — confirm whether
                # `type='numpy'` has any effect before relying on it.
                user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
                video_type = gr.Dropdown(
                  label='The type of input video',
                  choices=[
                      "Raw Video",
                      "Skeleton Video"
                  ], value="Raw Video")
                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                    # NOTE(review): both sliders allow 0 (zero frames / zero
                    # stride) — verify that `pipe.run` accepts those values or
                    # raise the minimum.
                    n_sample_frame = gr.Slider(label='Number of Frames',
                                        minimum=0,
                                        maximum=32,
                                        step=1,
                                        value=8)
                    stride = gr.Slider(label='Temporal stride',
                                            minimum=0,
                                            maximum=20,
                                            step=1,
                                            value=1)

                with gr.Accordion('Spatial Crop offset', open=False):
                    # Integer-valued number boxes (precision=0), all defaulting to 0.
                    left_crop = gr.Number(label='Left crop',
                              value=0,
                              precision=0)
                    right_crop = gr.Number(label='Right crop',
                              value=0,
                              precision=0)
                    top_crop = gr.Number(label='Top crop',
                              value=0,
                              precision=0)
                    bottom_crop = gr.Number(label='Bottom crop',
                              value=0,
                              precision=0)
                    offset_list = [
                         left_crop,
                         right_crop,
                         top_crop,
                         bottom_crop,
                    ]

                # Sampling controls followed by the four crop offsets, in the
                # order they are appended to the callback inputs below.
                ImageSequenceDataset_list = [
                   n_sample_frame,
                   stride
                ] + offset_list


            with gr.Accordion('Text Prompt', open=True):

                target_prompt = gr.Textbox(label='Target Prompt',
                                    info='The simple background may achieve better results(e.g., "beach", "moon" prompt is better than "street" and "market")',
                                    max_lines=1,
                                    placeholder='Example: "Iron man on the beach"',
                                    value='Iron man on the beach')

            run_button = gr.Button('Generate')

        # Right column: generated video and sampler settings.
        with gr.Column():
            result = gr.Video(label='Result')
            with gr.Accordion('DDIM Parameters', open=True):
                # NOTE(review): minimum=0 permits zero sampling steps — verify
                # that `pipe.run` tolerates it or raise the minimum.
                num_steps = gr.Slider(label='Number of Steps',
                                      info='larger value has better editing capacity, but takes more time and memory.',
                                      minimum=0,
                                      maximum=50,
                                      step=1,
                                      value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.0)

    # Single source of truth for the positional inputs handed to `pipe.run`.
    # This list used to be written out twice (once for the examples, once for
    # the event handlers), which let the two copies drift apart.
    inputs = [
            user_input_video,
            target_prompt,
            num_steps,
            guidance_scale,
            video_type,
            *ImageSequenceDataset_list
    ]

    with gr.Row():
        # Imported here, as in the original, so the import stays local to the
        # examples section.
        from example import style_example
        examples = style_example

        # cache_examples=True: Gradio runs `fn` on the examples to pre-compute
        # their outputs.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=True,
                    )

    # Pressing Enter in the prompt box and clicking "Generate" do the same thing.
    target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

# Enable Gradio's request queue on the app, then start serving it with the
# default launch settings.
queued_demo = demo.queue()
queued_demo.launch()
# Alternative launch with an explicit bind address and port, kept for reference:
# demo.queue().launch(share=False, server_name='0.0.0.0', server_port=80)