#!/usr/bin/env python

from __future__ import annotations

import os
import sys
import warnings

# os.system("cd Make-A-Protagonist/experts/GroundedSAM")
# os.system("python -m pip install -e segment_anything")
# os.system("python -m pip install -e GroundingDINO")
# os.system("cd ../../..")
# os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO")
# os.system("pip install --upgrade diffusers[torch]")

warnings.filterwarnings("ignore")

import gradio as gr

from inference import InferencePipeline


class InferenceUtil:
    def __init__(self, hf_token: str | None):
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        # TODO: the model card is the README of the Hugging Face model repo;
        # document how its metadata should be written for new models.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception:
            return '', ''
        base_model = getattr(card.data, 'base_model', '')
        protagonist = getattr(card.data, 'protagonist', '')
        training_prompt = getattr(card.data, 'training_prompt', '')
        return protagonist, training_prompt
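
# Note on model cards: `load_model_info` reads `protagonist` and
# `training_prompt` (and optionally `base_model`) from the metadata of the
# model repo's README. A hypothetical card header, for illustration only
# (field names inferred from the getattr calls above; actual cards on the
# Hub may differ):
#
#   ---
#   base_model: <id of the base diffusion model>
#   protagonist: man
#   training_prompt: A man is playing basketball
#   ---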

# TITLE = '# [Tune-A-Video](https://tuneavideo.github.io/)'

HF_TOKEN = os.getenv('HF_TOKEN')
# print("HF Token ===> ", HF_TOKEN)
pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)

with gr.Blocks(css='style.css') as demo:
    # gr.Markdown(TITLE)
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
            <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
                Make-A-Protagonist:
                <br>
                Generic Video Editing with An Ensemble of Experts
            </h1>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <a href="https://yuyangzhao.com">Yuyang Zhao</a><sup>1</sup>
                <a href="https://xieenze.github.io/">Enze Xie</a><sup>2</sup>
                <a href="https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ&hl=en">Lanqing Hong</a><sup>2</sup>
                <a href="https://scholar.google.com.sg/citations?user=XboZC1AAAAAJ&hl=en">Zhenguo Li</a><sup>2</sup>
                <a href="https://www.comp.nus.edu.sg/~leegh/">Gim Hee Lee</a><sup>1</sup>
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <sup>1</sup>National University of Singapore
                <sup>2</sup>Huawei Noah's Ark Lab
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <!-- arXiv link -->
                <span class="link-block">
                    [<a href="https://arxiv.org/abs/2305.08850" target="_blank"
                        class="external-link">
                        <span class="icon">
                            <i class="ai ai-arxiv"></i>
                        </span>
                        <span>arXiv</span>
                    </a>]
                </span>
                <!-- GitHub link -->
                <span class="link-block">
                    [<a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist" target="_blank"
                        class="external-link">
                        <span class="icon">
                            <i class="fab fa-github"></i>
                        </span>
                        <span>Code</span>
                    </a>]
                </span>
                <!-- Homepage link -->
                <span class="link-block">
                    [<a href="https://make-a-protagonist.github.io/" target="_blank"
                        class="external-link">
                        <span class="icon">
                            <i class="fab fa-github"></i>
                        </span>
                        <span>Homepage</span>
                    </a>]
                </span>
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                TL;DR: The first framework for generic video editing with both visual and textual clues.
            </h2>
        </div>
        """)
gr.HTML(""" | |
<p>We provide a <a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist/blob/main/docs/demo_guidance.md"> Demo Guidance </a> to help users to choose hyperparameters when editing videos. | |
<p>You may duplicate the space and upgrade GPU for better performance and faster inference without waiting in the queue. | |
<p>Alternatively, try our GitHub <a href=https://github.com/Make-A-Protagonist/Make-A-Protagonist> code </a> on your GPU. | |
</p>""") | |
    with gr.Row():
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')
                with gr.Row():
                    base_model_used_for_training = gr.Textbox(
                        label='Protagonist', interactive=False, value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt', interactive=False,
                        value='A man is playing basketball')

            with gr.Box():
                ref_image = gr.Image(label='Reference Image', type='pil',
                                     visible=True).style(height="auto")
                ref_pro_prompt = gr.Textbox(label='Reference Image Protagonist Prompt',
                                            max_lines=1,
                                            placeholder='Example: "man"')

            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=6,
                                     step=1,
                                     value=6)
            fps = gr.Slider(label='FPS',
                            minimum=1,
                            maximum=6,
                            step=1,
                            value=3)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)

            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                         minimum=0,
                                         maximum=1,
                                         step=0.1,
                                         value=0.5)
                control_depth = gr.Slider(label='Depth',
                                          minimum=0,
                                          maximum=1,
                                          step=0.1,
                                          value=0.5)

            with gr.Accordion('Editing Function', open=True):
                with gr.Row():
                    source_pro = gr.Slider(label='Source Protagonist',
                                           minimum=0,
                                           maximum=1,
                                           step=1,
                                           value=0)
                    source_bg = gr.Slider(label='Source Background',
                                          minimum=0,
                                          maximum=1,
                                          step=1,
                                          value=0)

            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                start_step = gr.Slider(label='Mask Starting Step',
                                       minimum=0,
                                       maximum=100,
                                       step=1,
                                       value=0)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)
                noise_level = gr.Slider(label='Noise Level',
                                        minimum=0,
                                        maximum=999,
                                        step=1,
                                        value=0)

            run_button = gr.Button('Generate')
            gr.Markdown('''
            - Downloading the model takes a few minutes the first time.
            - Loading the model and running DDIM inversion takes about one minute.
            ''')
        with gr.Column():
            result = gr.Video(label='Result')

    with gr.Row():
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                6,
                3,
                33,
                50,
                12.5,
                'data/ikun/reference_images/zhongli.jpg',
                'man',
                0,
                0,
                0.5,
                0.5,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                6,
                3,
                33,
                50,
                12.5,
                'data/huaqiang/reference_images/musk.jpg',
                'man',
                0,
                0,
                0.5,
                0.5,
                0,
                1,
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                6,
                3,
                33,
                50,
                12.5,
                'data/yanzi/reference_images/panda.jpeg',
                'panda',
                0,
                0,
                0.5,
                0.5,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                6,
                3,
                33,
                50,
                12.5,
                'data/car-turn/reference_images/audi.jpeg',
                'car',
                0,
                0,
                0.0,
                1.0,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                6,
                3,
                33,
                50,
                12.5,
                'data/car-turn/images/0000.jpg',
                'car',
                0,
                0,
                0.0,
                1.0,
                1,
                0,
            ],
        ]
        gr.Examples(examples=examples,
                    inputs=[
                        model_id,
                        prompt,
                        video_length,
                        fps,
                        seed,
                        num_steps,
                        guidance_scale,
                        ref_image,
                        ref_pro_prompt,
                        noise_level,
                        start_step,
                        control_pose,
                        control_depth,
                        source_pro,
                        source_bg,
                    ],
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=os.getenv('SYSTEM') == 'spaces')
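        # `SYSTEM` is an environment variable set on Hugging Face Spaces, so
        # example outputs are pre-computed and cached only when the demo runs
        # there; locally, clicking an example runs the full pipeline instead.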

    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        base_model_used_for_training,
                        prompt_used_for_training,
                    ])
    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        start_step,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]
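    # Gradio passes component values to `fn` positionally, so the order of
    # `inputs` here (and of each row in `examples` above) is assumed to match
    # the positional parameters of `InferencePipeline.run`.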
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch()
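
# For reference only (not executed by the app): a hedged sketch of calling the
# pipeline directly with the same values the first example passes through the
# UI. The positional order is assumed to follow the `inputs` list above, and
# the reference image is assumed to be a PIL image (matching type='pil').
#
#   from PIL import Image
#   video_path = pipe.run(
#       'Make-A-Protagonist/ikun',                                  # model_id
#       'A man is playing basketball on the beach, anime style.',   # prompt
#       6, 3, 33, 50, 12.5,    # video_length, fps, seed, num_steps, guidance_scale
#       Image.open('data/ikun/reference_images/zhongli.jpg'),       # ref_image
#       'man',                                                      # ref_pro_prompt
#       0, 0,                  # noise_level, start_step
#       0.5, 0.5,              # control_pose, control_depth
#       0, 0,                  # source_pro, source_bg
#   )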