myniu committed · Commit 15183dc · 1 Parent(s): 5696b48
init

app.py CHANGED
@@ -89,79 +89,6 @@ def get_sparseflow_and_mask_forward(
     return s_flow, mask
 
 
-@spaces.GPU(duration=100)
-def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
-
-    from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
-    from pipeline.pipeline import FlowControlNetPipeline
-    from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
-
-    print('start loading models...')
-    # Load scheduler, tokenizer and models.
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-        pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-    )
-    vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-    unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="unet",
-        low_cpu_mem_usage=True,
-        variant="fp16",
-    )
-
-    controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
-
-    cmp = CMP_demo(
-        './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
-        42000
-    ).to(device)
-    cmp.requires_grad_(False)
-
-    # Freeze vae and image_encoder
-    vae.requires_grad_(False)
-    image_encoder.requires_grad_(False)
-    unet.requires_grad_(False)
-    controlnet.requires_grad_(False)
-
-    # Move image_encoder and vae to gpu and cast to weight_dtype
-    image_encoder.to(device, dtype=weight_dtype)
-    vae.to(device, dtype=weight_dtype)
-    unet.to(device, dtype=weight_dtype)
-    controlnet.to(device, dtype=weight_dtype)
-
-    if enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                print(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError(
-                "xformers is not available. Make sure it is installed correctly")
-
-    if allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    pipeline = FlowControlNetPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        unet=unet,
-        controlnet=controlnet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline = pipeline.to(device)
-
-    print('models loaded.')
-
-    return pipeline, cmp
-
-
 def interpolate_trajectory(points, n_points):
     x = [point[0] for point in points]
     y = [point[1] for point in points]
@@ -215,20 +142,87 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
     return trajectory_maps, transparent_layer
 
 
-
-pipeline, cmp = init_models(
-    "ckpts/stable-video-diffusion-img2vid-xt-1-1",
-    "ckpts/controlnet",
-    weight_dtype=torch.float16,
-    device='cuda'
-)
-
-
 class Drag:
     def __init__(self, height, width):
 
         self.height = height
         self.width = width
+        self.pipeline = None
+        self.cmp = None
+
+    @spaces.GPU(duration=100)
+    def init_models(self, pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
+
+        from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
+        from pipeline.pipeline import FlowControlNetPipeline
+        from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
+
+        print('start loading models...')
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+
+        cmp = CMP_demo(
+            './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
+            42000
+        ).to(device)
+        cmp.requires_grad_(False)
+
+        self.cmp = cmp
+
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to(device, dtype=weight_dtype)
+        vae.to(device, dtype=weight_dtype)
+        unet.to(device, dtype=weight_dtype)
+        controlnet.to(device, dtype=weight_dtype)
+
+        if enable_xformers_memory_efficient_attention:
+            if is_xformers_available():
+                import xformers
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    print(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                unet.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError(
+                    "xformers is not available. Make sure it is installed correctly")
+
+        if allow_tf32:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(device)
+
+        self.pipeline = pipeline
+
+        print('models loaded.')
 
     def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
 
@@ -652,6 +646,14 @@ with gr.Blocks() as demo:
 
     target_size = 512
    DragNUWA_net = Drag(target_size, target_size)
+
+    DragNUWA_net.init_models(
+        "ckpts/stable-video-diffusion-img2vid-xt-1-1",
+        "ckpts/controlnet",
+        weight_dtype=torch.float16,
+        device='cuda'
+    )
+
     first_frame_path = gr.State()
     tracking_points = gr.State([])
     motion_brush_points = gr.State([])
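Net effect of the commit: model loading moves out of module import time and into a Drag.init_models method wrapped in @spaces.GPU(duration=100), and the loaded FlowControlNetPipeline and CMP_demo are stored on self instead of being returned. Below is a minimal sketch of that pattern for reference only; the class, checkpoint path, and model here are illustrative placeholders rather than the repo's real objects, and it assumes the spaces (ZeroGPU) package and a CUDA device are available.

# Minimal sketch (illustrative names) of the pattern this commit applies:
# keep construction cheap, do the heavy loading inside a @spaces.GPU-decorated
# method, and stash the result on the instance instead of returning it.
import spaces
import torch


class LazyModels:
    def __init__(self):
        self.pipeline = None  # nothing touches the GPU at construction time

    @spaces.GPU(duration=100)  # GPU is only requested while this method runs
    def init_models(self, checkpoint_path, weight_dtype=torch.float16, device='cuda'):
        # Placeholder for the real pipeline/ControlNet loading done in app.py.
        model = torch.nn.Linear(16, 16)
        model.requires_grad_(False)
        self.pipeline = model.to(device, dtype=weight_dtype)
        print('models loaded.')


runner = LazyModels()
runner.init_models("ckpts/example-checkpoint")  # called once, after the object exists

On a ZeroGPU Space, keeping module-level code GPU-free lets the app import and build its Gradio UI without holding a GPU, which is presumably why init_models was folded into the Drag class here.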