myniu committed
Commit 15183dc · 1 Parent(s): 5696b48
Files changed (1)
  1. app.py +84 -82
app.py CHANGED
@@ -89,79 +89,6 @@ def get_sparseflow_and_mask_forward(
     return s_flow, mask
 
 
-@spaces.GPU(duration=100)
-def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
-
-    from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
-    from pipeline.pipeline import FlowControlNetPipeline
-    from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
-
-    print('start loading models...')
-    # Load scheduler, tokenizer and models.
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-        pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-    )
-    vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-    unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="unet",
-        low_cpu_mem_usage=True,
-        variant="fp16",
-    )
-
-    controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
-
-    cmp = CMP_demo(
-        './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
-        42000
-    ).to(device)
-    cmp.requires_grad_(False)
-
-    # Freeze vae and image_encoder
-    vae.requires_grad_(False)
-    image_encoder.requires_grad_(False)
-    unet.requires_grad_(False)
-    controlnet.requires_grad_(False)
-
-    # Move image_encoder and vae to gpu and cast to weight_dtype
-    image_encoder.to(device, dtype=weight_dtype)
-    vae.to(device, dtype=weight_dtype)
-    unet.to(device, dtype=weight_dtype)
-    controlnet.to(device, dtype=weight_dtype)
-
-    if enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                print(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError(
-                "xformers is not available. Make sure it is installed correctly")
-
-    if allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    pipeline = FlowControlNetPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        unet=unet,
-        controlnet=controlnet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline = pipeline.to(device)
-
-    print('models loaded.')
-
-    return pipeline, cmp
-
-
 def interpolate_trajectory(points, n_points):
     x = [point[0] for point in points]
     y = [point[1] for point in points]
@@ -215,20 +142,87 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
     return trajectory_maps, transparent_layer
 
 
-
-pipeline, cmp = init_models(
-    "ckpts/stable-video-diffusion-img2vid-xt-1-1",
-    "ckpts/controlnet",
-    weight_dtype=torch.float16,
-    device='cuda'
-)
-
-
 class Drag:
     def __init__(self, height, width):
 
         self.height = height
         self.width = width
+        self.pipeline = None
+        self.cmp = None
+
+    @spaces.GPU(duration=100)
+    def init_models(self, pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
+
+        from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
+        from pipeline.pipeline import FlowControlNetPipeline
+        from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
+
+        print('start loading models...')
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+
+        cmp = CMP_demo(
+            './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
+            42000
+        ).to(device)
+        cmp.requires_grad_(False)
+
+        self.cmp = cmp
+
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to(device, dtype=weight_dtype)
+        vae.to(device, dtype=weight_dtype)
+        unet.to(device, dtype=weight_dtype)
+        controlnet.to(device, dtype=weight_dtype)
+
+        if enable_xformers_memory_efficient_attention:
+            if is_xformers_available():
+                import xformers
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    print(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                unet.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError(
+                    "xformers is not available. Make sure it is installed correctly")
+
+        if allow_tf32:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(device)
+
+        self.pipeline = pipeline
+
+        print('models loaded.')
 
     def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
 
@@ -652,6 +646,14 @@ with gr.Blocks() as demo:
 
     target_size = 512
    DragNUWA_net = Drag(target_size, target_size)
+
+    DragNUWA_net.init_models(
+        "ckpts/stable-video-diffusion-img2vid-xt-1-1",
+        "ckpts/controlnet",
+        weight_dtype=torch.float16,
+        device='cuda'
+    )
+
     first_frame_path = gr.State()
     tracking_points = gr.State([])
     motion_brush_points = gr.State([])
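
The net effect of the diff: model loading moves out of module import time and into Drag.init_models, a @spaces.GPU(duration=100)-decorated method that stores the loaded FlowControlNetPipeline and CMP model on self.pipeline / self.cmp, and the Gradio block now calls DragNUWA_net.init_models(...) explicitly. A minimal sketch of that ZeroGPU pattern (not part of the commit), assuming the Hugging Face spaces package and using a toy torch.nn.Linear as a stand-in for the real models:

import spaces
import torch


class LazyModelHolder:
    def __init__(self):
        self.model = None  # populated by init_models(), mirroring self.pipeline / self.cmp above

    @spaces.GPU(duration=100)  # a ZeroGPU device is attached only while this method runs
    def init_models(self, weight_dtype=torch.float16, device='cuda'):
        # Stand-in for the pipeline/ControlNet/CMP loading done in app.py.
        model = torch.nn.Linear(4, 4)
        model.requires_grad_(False)
        self.model = model.to(device, dtype=weight_dtype)


holder = LazyModelHolder()
holder.init_models()  # analogous to DragNUWA_net.init_models(...) in the hunk above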