jadechoghari committed
Commit a7f9357
Parent(s): 7470108
Create utils.py

utils.py CHANGED
@@ -1,343 +1,60 @@
-import contextlib
-import random
-import numpy as np
-import os
-from glob import glob
-from PIL import Image, ImageSequence
-
 import torch
-from torchvision.io import read_video, write_video
-import torchvision.transforms as T
-
-from diffusers import DDIMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline, StableDiffusionDepth2ImgPipeline, ControlNetModel
-from .controlnet_utils import CONTROLNET_DICT, control_preprocess
 from einops import rearrange

-
-
-
-
-
-    use_depth = False
-    if model_key is None:
-        if sd_version == '2.1':
-            model_key = "stabilityai/stable-diffusion-2-1-base"
-        elif sd_version == '2.0':
-            model_key = "stabilityai/stable-diffusion-2-base"
-        elif sd_version == '1.5':
-            model_key = "runwayml/stable-diffusion-v1-5"
-        elif sd_version == 'depth':
-            model_key = "stabilityai/stable-diffusion-2-depth"
-            use_depth = True
-        else:
-            raise ValueError(
-                f'Stable-diffusion version {sd_version} not supported.')
-
-        print(f'[INFO] loading stable diffusion from: {model_key}')
-    else:
-        print(f'[INFO] loading custome model from: {model_key}')
-
-    scheduler = DDIMScheduler.from_pretrained(
-        model_key, subfolder="scheduler")
-
-    if weight_dtype == "fp16":
-        weight_dtype = torch.float16
-    else:
-        weight_dtype = torch.float32
-
-    if control_type not in ["none", "pnp"]:
-        controlnet_key = CONTROLNET_DICT[control_type]
-        print(f'[INFO] loading controlnet from: {controlnet_key}')
-        controlnet = ControlNetModel.from_pretrained(
-            controlnet_key, torch_dtype=weight_dtype)
-        print(f'[INFO] loaded controlnet!')
-        pipe = StableDiffusionControlNetPipeline.from_pretrained(
-            model_key, controlnet=controlnet, torch_dtype=weight_dtype
-        )
-    elif use_depth:
-        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
-            model_key, torch_dtype=weight_dtype
-        )
-    else:
-        pipe = StableDiffusionPipeline.from_pretrained(
-            # model_key, torch_dtype=weight_dtype
-            model_key, torch_dtype=weight_dtype,
-        )
-
-    return pipe.to(device), scheduler, model_key
-
-
-def seed_everything(seed):
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    random.seed(seed)
-    np.random.seed(seed)
-
-
-def load_image(image_path):
-    image = Image.open(image_path).convert('RGB')
-    image = T.ToTensor()(image)
-    return image.unsqueeze(0)
-
-
-def process_frames(frames, h, w):
-
-    fh, fw = frames.shape[-2:]
-    h = int(np.floor(h / 64.0)) * 64
-    w = int(np.floor(w / 64.0)) * 64
-
-    nw = int(fw / fh * h)
-    if nw >= w:
-        size = (h, nw)
-    else:
-        size = (int(fh / fw * w), w)
-
-    assert len(frames.shape) >= 3
-    if len(frames.shape) == 3:
-        frames = [frames]
-
-    print(
-        f"[INFO] frame size {(fh, fw)} resize to {size} and centercrop to {(h, w)}")
-
-    frame_ls = []
-    for frame in frames:
-        resized_frame = T.Resize(size)(frame)
-        cropped_frame = T.CenterCrop([h, w])(resized_frame)
-        # croped_frame = T.FiveCrop([h, w])(resized_frame)[0]
-        frame_ls.append(cropped_frame)
-    return torch.stack(frame_ls)
-
-
-def glob_frame_paths(video_path):
-    frame_paths = []
-    for ext in FRAME_EXT:
-        frame_paths += glob(os.path.join(video_path, f"*{ext}"))
-    frame_paths = sorted(frame_paths)
-    return frame_paths
-
-
-def load_video(video_path, h, w, frame_ids=None, device="cuda"):
-
-
-    if ".mp4" in video_path:
-        frames, _, _ = read_video(
-            video_path, output_format="TCHW", pts_unit="sec")
-        frames = frames / 255
-    elif ".gif" in video_path:
-        frames = Image.open(video_path)
-        frame_ls = []
-        for frame in ImageSequence.Iterator(frames):
-            frame_ls += [T.ToTensor()(frame.convert("RGB"))]
-        frames = torch.stack(frame_ls)
-    else:
-        frame_paths = glob_frame_paths(video_path)
-        frame_ls = []
-        for frame_path in frame_paths:
-            frame = load_image(frame_path)
-            frame_ls.append(frame)
-        frames = torch.cat(frame_ls)
-    if frame_ids is not None:
-        frames = frames[frame_ids]
-
-    print(f"[INFO] loaded video with {len(frames)} frames from: {video_path}")
-
-    frames = process_frames(frames, h, w)
-    return frames.to(device)
-
-
-def save_video(frames: torch.Tensor, path, frame_ids=None, save_frame=False):
-    os.makedirs(path, exist_ok=True)
-    if frame_ids is None:
-        frame_ids = [i for i in range(len(frames))]
-    frames = frames[frame_ids]
-
-    proc_frames = (rearrange(frames, "T C H W -> T H W C") * 255).to(torch.uint8).cpu()
-    write_video(os.path.join(path, "output.mp4"), proc_frames, fps = 30, video_codec="h264")
-    print(f"[INFO] save video to {os.path.join(path, 'output.mp4')}")
-
-    if save_frame:
-        save_frames(frames, os.path.join(path, "frames"), frame_ids = frame_ids)
-
-
-def save_frames(frames: torch.Tensor, path, ext="png", frame_ids=None):
-    os.makedirs(path, exist_ok=True)
-    if frame_ids is None:
-        frame_ids = [i for i in range(len(frames))]
-    for i, frame in zip(frame_ids, frames):
-        T.ToPILImage()(frame).save(
-            os.path.join(path, '{:04}.{}'.format(i, ext)))
-
-
-def load_latent(latent_path, t, frame_ids=None):
-    latent_fname = f'noisy_latents_{t}.pt'
-
-    lp = os.path.join(latent_path, latent_fname)
-    assert os.path.exists(
-        lp), f"Latent at timestep {t} not found in {latent_path}."
-
-    latents = torch.load(lp)
-    if frame_ids is not None:
-        latents = latents[frame_ids]

-    return latents

-
-
-
-    if os.path.exists(depth_path):
-        depth_map = torch.load(depth_path)
-    else:
-        input_image = T.ToPILImage()(input_image.squeeze())
-        depth_map = prepare_depth_map(
-            model, input_image, dtype=dtype, device=model.device)
-        torch.save(depth_map, depth_path)
-        depth_image = (((depth_map + 1.0) / 2.0) * 255).to(torch.uint8)
-        T.ToPILImage()(depth_image.squeeze()).convert(
-            "L").save(depth_path.replace(".pt", ".png"))
-
-    return depth_map
-
-@torch.no_grad()
-def prepare_depth_map(model, image, depth_map=None, batch_size=1, do_classifier_free_guidance=False, dtype=torch.float32, device="cuda"):
-    if isinstance(image, Image.Image):
-        image = [image]
-    else:
-        image = list(image)
-
-    if isinstance(image[0], Image.Image):
-        width, height = image[0].size
-    elif isinstance(image[0], np.ndarray):
-        width, height = image[0].shape[:-1]
-    else:
-        height, width = image[0].shape[-2:]
-
-    if depth_map is None:
-        pixel_values = model.feature_extractor(
-            images=image, return_tensors="pt").pixel_values
-        pixel_values = pixel_values.to(device=device)
-        # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
-        # So we use `torch.autocast` here for half precision inference.
-        context_manger = torch.autocast(
-            "cuda", dtype=dtype) if device.type == "cuda" else contextlib.nullcontext()
-        with context_manger:
-            ret = model.depth_estimator(pixel_values)
-            depth_map = ret.predicted_depth
-            # depth_image = ret.depth
-    else:
-        depth_map = depth_map.to(device=device, dtype=dtype)
-
-    indices = depth_map != -1
-    bg_indices = depth_map == -1
-    min_d = depth_map[indices].min()
-
-    if bg_indices.sum() > 0:
-        depth_map[bg_indices] = min_d - 10
-        # min_d = min_d - 10
-
-    depth_map = torch.nn.functional.interpolate(
-        depth_map.unsqueeze(1),
-        size=(height // model.vae_scale_factor,
-              width // model.vae_scale_factor),
-        mode="bicubic",
-        align_corners=False,
-    )
-
-    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
-    depth_map = depth_map.to(dtype)
-
-    # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-    if depth_map.shape[0] < batch_size:
-        repeat_by = batch_size // depth_map.shape[0]
-        depth_map = depth_map.repeat(repeat_by, 1, 1, 1)
-
-    depth_map = torch.cat(
-        [depth_map] * 2) if do_classifier_free_guidance else depth_map
-    return depth_map
-
-
-def get_latents_dir(latents_path, model_key):
-    model_key = model_key.split("/")[-1]
-    return os.path.join(latents_path, model_key)
-
-
-def get_controlnet_kwargs(controlnet, x, cond, t, controlnet_cond, controlnet_scale=1.0):
-    down_block_res_samples, mid_block_res_sample = controlnet(
-        x,
-        t,
-        encoder_hidden_states=cond,
-        controlnet_cond=controlnet_cond,
-        return_dict=False,
-    )
-    down_block_res_samples = [
-        down_block_res_sample * controlnet_scale
-        for down_block_res_sample in down_block_res_samples
-    ]
-    mid_block_res_sample *= controlnet_scale
-    controlnet_kwargs = {"down_block_additional_residuals": down_block_res_samples,
-                         "mid_block_additional_residual": mid_block_res_sample}
-    return controlnet_kwargs
-
-
-def get_frame_ids(frame_range, frame_ids=None):
-    if frame_ids is None:
-        frame_ids = list(range(*frame_range))
-    frame_ids = sorted(frame_ids)
-
-    if len(frame_ids) > 4:
-        frame_ids_str = "{} {} ... {} {}".format(
-            *frame_ids[:2], *frame_ids[-2:])
-    else:
-        frame_ids_str = " ".join(["{}"] * len(frame_ids)).format(*frame_ids)
-    print("[INFO] frame indexes: ", frame_ids_str)
-    return frame_ids
-
-
-def prepare_control(control, frames, frame_ids, save_path):
-    if control not in CONTROLNET_DICT.keys():
-        print(f"[WARNING] unknown controlnet type {control}")
-        return None
-
-    control_subdir = f'{save_path}/{control}_image'
-
-    preprocess_flag = True
-    if os.path.exists(control_subdir):
-        print(f"[INFO] load control image from {control_subdir}.")
-        control_image_ls = []
-        for frame_id in frame_ids:
-            image_path = os.path.join(
-                control_subdir, "{:04}.png".format(frame_id))
-            if not os.path.exists(image_path):
-                break
-            control_image_ls += [load_image(image_path)]
     else:
-
-
 import torch
 from einops import rearrange

+def isinstance_str(x: object, cls_name: str):
+    """
+    Checks whether x has any class *named* cls_name in its ancestry.
+    Doesn't require access to the class's implementation.
+
+    Useful for patching!
+    """
+
+    for _cls in x.__class__.__mro__:
+        if _cls.__name__ == cls_name:
+            return True
+
+    return False
+
+def init_generator(device: torch.device, fallback: torch.Generator=None):
+    """
+    Forks the current default random generator given device.
+    """
+    if device.type == "cpu":
+        return torch.Generator(device="cpu").set_state(torch.get_rng_state())
+    elif device.type == "cuda":
+        return torch.Generator(device=device).set_state(torch.cuda.get_rng_state())
+    else:
+        if fallback is None:
+            return init_generator(torch.device("cpu"))
+        else:
+            return fallback
+
+def join_frame(x, fsize):
+    """ Join multi-frame tokens """
+    x = rearrange(x, "(B F) N C -> B (F N) C", F=fsize)
+    return x
+
+def split_frame(x, fsize):
+    """ Split multi-frame tokens """
+    x = rearrange(x, "B (F N) C -> (B F) N C", F=fsize)
+    return x
+
+def func_warper(funcs):
+    """ Warp a function sequence """
+    def fn(x, **kwarg):
+        for func in funcs:
+            x = func(x, **kwarg)
+        return x
+    return fn
+
+def join_warper(fsize):
+    def fn(x):
+        x = join_frame(x, fsize)
+        return x
+    return fn
+
+def split_warper(fsize):
+    def fn(x):
+        x = split_frame(x, fsize)
+        return x
+    return fn
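
The two general-purpose helpers added above are easiest to read next to a usage sketch. The snippet below is illustrative and not part of the commit: it assumes this file is importable as `utils`, and the `BasicTransformerBlock` class defined here is a hypothetical stand-in, created only so the name-based match has a target.

import torch
import torch.nn as nn

from utils import isinstance_str, init_generator  # assumes utils.py is importable as "utils"

class BasicTransformerBlock(nn.Module):  # hypothetical stand-in for a third-party block
    def forward(self, x):
        return x

model = nn.Sequential(BasicTransformerBlock(), nn.Linear(4, 4))

# isinstance_str matches by class *name* anywhere in the MRO, so a caller can
# locate third-party modules to patch without importing their classes.
blocks = [m for m in model.modules() if isinstance_str(m, "BasicTransformerBlock")]
print(f"matched {len(blocks)} block(s)")  # -> matched 1 block(s)

# init_generator forks the current default RNG state into an explicit generator,
# keeping later sampling reproducible without touching the global seed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = init_generator(device)
noise = torch.randn(1, 4, 64, 64, generator=generator, device=device)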
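
The frame-token helpers are pure shape manipulations, so a round trip makes the conventions concrete. Another hedged sketch, not from the commit: the batch/frame/token sizes are made up, and `utils` is again assumed to be the importable module name.

import torch

from utils import join_frame, split_frame, func_warper, join_warper, split_warper

B, F, N, C = 2, 8, 77, 320           # batch, frames, tokens per frame, channels (illustrative)
x = torch.randn(B * F, N, C)         # per-frame tokens laid out as (B*F, N, C)

joined = join_frame(x, fsize=F)      # -> (B, F*N, C): one token axis spanning all frames
assert joined.shape == (B, F * N, C)

restored = split_frame(joined, fsize=F)   # -> (B*F, N, C): back to per-frame tokens
assert torch.equal(restored, x)

# func_warper chains single-argument callables left to right, so composing the
# join and split warpers is an identity on x.
roundtrip = func_warper([join_warper(F), split_warper(F)])
assert torch.equal(roundtrip(x), x)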