jadechoghari committed
Commit a7f9357
1 Parent(s): 7470108

Create utils.py

Files changed (1)
  utils.py +53 -336
utils.py CHANGED
@@ -1,343 +1,60 @@
- import contextlib
- import random
- import numpy as np
- import os
- from glob import glob
- from PIL import Image, ImageSequence
-
  import torch
- from torchvision.io import read_video, write_video
- import torchvision.transforms as T
-
- from diffusers import DDIMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline, StableDiffusionDepth2ImgPipeline, ControlNetModel
- from .controlnet_utils import CONTROLNET_DICT, control_preprocess
  from einops import rearrange

- FRAME_EXT = [".jpg", ".png"]
-
-
- def init_model(device="cuda", sd_version="1.5", model_key=None, control_type="none", weight_dtype="fp16"):
-
-     use_depth = False
-     if model_key is None:
-         if sd_version == '2.1':
-             model_key = "stabilityai/stable-diffusion-2-1-base"
-         elif sd_version == '2.0':
-             model_key = "stabilityai/stable-diffusion-2-base"
-         elif sd_version == '1.5':
-             model_key = "runwayml/stable-diffusion-v1-5"
-         elif sd_version == 'depth':
-             model_key = "stabilityai/stable-diffusion-2-depth"
-             use_depth = True
-         else:
-             raise ValueError(
-                 f'Stable-diffusion version {sd_version} not supported.')
-
-         print(f'[INFO] loading stable diffusion from: {model_key}')
-     else:
-         print(f'[INFO] loading custome model from: {model_key}')
-
-     scheduler = DDIMScheduler.from_pretrained(
-         model_key, subfolder="scheduler")
-
-     if weight_dtype == "fp16":
-         weight_dtype = torch.float16
-     else:
-         weight_dtype = torch.float32
-
-     if control_type not in ["none", "pnp"]:
-         controlnet_key = CONTROLNET_DICT[control_type]
-         print(f'[INFO] loading controlnet from: {controlnet_key}')
-         controlnet = ControlNetModel.from_pretrained(
-             controlnet_key, torch_dtype=weight_dtype)
-         print(f'[INFO] loaded controlnet!')
-         pipe = StableDiffusionControlNetPipeline.from_pretrained(
-             model_key, controlnet=controlnet, torch_dtype=weight_dtype
-         )
-     elif use_depth:
-         pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
-             model_key, torch_dtype=weight_dtype
-         )
-     else:
-         pipe = StableDiffusionPipeline.from_pretrained(
-             # model_key, torch_dtype=weight_dtype
-             model_key, torch_dtype=weight_dtype,
-         )
-
-     return pipe.to(device), scheduler, model_key
-
-
- def seed_everything(seed):
-     torch.manual_seed(seed)
-     torch.cuda.manual_seed(seed)
-     random.seed(seed)
-     np.random.seed(seed)
-
-
- def load_image(image_path):
-     image = Image.open(image_path).convert('RGB')
-     image = T.ToTensor()(image)
-     return image.unsqueeze(0)
-
-
- def process_frames(frames, h, w):
-
-     fh, fw = frames.shape[-2:]
-     h = int(np.floor(h / 64.0)) * 64
-     w = int(np.floor(w / 64.0)) * 64
-
-     nw = int(fw / fh * h)
-     if nw >= w:
-         size = (h, nw)
-     else:
-         size = (int(fh / fw * w), w)
-
-     assert len(frames.shape) >= 3
-     if len(frames.shape) == 3:
-         frames = [frames]
-
-     print(
-         f"[INFO] frame size {(fh, fw)} resize to {size} and centercrop to {(h, w)}")
-
-     frame_ls = []
-     for frame in frames:
-         resized_frame = T.Resize(size)(frame)
-         cropped_frame = T.CenterCrop([h, w])(resized_frame)
-         # croped_frame = T.FiveCrop([h, w])(resized_frame)[0]
-         frame_ls.append(cropped_frame)
-     return torch.stack(frame_ls)
-
-
- def glob_frame_paths(video_path):
-     frame_paths = []
-     for ext in FRAME_EXT:
-         frame_paths += glob(os.path.join(video_path, f"*{ext}"))
-     frame_paths = sorted(frame_paths)
-     return frame_paths
-
-
- def load_video(video_path, h, w, frame_ids=None, device="cuda"):
-
-
-     if ".mp4" in video_path:
-         frames, _, _ = read_video(
-             video_path, output_format="TCHW", pts_unit="sec")
-         frames = frames / 255
-     elif ".gif" in video_path:
-         frames = Image.open(video_path)
-         frame_ls = []
-         for frame in ImageSequence.Iterator(frames):
-             frame_ls += [T.ToTensor()(frame.convert("RGB"))]
-         frames = torch.stack(frame_ls)
-     else:
-         frame_paths = glob_frame_paths(video_path)
-         frame_ls = []
-         for frame_path in frame_paths:
-             frame = load_image(frame_path)
-             frame_ls.append(frame)
-         frames = torch.cat(frame_ls)
-     if frame_ids is not None:
-         frames = frames[frame_ids]
-
-     print(f"[INFO] loaded video with {len(frames)} frames from: {video_path}")
-
-     frames = process_frames(frames, h, w)
-     return frames.to(device)
-
-
- def save_video(frames: torch.Tensor, path, frame_ids=None, save_frame=False):
-     os.makedirs(path, exist_ok=True)
-     if frame_ids is None:
-         frame_ids = [i for i in range(len(frames))]
-     frames = frames[frame_ids]
-
-     proc_frames = (rearrange(frames, "T C H W -> T H W C") * 255).to(torch.uint8).cpu()
-     write_video(os.path.join(path, "output.mp4"), proc_frames, fps = 30, video_codec="h264")
-     print(f"[INFO] save video to {os.path.join(path, 'output.mp4')}")
-
-     if save_frame:
-         save_frames(frames, os.path.join(path, "frames"), frame_ids = frame_ids)
-
-
- def save_frames(frames: torch.Tensor, path, ext="png", frame_ids=None):
-     os.makedirs(path, exist_ok=True)
-     if frame_ids is None:
-         frame_ids = [i for i in range(len(frames))]
-     for i, frame in zip(frame_ids, frames):
-         T.ToPILImage()(frame).save(
-             os.path.join(path, '{:04}.{}'.format(i, ext)))
-
-
- def load_latent(latent_path, t, frame_ids=None):
-     latent_fname = f'noisy_latents_{t}.pt'
-
-     lp = os.path.join(latent_path, latent_fname)
-     assert os.path.exists(
-         lp), f"Latent at timestep {t} not found in {latent_path}."
-
-     latents = torch.load(lp)
-     if frame_ids is not None:
-         latents = latents[frame_ids]

-     # print(f"[INFO] loaded initial latent from {lp}")
-
-     return latents

- @torch.no_grad()
- def prepare_depth(pipe, frames, frame_ids, work_dir):

-     depth_ls = []
-     depth_dir = os.path.join(work_dir, "depth")
-     os.makedirs(depth_dir, exist_ok=True)
-     for frame, frame_id in zip(frames, frame_ids):
-         depth_path = os.path.join(depth_dir, "{:04}.pt".format(frame_id))
-         depth = load_depth(pipe, depth_path, frame)
-         depth_ls += [depth]
-     print(f"[INFO] loaded depth images from {depth_path}")
-     return torch.cat(depth_ls)
-
- # From pix2video: code/file_utils.py
-
- def load_depth(model, depth_path, input_image, dtype=torch.float32):
-     if os.path.exists(depth_path):
-         depth_map = torch.load(depth_path)
-     else:
-         input_image = T.ToPILImage()(input_image.squeeze())
-         depth_map = prepare_depth_map(
-             model, input_image, dtype=dtype, device=model.device)
-         torch.save(depth_map, depth_path)
-         depth_image = (((depth_map + 1.0) / 2.0) * 255).to(torch.uint8)
-         T.ToPILImage()(depth_image.squeeze()).convert(
-             "L").save(depth_path.replace(".pt", ".png"))
-
-     return depth_map
-
- @torch.no_grad()
- def prepare_depth_map(model, image, depth_map=None, batch_size=1, do_classifier_free_guidance=False, dtype=torch.float32, device="cuda"):
-     if isinstance(image, Image.Image):
-         image = [image]
-     else:
-         image = list(image)
-
-     if isinstance(image[0], Image.Image):
-         width, height = image[0].size
-     elif isinstance(image[0], np.ndarray):
-         width, height = image[0].shape[:-1]
-     else:
-         height, width = image[0].shape[-2:]
-
-     if depth_map is None:
-         pixel_values = model.feature_extractor(
-             images=image, return_tensors="pt").pixel_values
-         pixel_values = pixel_values.to(device=device)
-         # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
-         # So we use `torch.autocast` here for half precision inference.
-         context_manger = torch.autocast(
-             "cuda", dtype=dtype) if device.type == "cuda" else contextlib.nullcontext()
-         with context_manger:
-             ret = model.depth_estimator(pixel_values)
-             depth_map = ret.predicted_depth
-             # depth_image = ret.depth
-     else:
-         depth_map = depth_map.to(device=device, dtype=dtype)
-
-     indices = depth_map != -1
-     bg_indices = depth_map == -1
-     min_d = depth_map[indices].min()
-
-     if bg_indices.sum() > 0:
-         depth_map[bg_indices] = min_d - 10
-         # min_d = min_d - 10
-
-     depth_map = torch.nn.functional.interpolate(
-         depth_map.unsqueeze(1),
-         size=(height // model.vae_scale_factor,
-               width // model.vae_scale_factor),
-         mode="bicubic",
-         align_corners=False,
-     )
-
-     depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
-     depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
-     depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
-     depth_map = depth_map.to(dtype)
-
-     # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-     if depth_map.shape[0] < batch_size:
-         repeat_by = batch_size // depth_map.shape[0]
-         depth_map = depth_map.repeat(repeat_by, 1, 1, 1)
-
-     depth_map = torch.cat(
-         [depth_map] * 2) if do_classifier_free_guidance else depth_map
-     return depth_map
-
-
- def get_latents_dir(latents_path, model_key):
-     model_key = model_key.split("/")[-1]
-     return os.path.join(latents_path, model_key)
-
-
- def get_controlnet_kwargs(controlnet, x, cond, t, controlnet_cond, controlnet_scale=1.0):
-     down_block_res_samples, mid_block_res_sample = controlnet(
-         x,
-         t,
-         encoder_hidden_states=cond,
-         controlnet_cond=controlnet_cond,
-         return_dict=False,
-     )
-     down_block_res_samples = [
-         down_block_res_sample * controlnet_scale
-         for down_block_res_sample in down_block_res_samples
-     ]
-     mid_block_res_sample *= controlnet_scale
-     controlnet_kwargs = {"down_block_additional_residuals": down_block_res_samples,
-                          "mid_block_additional_residual": mid_block_res_sample}
-     return controlnet_kwargs
-
-
- def get_frame_ids(frame_range, frame_ids=None):
-     if frame_ids is None:
-         frame_ids = list(range(*frame_range))
-     frame_ids = sorted(frame_ids)
-
-     if len(frame_ids) > 4:
-         frame_ids_str = "{} {} ... {} {}".format(
-             *frame_ids[:2], *frame_ids[-2:])
-     else:
-         frame_ids_str = " ".join(["{}"] * len(frame_ids)).format(*frame_ids)
-     print("[INFO] frame indexes: ", frame_ids_str)
-     return frame_ids
-
-
- def prepare_control(control, frames, frame_ids, save_path):
-     if control not in CONTROLNET_DICT.keys():
-         print(f"[WARNING] unknown controlnet type {control}")
-         return None
-
-     control_subdir = f'{save_path}/{control}_image'
-
-     preprocess_flag = True
-     if os.path.exists(control_subdir):
-         print(f"[INFO] load control image from {control_subdir}.")
-         control_image_ls = []
-         for frame_id in frame_ids:
-             image_path = os.path.join(
-                 control_subdir, "{:04}.png".format(frame_id))
-             if not os.path.exists(image_path):
-                 break
-             control_image_ls += [load_image(image_path)]
          else:
-             preprocess_flag = False
-             control_images = torch.cat(control_image_ls)
-
-     if preprocess_flag:
-         print("[INFO] preprocessing control images...")
-         control_images = control_preprocess(frames, control)
-         print(f"[INFO] save control images to {control_subdir}.")
-         os.makedirs(control_subdir, exist_ok=True)
-         for image, frame_id in zip(control_images, frame_ids):
-             image_path = os.path.join(
-                 control_subdir, "{:04}.png".format(frame_id))
-             T.ToPILImage()(image).save(image_path)
-
-     return control_images
  import torch
  from einops import rearrange

+ def isinstance_str(x: object, cls_name: str):
+     """
+     Checks whether x has any class *named* cls_name in its ancestry.
+     Doesn't require access to the class's implementation.
+
+     Useful for patching!
+     """
+
+     for _cls in x.__class__.__mro__:
+         if _cls.__name__ == cls_name:
+             return True
+
+     return False
+
+ def init_generator(device: torch.device, fallback: torch.Generator=None):
+     """
+     Forks the current default random generator given device.
+     """
+     if device.type == "cpu":
+         return torch.Generator(device="cpu").set_state(torch.get_rng_state())
+     elif device.type == "cuda":
+         return torch.Generator(device=device).set_state(torch.cuda.get_rng_state())
+     else:
+         if fallback is None:
+             return init_generator(torch.device("cpu"))
          else:
+             return fallback
+
+ def join_frame(x, fsize):
+     """ Join multi-frame tokens """
+     x = rearrange(x, "(B F) N C -> B (F N) C", F=fsize)
+     return x
+
+ def split_frame(x, fsize):
+     """ Split multi-frame tokens """
+     x = rearrange(x, "B (F N) C -> (B F) N C", F=fsize)
+     return x
+
+ def func_warper(funcs):
+     """ Warp a function sequence """
+     def fn(x, **kwarg):
+         for func in funcs:
+             x = func(x, **kwarg)
+         return x
+     return fn
+
+ def join_warper(fsize):
+     def fn(x):
+         x = join_frame(x, fsize)
+         return x
+     return fn
+
+ def split_warper(fsize):
+     def fn(x):
+         x = split_frame(x, fsize)
+         return x
+     return fn
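
The new helpers are compact but a bit abstract, so here is a minimal usage sketch (illustrative only, not part of the commit; the `from utils import ...` path assumes this file is importable as `utils`) showing what `isinstance_str` and `init_generator` do:

import torch
from utils import isinstance_str, init_generator  # assumed import path for the new module

# isinstance_str matches on class *names* anywhere in the MRO, so it works even
# when the concrete class object is not importable (handy for monkey-patching).
layer = torch.nn.Linear(2, 2)
assert isinstance_str(layer, "Linear")      # exact class name
assert isinstance_str(layer, "Module")      # ancestor class name
assert not isinstance_str(layer, "Conv2d")  # unrelated class

# init_generator copies the current default RNG state into a torch.Generator
# bound to the given device, so sampling from it is reproducible and does not
# advance the global RNG.
gen = init_generator(torch.device("cpu"))
noise = torch.randn(4, generator=gen)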
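
A similar illustrative sketch (again, not part of the commit; shapes are made up) for the frame-token helpers, which move between per-frame tokens of shape (B*F, N, C) and joined multi-frame tokens of shape (B, F*N, C):

import torch
from utils import join_frame, func_warper, join_warper, split_warper  # assumed import path

fsize = 4                           # frames per batch
x = torch.randn(fsize, 77, 320)     # (B*F, N, C) with B = 1

joined = join_frame(x, fsize)       # concatenate the tokens of all frames
assert joined.shape == (1, fsize * 77, 320)

# join_warper/split_warper build closures with the frame count baked in, and
# func_warper chains them; joining then splitting recovers the original layout.
roundtrip = func_warper([join_warper(fsize), split_warper(fsize)])
assert torch.equal(roundtrip(x), x)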