jadechoghari committed
Commit 2899431
1 Parent(s): da7256e

Create utils.py

Files changed (1)
  1. utils.py +343 -0
utils.py ADDED
@@ -0,0 +1,343 @@
import contextlib
import random
import numpy as np
import os
from glob import glob
from PIL import Image, ImageSequence

import torch
from torchvision.io import read_video, write_video
import torchvision.transforms as T

from diffusers import DDIMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline, StableDiffusionDepth2ImgPipeline, ControlNetModel
from .controlnet_utils import CONTROLNET_DICT, control_preprocess
from einops import rearrange

FRAME_EXT = [".jpg", ".png"]

def init_model(device="cuda", sd_version="1.5", model_key=None, control_type="none", weight_dtype="fp16"):

    use_depth = False
    if model_key is None:
        if sd_version == '2.1':
            model_key = "stabilityai/stable-diffusion-2-1-base"
        elif sd_version == '2.0':
            model_key = "stabilityai/stable-diffusion-2-base"
        elif sd_version == '1.5':
            model_key = "runwayml/stable-diffusion-v1-5"
        elif sd_version == 'depth':
            model_key = "stabilityai/stable-diffusion-2-depth"
            use_depth = True
        else:
            raise ValueError(
                f'Stable-diffusion version {sd_version} not supported.')

        print(f'[INFO] loading stable diffusion from: {model_key}')
    else:
        print(f'[INFO] loading custom model from: {model_key}')

    scheduler = DDIMScheduler.from_pretrained(
        model_key, subfolder="scheduler")

    if weight_dtype == "fp16":
        weight_dtype = torch.float16
    else:
        weight_dtype = torch.float32

    if control_type not in ["none", "pnp"]:
        controlnet_key = CONTROLNET_DICT[control_type]
        print(f'[INFO] loading controlnet from: {controlnet_key}')
        controlnet = ControlNetModel.from_pretrained(
            controlnet_key, torch_dtype=weight_dtype)
        print('[INFO] loaded controlnet!')
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            model_key, controlnet=controlnet, torch_dtype=weight_dtype
        )
    elif use_depth:
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            model_key, torch_dtype=weight_dtype
        )
    else:
        pipe = StableDiffusionPipeline.from_pretrained(
            model_key, torch_dtype=weight_dtype,
        )

    return pipe.to(device), scheduler, model_key

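# Usage sketch (not part of the original file; the SD 1.5 / fp16 defaults and
# the "depth" control type below are illustrative assumptions):
#
#   pipe, scheduler, model_key = init_model(
#       device="cuda", sd_version="1.5", control_type="depth", weight_dtype="fp16")
#
# Here `pipe` is a StableDiffusionControlNetPipeline because control_type is
# neither "none" nor "pnp"; with control_type="none" it would be a plain
# StableDiffusionPipeline, and sd_version="depth" yields a depth2img pipeline.
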
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

def load_image(image_path):
    image = Image.open(image_path).convert('RGB')
    image = T.ToTensor()(image)
    return image.unsqueeze(0)

def process_frames(frames, h, w):

    fh, fw = frames.shape[-2:]
    h = int(np.floor(h / 64.0)) * 64
    w = int(np.floor(w / 64.0)) * 64

    nw = int(fw / fh * h)
    if nw >= w:
        size = (h, nw)
    else:
        size = (int(fh / fw * w), w)

    assert len(frames.shape) >= 3
    if len(frames.shape) == 3:
        frames = [frames]

    print(
        f"[INFO] frame size {(fh, fw)} resize to {size} and center-crop to {(h, w)}")

    frame_ls = []
    for frame in frames:
        resized_frame = T.Resize(size)(frame)
        cropped_frame = T.CenterCrop([h, w])(resized_frame)
        # cropped_frame = T.FiveCrop([h, w])(resized_frame)[0]
        frame_ls.append(cropped_frame)
    return torch.stack(frame_ls)

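# Worked example (hypothetical numbers): for 480x854 input frames and a
# requested (h, w) = (512, 512), h and w stay 512 (already multiples of 64);
# the aspect-preserving width is nw = int(854 / 480 * 512) = 910 >= 512, so
# each frame is resized to (512, 910) and then center-cropped to (512, 512).
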
def glob_frame_paths(video_path):
    frame_paths = []
    for ext in FRAME_EXT:
        frame_paths += glob(os.path.join(video_path, f"*{ext}"))
    frame_paths = sorted(frame_paths)
    return frame_paths

def load_video(video_path, h, w, frame_ids=None, device="cuda"):

    if ".mp4" in video_path:
        frames, _, _ = read_video(
            video_path, output_format="TCHW", pts_unit="sec")
        frames = frames / 255
    elif ".gif" in video_path:
        frames = Image.open(video_path)
        frame_ls = []
        for frame in ImageSequence.Iterator(frames):
            frame_ls += [T.ToTensor()(frame.convert("RGB"))]
        frames = torch.stack(frame_ls)
    else:
        frame_paths = glob_frame_paths(video_path)
        frame_ls = []
        for frame_path in frame_paths:
            frame = load_image(frame_path)
            frame_ls.append(frame)
        frames = torch.cat(frame_ls)
    if frame_ids is not None:
        frames = frames[frame_ids]

    print(f"[INFO] loaded video with {len(frames)} frames from: {video_path}")

    frames = process_frames(frames, h, w)
    return frames.to(device)

def save_video(frames: torch.Tensor, path, frame_ids=None, save_frame=False):
    os.makedirs(path, exist_ok=True)
    if frame_ids is None:
        frame_ids = list(range(len(frames)))
    frames = frames[frame_ids]

    proc_frames = (rearrange(frames, "T C H W -> T H W C") * 255).to(torch.uint8).cpu()
    write_video(os.path.join(path, "output.mp4"), proc_frames, fps=30, video_codec="h264")
    print(f"[INFO] save video to {os.path.join(path, 'output.mp4')}")

    if save_frame:
        save_frames(frames, os.path.join(path, "frames"), frame_ids=frame_ids)

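# Round-trip sketch (paths are hypothetical; frames are float tensors in [0, 1]):
#
#   frames = load_video("data/input.mp4", h=512, w=512, frame_ids=list(range(16)))
#   save_video(frames, "outputs/run1", save_frame=True)
#
# save_video rescales to uint8 and writes outputs/run1/output.mp4 at 30 fps;
# with save_frame=True the individual frames also land in outputs/run1/frames.
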
def save_frames(frames: torch.Tensor, path, ext="png", frame_ids=None):
    os.makedirs(path, exist_ok=True)
    if frame_ids is None:
        frame_ids = list(range(len(frames)))
    for i, frame in zip(frame_ids, frames):
        T.ToPILImage()(frame).save(
            os.path.join(path, '{:04}.{}'.format(i, ext)))

def load_latent(latent_path, t, frame_ids=None):
    latent_fname = f'noisy_latents_{t}.pt'

    lp = os.path.join(latent_path, latent_fname)
    assert os.path.exists(
        lp), f"Latent at timestep {t} not found in {latent_path}."

    latents = torch.load(lp)
    if frame_ids is not None:
        latents = latents[frame_ids]

    return latents

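# Expected layout (an assumption inferred from the filename pattern above): an
# inversion stage presumably dumps one latent tensor per DDIM timestep, e.g.
#
#   latents/noisy_latents_981.pt
#   latents/noisy_latents_961.pt
#   ...
#
# so load_latent("latents", t=981, frame_ids=[0, 4, 8]) would return the
# stored latents for those three frames at timestep 981.
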
@torch.no_grad()
def prepare_depth(pipe, frames, frame_ids, work_dir):

    depth_ls = []
    depth_dir = os.path.join(work_dir, "depth")
    os.makedirs(depth_dir, exist_ok=True)
    for frame, frame_id in zip(frames, frame_ids):
        depth_path = os.path.join(depth_dir, "{:04}.pt".format(frame_id))
        depth = load_depth(pipe, depth_path, frame)
        depth_ls += [depth]
    print(f"[INFO] loaded depth images from {depth_dir}")
    return torch.cat(depth_ls)

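# Usage sketch (hypothetical paths; assumes `pipe` is the depth2img pipeline
# returned by init_model(sd_version="depth"), which carries the
# feature_extractor and depth_estimator used below):
#
#   depth = prepare_depth(pipe, frames, frame_ids, work_dir="outputs/run1")
#
# Depth maps are cached as outputs/run1/depth/0000.pt (plus .png previews),
# so a second call reloads them from disk instead of re-running the estimator.
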
# From pix2video: code/file_utils.py
def load_depth(model, depth_path, input_image, dtype=torch.float32):
    if os.path.exists(depth_path):
        depth_map = torch.load(depth_path)
    else:
        input_image = T.ToPILImage()(input_image.squeeze())
        depth_map = prepare_depth_map(
            model, input_image, dtype=dtype, device=model.device)
        torch.save(depth_map, depth_path)
        depth_image = (((depth_map + 1.0) / 2.0) * 255).to(torch.uint8)
        T.ToPILImage()(depth_image.squeeze()).convert(
            "L").save(depth_path.replace(".pt", ".png"))

    return depth_map

@torch.no_grad()
def prepare_depth_map(model, image, depth_map=None, batch_size=1, do_classifier_free_guidance=False, dtype=torch.float32, device="cuda"):
    if isinstance(image, Image.Image):
        image = [image]
    else:
        image = list(image)

    if isinstance(image[0], Image.Image):
        width, height = image[0].size
    elif isinstance(image[0], np.ndarray):
        width, height = image[0].shape[:-1]
    else:
        height, width = image[0].shape[-2:]

    if depth_map is None:
        pixel_values = model.feature_extractor(
            images=image, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device=device)
        # The DPT-Hybrid model uses batch-norm layers which are not compatible
        # with fp16, so we use `torch.autocast` here for half-precision inference.
        # Normalize `device` first: it may arrive as a str or a torch.device.
        context_manager = torch.autocast(
            "cuda", dtype=dtype) if torch.device(device).type == "cuda" else contextlib.nullcontext()
        with context_manager:
            ret = model.depth_estimator(pixel_values)
            depth_map = ret.predicted_depth
    else:
        depth_map = depth_map.to(device=device, dtype=dtype)

    # Pixels marked -1 are background: push them just below the foreground minimum.
    indices = depth_map != -1
    bg_indices = depth_map == -1
    min_d = depth_map[indices].min()

    if bg_indices.sum() > 0:
        depth_map[bg_indices] = min_d - 10

    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(height // model.vae_scale_factor,
              width // model.vae_scale_factor),
        mode="bicubic",
        align_corners=False,
    )

    # Normalize each depth map to [-1, 1].
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
    depth_map = depth_map.to(dtype)

    # Duplicate the depth map for each generation per prompt, using an MPS-friendly method.
    if depth_map.shape[0] < batch_size:
        repeat_by = batch_size // depth_map.shape[0]
        depth_map = depth_map.repeat(repeat_by, 1, 1, 1)

    depth_map = torch.cat(
        [depth_map] * 2) if do_classifier_free_guidance else depth_map
    return depth_map

def get_latents_dir(latents_path, model_key):
    model_key = model_key.split("/")[-1]
    return os.path.join(latents_path, model_key)

def get_controlnet_kwargs(controlnet, x, cond, t, controlnet_cond, controlnet_scale=1.0):
    down_block_res_samples, mid_block_res_sample = controlnet(
        x,
        t,
        encoder_hidden_states=cond,
        controlnet_cond=controlnet_cond,
        return_dict=False,
    )
    down_block_res_samples = [
        down_block_res_sample * controlnet_scale
        for down_block_res_sample in down_block_res_samples
    ]
    mid_block_res_sample *= controlnet_scale
    controlnet_kwargs = {"down_block_additional_residuals": down_block_res_samples,
                         "mid_block_additional_residual": mid_block_res_sample}
    return controlnet_kwargs

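# Usage sketch (an assumption about the calling code, following the standard
# diffusers pattern of feeding the ControlNet residuals into the UNet):
#
#   controlnet_kwargs = get_controlnet_kwargs(
#       pipe.controlnet, latents, text_embeds, t, control_images, controlnet_scale=1.0)
#   noise_pred = pipe.unet(
#       latents, t, encoder_hidden_states=text_embeds, **controlnet_kwargs).sample
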
def get_frame_ids(frame_range, frame_ids=None):
    if frame_ids is None:
        frame_ids = list(range(*frame_range))
    frame_ids = sorted(frame_ids)

    if len(frame_ids) > 4:
        frame_ids_str = "{} {} ... {} {}".format(
            *frame_ids[:2], *frame_ids[-2:])
    else:
        frame_ids_str = " ".join(["{}"] * len(frame_ids)).format(*frame_ids)
    print("[INFO] frame indexes: ", frame_ids_str)
    return frame_ids

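# Example (hypothetical range): get_frame_ids((0, 32, 4)) returns
# [0, 4, 8, 12, 16, 20, 24, 28] and prints "[INFO] frame indexes: 0 4 ... 24 28".
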
def prepare_control(control, frames, frame_ids, save_path):
    if control not in CONTROLNET_DICT:
        print(f"[WARNING] unknown controlnet type {control}")
        return None

    control_subdir = f'{save_path}/{control}_image'

    preprocess_flag = True
    if os.path.exists(control_subdir):
        print(f"[INFO] load control image from {control_subdir}.")
        control_image_ls = []
        for frame_id in frame_ids:
            image_path = os.path.join(
                control_subdir, "{:04}.png".format(frame_id))
            if not os.path.exists(image_path):
                break
            control_image_ls += [load_image(image_path)]
        else:
            # for/else: this branch runs only when every frame was found on
            # disk (i.e. the loop finished without hitting `break`).
            preprocess_flag = False
            control_images = torch.cat(control_image_ls)

    if preprocess_flag:
        print("[INFO] preprocessing control images...")
        control_images = control_preprocess(frames, control)
        print(f"[INFO] save control images to {control_subdir}.")
        os.makedirs(control_subdir, exist_ok=True)
        for image, frame_id in zip(control_images, frame_ids):
            image_path = os.path.join(
                control_subdir, "{:04}.png".format(frame_id))
            T.ToPILImage()(image).save(image_path)

    return control_images
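
# End-to-end sketch (hypothetical driver code tying the helpers together; the
# "depth" key is assumed to exist in CONTROLNET_DICT from controlnet_utils):
#
#   frame_ids = get_frame_ids((0, 16))
#   frames = load_video("data/input.mp4", h=512, w=512, frame_ids=frame_ids)
#   pipe, scheduler, model_key = init_model(control_type="depth")
#   control_images = prepare_control("depth", frames, frame_ids, "outputs/run1")
#
# On the first run the control images are computed via control_preprocess and
# cached under outputs/run1/depth_image; later runs reload them from disk.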