guardiancc committed on
Commit c63eb09 · verified · 1 Parent(s): 6ce97fd

Create bckp.py

Files changed (1):
  1. mimicmotion/pipelines/bckp.py +618 -0
mimicmotion/pipelines/bckp.py ADDED
@@ -0,0 +1,618 @@
import inspect
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union

import PIL.Image
import einops
import numpy as np
import torch
from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion \
    import _resize_with_antialiasing, _append_dims
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import BaseOutput, logging
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from ..modules.pose_net import PoseNet

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def _append_dims(x, target_dims):
    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    return x[(...,) + (None,) * dims_to_append]
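    # e.g. _append_dims(torch.ones(2, 3), 5) returns a view of shape (2, 3, 1, 1, 1); this is used
    # below to broadcast the per-frame guidance scale against the 5-D video latents.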


# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
    batch_size, channels, num_frames, height, width = video.shape
    outputs = []
    for batch_idx in range(batch_size):
        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
        batch_output = processor.postprocess(batch_vid, output_type)

        outputs.append(batch_output)

    if output_type == "np":
        outputs = np.stack(outputs)

    elif output_type == "pt":
        outputs = torch.stack(outputs)

    elif not output_type == "pil":
        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")

    return outputs
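    # `video` is laid out as (batch, channels, frames, height, width); each batch element is
    # post-processed frame by frame, so "pil" yields a list of per-video PIL image lists while
    # "np" / "pt" return stacked arrays / tensors with frames as the second dimension.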


@dataclass
class MimicMotionPipelineOutput(BaseOutput):
    r"""
    Output class for mimicmotion pipeline.
    Args:
        frames (`List[List[PIL.Image.Image]]`, `np.ndarray` or `torch.Tensor`):
            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
            num_frames, height, width, num_channels)`.
    """

    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


class MimicMotionPipeline(DiffusionPipeline):
    r"""
    Pipeline to generate video from an input image using Stable Video Diffusion.
    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
    Args:
        vae ([`AutoencoderKLTemporalDecoder`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K]
            (https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
        unet ([`UNetSpatioTemporalConditionModel`]):
            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
        scheduler ([`EulerDiscreteScheduler`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images.
        pose_net ([`PoseNet`]):
            A `PoseNet` to inject pose signals into the unet.
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
        pose_net: PoseNet,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            pose_net=pose_net,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
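        # Construction sketch (hypothetical variable names, for illustration only): the sub-modules
        # are typically the Stable Video Diffusion components plus MimicMotion's PoseNet, e.g.
        #   pipe = MimicMotionPipeline(vae, image_encoder, unet, scheduler, feature_extractor, pose_net)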

    def _encode_image(
        self,
        image: PipelineImageInput,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.image_processor.pil_to_numpy(image)
            image = self.image_processor.numpy_to_pt(image)

            # We normalize the image before resizing to match with the original implementation.
            # Then we unnormalize it after resizing.
            image = image * 2.0 - 1.0
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

        # Normalize the image for CLIP input
        image = self.feature_extractor(
            images=image,
            do_normalize=True,
            do_center_crop=False,
            do_resize=False,
            do_rescale=False,
            return_tensors="pt",
        ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and conditional embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])

        return image_embeddings
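        # Resulting shape: (batch * num_videos_per_prompt, 1, embed_dim), doubled along dim 0 to
        # [zeros, embeddings] when classifier-free guidance is enabled.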

    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        image = image.to(device=device, dtype=self.vae.dtype)
        image_latents = self.vae.encode(image).latent_dist.mode()

        if do_classifier_free_guidance:
            negative_image_latents = torch.zeros_like(image_latents)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and conditional latents into a single batch
            # to avoid doing two forward passes
            image_latents = torch.cat([negative_image_latents, image_latents])

        # duplicate image_latents for each generation per prompt, using mps friendly method
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)

        return image_latents
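        # `latent_dist.mode()` makes the reference-image latent deterministic; with classifier-free
        # guidance an all-zero latent is prepended as the unconditional branch, matching the
        # [negative, positive] ordering used for the CLIP embeddings above.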

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
                f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. "
                f"Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)

        if do_classifier_free_guidance:
            add_time_ids = torch.cat([add_time_ids, add_time_ids])

        return add_time_ids
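        # Example with the defaults of `__call__` (fps already reduced by 1): [6, 127, 0.02] becomes
        # a (batch_size * num_videos_per_prompt, 3) tensor, duplicated once more under
        # classifier-free guidance.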

    def decode_latents(
        self,
        latents: torch.Tensor,
        num_frames: int,
        decode_chunk_size: int = 8):
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        latents = latents.flatten(0, 1)

        latents = 1 / self.vae.config.scaling_factor * latents

        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())

        # decode decode_chunk_size frames at a time to avoid OOM
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i: i + decode_chunk_size].shape[0]
            decode_kwargs = {}
            if accepts_num_frames:
                # we only pass num_frames_in if it's expected
                decode_kwargs["num_frames"] = num_frames_in

            frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
            frames.append(frame.cpu())
        frames = torch.cat(frames, dim=0)

        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        frames = frames.float()
        return frames
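        # A smaller `decode_chunk_size` trades extra decoder calls for lower peak GPU memory; frames
        # are moved to the CPU as soon as they are decoded for the same reason.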

    def check_inputs(self, image, height, width):
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        num_channels_latents: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
    ):
        shape = (
            batch_size,
            num_frames,
            num_channels_latents // 2,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents
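        # Example, assuming the standard SVD configuration (unet.config.in_channels == 8,
        # vae_scale_factor == 8): a 576x1024 call with tile_size=16 produces noise of shape
        # (batch, 16, 4, 72, 128); the other 4 channels are filled later by concatenating the
        # reference-image latents.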

    @property
    def guidance_scale(self):
        return self._guidance_scale

    # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        if isinstance(self.guidance_scale, (int, float)):
            return self.guidance_scale > 1
        return self.guidance_scale.max() > 1

    @property
    def num_timesteps(self):
        return self._num_timesteps

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
        image_pose: torch.FloatTensor,
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        tile_size: Optional[int] = 16,
        tile_overlap: Optional[int] = 4,
        num_inference_steps: int = 25,
        min_guidance_scale: float = 1.0,
        max_guidance_scale: float = 3.0,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        image_only_indicator: bool = False,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        return_dict: bool = True,
        device: Optional[Union[str, torch.device]] = None,
    ):
        r"""
        The call function to the pipeline for generation.
        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/
                feature_extractor/preprocessor_config.json).
            image_pose (`torch.FloatTensor`):
                Pose images, one per generated frame, passed through `pose_net` to condition the unet.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_frames (`int`, *optional*):
                The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid`
                and to 25 for `stable-video-diffusion-img2vid-xt`.
            tile_size (`int`, *optional*, defaults to 16):
                The number of frames denoised together in one temporal tile.
            tile_overlap (`int`, *optional*, defaults to 4):
                The number of frames shared between consecutive tiles; overlapping predictions are blended.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
            max_guidance_scale (`float`, *optional*, defaults to 3.0):
                The maximum guidance scale. Used for the classifier free guidance with last frame.
            fps (`int`, *optional*, defaults to 7):
                Frames per second. The rate at which the generated images shall be exported to a video after
                generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
            motion_bucket_id (`int`, *optional*, defaults to 127):
                The motion bucket ID. Used as conditioning for the generation.
                The higher the number the more motion will be in the video.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                The amount of noise added to the init image,
                the higher it is the less the video will look like the init image. Increase it for more motion.
            image_only_indicator (`bool`, *optional*, defaults to False):
                Whether to treat the inputs as a batch of images instead of videos.
            decode_chunk_size (`int`, *optional*):
                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
                consistency between frames, but also the higher the memory consumption.
                By default, the decoder will decode all frames at once for maximal quality.
                Reduce `decode_chunk_size` to reduce memory usage.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a
                latents tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            device (`str` or `torch.device`, *optional*):
                The device on which the pipeline runs. Defaults to the pipeline's execution device.
        Returns:
            [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`,
                [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list of lists with the generated frames.
        Examples:
            ```py
            import torch
            from diffusers import StableVideoDiffusionPipeline
            from diffusers.utils import load_image, export_to_video

            pipe = StableVideoDiffusionPipeline.from_pretrained(
                "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
            pipe.to("cuda")
            image = load_image(
                "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
            image = image.resize((1024, 576))
            frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
            export_to_video(frames, "generated.mp4", fps=7)
            ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width)

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = device if device is not None else self._execution_device
        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        self._guidance_scale = max_guidance_scale

        # 3. Encode input image
        self.image_encoder.to(device)
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
        self.image_encoder.cpu()

        # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
        # is why it is reduced here.
        fps = fps - 1

        # 4. Encode input image using VAE
        image = self.image_processor.preprocess(image, height=height, width=width).to(device)
        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
        image = image + noise_aug_strength * noise

        self.vae.to(device)
        image_latents = self._encode_vae_image(
            image,
            device=device,
            num_videos_per_prompt=num_videos_per_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
        )
        image_latents = image_latents.to(image_embeddings.dtype)
        self.vae.cpu()

        # Repeat the image latents for each frame so we can concatenate them with the noise
        # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # 6. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None)

        # 7. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            tile_size,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )
        latents = latents.repeat(1, num_frames // tile_size + 1, 1, 1, 1)[:, :num_frames]
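        # Noise is drawn once for a single tile of `tile_size` frames, then tiled along the frame
        # axis (period `tile_size`) and truncated to `num_frames`.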

        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)

        # 9. Prepare guidance scale
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)

        self._guidance_scale = guidance_scale

        # 10. Denoising loop
        self._num_timesteps = len(timesteps)
        indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
                   range(0, num_frames - tile_size + 1, tile_size - tile_overlap)]
        if indices[-1][-1] < num_frames - 1:
            indices.append([0, *range(num_frames - tile_size + 1, num_frames)])
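        # Each tile always contains frame 0 (the reference frame) followed by `tile_size - 1`
        # consecutive frames; e.g. with num_frames=72, tile_size=16, tile_overlap=4 the tiles start
        # at frames 1, 13, 25, ... (stepping by tile_size - tile_overlap = 12), plus a final tile
        # covering the remaining tail frames.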

        self.pose_net.to(device)
        self.unet.to(device)

        with torch.cuda.device(device):
            torch.cuda.empty_cache()

        with self.progress_bar(total=len(timesteps) * len(indices)) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate image_latents over channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual
                noise_pred = torch.zeros_like(image_latents)
                noise_pred_cnt = image_latents.new_zeros((num_frames,))
                weight = (torch.arange(tile_size, device=device) + 0.5) * 2. / tile_size
                weight = torch.minimum(weight, 2 - weight)
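                # Triangular blending weights across a tile: close to 1 near the tile centre and
                # close to 0 at its borders, so overlapping tile predictions are cross-faded before
                # the per-frame normalization by `noise_pred_cnt` below.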
                for idx in indices:

                    # classifier-free (unconditional) branch: no pose conditioning
                    pose_latents = self.pose_net(image_pose[idx].to(device))
                    _noise_pred = self.unet(
                        latent_model_input[:1, idx],
                        t,
                        encoder_hidden_states=image_embeddings[:1],
                        added_time_ids=added_time_ids[:1],
                        pose_latents=None,
                        image_only_indicator=image_only_indicator,
                        return_dict=False,
                    )[0]
                    noise_pred[:1, idx] += _noise_pred * weight[:, None, None, None]

                    # conditional branch: with pose conditioning
                    _noise_pred = self.unet(
                        latent_model_input[1:, idx],
                        t,
                        encoder_hidden_states=image_embeddings[1:],
                        added_time_ids=added_time_ids[1:],
                        pose_latents=pose_latents,
                        image_only_indicator=image_only_indicator,
                        return_dict=False,
                    )[0]
                    noise_pred[1:, idx] += _noise_pred * weight[:, None, None, None]

                    noise_pred_cnt[idx] += weight
                    progress_bar.update()
                noise_pred.div_(noise_pred_cnt[:, None, None, None])

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

        self.pose_net.cpu()
        self.unet.cpu()

        if not output_type == "latent":
            self.vae.decoder.to(device)
            frames = self.decode_latents(latents, num_frames, decode_chunk_size)
            frames = tensor2vid(frames, self.image_processor, output_type=output_type)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return MimicMotionPipelineOutput(frames=frames)
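

# Usage sketch (illustrative only; argument names follow this file, while the way the sub-modules
# are assembled is hypothetical and depends on the surrounding MimicMotion loading code):
#
#     pipeline = MimicMotionPipeline(vae, image_encoder, unet, scheduler, feature_extractor, pose_net)
#     output = pipeline(
#         image=reference_image,      # PIL.Image or tensor
#         image_pose=pose_frames,     # FloatTensor with one pose map per output frame
#         num_frames=72, tile_size=16, tile_overlap=4,
#         fps=7, noise_aug_strength=0.02,
#     )
#     frames = output.frames[0]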