Update pipeline.py
pipeline.py (CHANGED): +176 -13
@@ -53,6 +53,8 @@ import torchvision
 import PIL
 import PIL.Image
 import math
+import time
+
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -70,7 +72,62 @@ EXAMPLE_DOC_STRING = """
         >>> export_to_gif(frames, "animation.gif")
         ```
 """
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+            must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
 
 def tensor2vid(video: torch.Tensor, processor, output_type="np"):
     # Based on:
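
A quick, hypothetical usage sketch of the retrieve_timesteps helper added above (not part of the commit): it assumes the helper is in scope from this pipeline.py and uses diffusers' stock DDIMScheduler on the default-spacing path.

# Hypothetical usage sketch, assuming retrieve_timesteps (defined above) is in scope.
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()  # stock scheduler with its default 1000 training timesteps

# No custom `timesteps` list is passed, so the helper takes the else-branch and
# simply calls scheduler.set_timesteps(num_inference_steps, device=...).
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=25, device="cpu")
print(num_inference_steps)  # 25
print(timesteps[:3])        # first entries of the 25-step schedule
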
@@ -810,20 +867,104 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         # init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
 
         latents = latents.to(device)
-        return latents, init_latents
+        return latents, init_latents
+
+    def prepare_video_latents(
+        self,
+        video,
+        height,
+        width,
+        num_channels_latents,
+        batch_size,
+        timestep,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        # video must be a list of list of images
+        # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video
+        # as a list of images
+        if not isinstance(video[0], list):
+            video = [video]
+        if latents is None:
+            video = torch.cat(
+                [self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0
+            )
+            video = video.to(device=device, dtype=dtype)
+            num_frames = video.shape[1]
+        else:
+            num_frames = latents.shape[2]
 
-
-
-
-
-
-
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
-
-        # convert input control image array to latents tensor array
-        latents = torch.zeros(shape, dtype=dtype, device=device)
 
-
+        if latents is None:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                video = video.float()
+                self.vae.to(dtype=torch.float32)
+
+            if isinstance(generator, list):
+                if len(generator) != batch_size:
+                    raise ValueError(
+                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                    )
+
+                init_latents = [
+                    retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0)
+                    for i in range(batch_size)
+                ]
+            else:
+                init_latents = [
+                    retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video
+                ]
+
+            init_latents = torch.cat(init_latents, dim=0)
+
+            # restore vae to original dtype
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+
+            init_latents = init_latents.to(dtype)
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+                # expand init_latents for batch_size
+                error_message = (
+                    f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                    " images (`image`). Please make sure to update your script to pass as many initial images as text prompts"
+                )
+                raise ValueError(error_message)
+            elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4)
+        else:
+            if shape != latents.shape:
+                # [B, C, F, H, W]
+                raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
+            latents = latents.to(device, dtype=dtype)
+
+        return latents
+
 
     # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
     def prepare_control_frames(
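
A shape-bookkeeping sketch of what prepare_video_latents returns (illustrative only; dummy tensors stand in for the VAE encode and scheduler.add_noise, and none of this is part of the commit):

# Illustrative only: dummy tensors in place of the VAE and scheduler.
import torch

batch_size, num_frames, latent_channels = 1, 16, 4
h, w = 512 // 8, 512 // 8  # height // vae_scale_factor, width // vae_scale_factor

# Encoding the input video yields latents stacked as [B, F, C, h, w] ...
init_latents = torch.randn(batch_size, num_frames, latent_channels, h, w)
noise = torch.randn_like(init_latents)
noisy = init_latents + noise  # stand-in for self.scheduler.add_noise(init_latents, noise, timestep)

# ... and the trailing permute(0, 2, 1, 3, 4) swaps the frame and channel axes,
# giving the [B, C, F, h, w] layout the rest of the pipeline works with.
latents = noisy.permute(0, 2, 1, 3, 4)
print(latents.shape)  # torch.Size([1, 4, 16, 64, 64])
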
@@ -1112,6 +1253,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
+
 
         # round num frames to the nearest multiple of context size - overlap
         num_frames = (num_frames // (context_size - overlap)) * (context_size - overlap)
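
The context lines above floor num_frames to a multiple of context_size - overlap; a worked example with illustrative numbers (not from the commit):

# Illustrative numbers only.
context_size, overlap = 16, 4
num_frames = 50
num_frames = (num_frames // (context_size - overlap)) * (context_size - overlap)
print(num_frames)  # 48, the nearest lower multiple of 12
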
@@ -1189,6 +1331,25 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 smooth_weight,
                 smooth_steps,
             )
+        elif(latent_mode == "video"):
+            # 4. Prepare timesteps
+            timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, init_image_strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+            self._num_timesteps = len(timesteps)
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                video=video,
+                height=height,
+                width=width,
+                num_channels_latents=num_channels_latents,
+                batch_size=batch_size * num_videos_per_prompt,
+                timestep=latent_timestep,
+                dtype=prompt_embeds.dtype,
+                device=device,
+                generator=generator,
+                latents=latents,
+            )
 
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
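
The new "video" branch starts denoising partway through the schedule according to init_image_strength, via the pipeline's get_timesteps. A hedged sketch of the usual img2img truncation convention this relies on (this pipeline's get_timesteps may differ in detail):

# Sketch of img2img-style strength truncation; numbers are illustrative.
num_inference_steps = 25
init_image_strength = 0.6

init_timestep = min(int(num_inference_steps * init_image_strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)

# Denoising then covers only the last `init_timestep` steps, and
# latent_timestep = timesteps[t_start] is the noise level added to the encoded video.
print(t_start, init_timestep)  # 10 15
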
@@ -1263,7 +1424,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
 
-                if self.controlnet != None:
+                if self.controlnet != None or i > 2:
+                    contorl_start = time.time()
 
                     current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
                     current_context_conditioning_frames = torch.cat([current_context_conditioning_frames] * 2) if do_classifier_free_guidance else current_context_conditioning_frames
@@ -1302,7 +1464,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                         conditioning_scale=cond_scale,
                         guess_mode=guess_mode,
                         return_dict=False,
-                    )
+                    )
+                    print("controlnet time", time.time() - contorl_start)
 
 
                 # predict the noise residual with the added controlnet residuals
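
A note on the timing instrumentation added in the last two hunks: time.time() around asynchronous CUDA work mostly measures kernel launch unless the device is synchronized first. A hedged variant of the same measurement (not part of the commit):

# Sketch of a synchronized version of the controlnet timing above.
import time
import torch

start = time.time()
# ... the controlnet forward pass would run here ...
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for queued GPU work so the wall-clock delta is meaningful
print("controlnet time", time.time() - start)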