Update pipeline.py

pipeline.py  CHANGED  (+298 -59)
Old version of the changed regions (lines marked "-" are removed by this commit):

@@ -20,12 +20,13 @@ import numpy as np
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
-from ...models.lora import adjust_lora_scale_text_encoder
-from ...models.unet_motion_model import MotionAdapter
-from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
@@ -33,17 +34,25 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     logging,
-    replace_example_docstring,
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -53,7 +62,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
-
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

@@ -87,16 +95,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
-
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

@@ -150,6 +155,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
         self,

@@ -165,7 +173,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded

@@ -417,12 +424,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
         The suffixes after the scaling factors represent the stages where they are being applied.
-
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to

@@ -539,8 +543,208 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         latents = latents * self.scheduler.init_noise_sigma
         return latents

     @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,

@@ -561,15 +765,22 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
     ):
         r"""
         The call function to the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

@@ -626,7 +837,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
-
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -696,17 +906,48 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            num_frames,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )

         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

@@ -719,7 +960,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):

                 latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)

@@ -733,24 +974,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # select the relevent context from the latents
                     current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
-                    # if the context extends past the end of the latents, wrap around to the start
-                    if current_context_start + context_size > num_frames:
-                        current_context_latents = torch.cat([current_context_latents, latents[:, :, :current_context_start + context_size - num_frames, :, :]], dim=2)
-
-                # for context_group in range(num_context_groups):
-                #     # Calculate the current start index, considering overlap
-                #     current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
-
-                #     # Calculate the end index and adjust if it exceeds num_frames
-                #     current_context_end = (current_context_start + context_size) % num_frames
-
-                #     # Select the relevant context from the latents with wrap-around handling
-                #     current_context_latents = torch.cat([
-                #         latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
-                #         latents[:, :, :max(current_context_end - num_frames, 0), :, :]
-                #     ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
-
-

                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents

@@ -772,27 +995,43 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
-
-                    # call the callback, if provided
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-                        if callback is not None and i % callback_steps == 0:
-                            callback(i, t, current_context_latents)

                     #add the context current_context_latents back to the latent sum starting from the current context start
                     latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1

                 latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                 latents = latent_sum / latent_counter

-                # shuffle rotate latent images by step places, wrapping around the last
                 latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

         # Post-processing
         video_tensor = self.decode_latents(latents)

@@ -807,4 +1046,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         if not return_dict:
             return (video,)

-        return AnimateDiffPipelineOutput(frames=video)
New version of the changed regions (lines marked "+" are added by this commit):

@@ -20,12 +20,13 @@ import numpy as np
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+# Updated to use absolute paths
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unet_motion_model import MotionAdapter
+from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,

@@ -33,17 +34,25 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
+from diffusers.utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     logging,
     scale_lora_layers,
     unscale_lora_layers,
 )
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor

+# Added imports based on the working paths
+from diffusers.models import ControlNetModel
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import deprecate
+
+import torchvision
+import PIL
+import PIL.Image
+import math

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
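Switching from package-relative imports (`from ...schedulers import ...`) to absolute `diffusers.*` imports means the file no longer has to live inside the diffusers source tree; it can sit next to a script and be imported directly. A minimal sketch of that usage, assuming the modified file is saved locally as `pipeline.py` (the file name and layout are an assumption, not part of the commit; the checkpoint and scheduler setup are taken from the example docstring below):

import torch
from diffusers import MotionAdapter, DDIMScheduler

from pipeline import AnimateDiffPipeline  # local copy of this modified file

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
pipe = pipe.to("cuda")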
@@ -53,7 +62,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

@@ -87,16 +95,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

@@ -150,6 +155,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

+    def load_motion_adapter(self,motion_adapter):
+        self.register_modules(motion_adapter=motion_adapter)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
         self,

@@ -165,7 +173,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded

@@ -417,12 +424,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
         The suffixes after the scaling factors represent the stages where they are being applied.
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
@@ -539,8 +543,208 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         latents = latents * self.scheduler.init_noise_sigma
         return latents

+    def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
+                               latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        for frame in range(num_frames):
+            x_offset = int(frame * x_velocity)  # Convert to int
+            y_offset = int(frame * y_velocity)  # Convert to int
+            scale_factor = 1 + frame * scale_velocity
+
+            # Apply offsets
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(x_offset,), dims=3)  # x direction
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(y_offset,), dims=2)  # y direction
+
+            # Apply scaling - This is a simple approach and might not be ideal for all applications
+            if scale_factor != 1:
+                scaled_size = (
+                    int(latents.shape[3] * scale_factor),
+                    int(latents.shape[4] * scale_factor)
+                )
+                latents[:, :, frame] = torch.nn.functional.interpolate(
+                    latents[:, :, frame].unsqueeze(0), size=scaled_size, mode='bilinear', align_corners=False
+                ).squeeze(0)
+
+        return latents
+
+    def generate_correlated_noise(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        p = init_noise_correlation
+        flattened_latents = torch.flatten(cloned_latents)
+        noise = torch.randn_like(flattened_latents)
+        correlated_noise = flattened_latents * p + math.sqrt(1 - p**2) * noise
+
+        return correlated_noise.reshape(cloned_latents.shape)
+
+    def generate_correlated_latents(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        for i in range(1, cloned_latents.shape[2]):
+            p = init_noise_correlation
+            flattened_latents = torch.flatten(cloned_latents[:, :, i])
+            prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+            correlated_latents = (prev_flattened_latents * p/math.sqrt((1+p**2))+flattened_latents * math.sqrt(1/(1 + p**2)))
+            cloned_latents[:, :, i] = correlated_latents.reshape(cloned_latents[:, :, i].shape)
+
+        return cloned_latents
+
+    def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        for i in range(1, cloned_latents.shape[2]):
+            p = init_noise_correlation
+            flattened_latents = torch.flatten(cloned_latents[:, :, i])
+            prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+            correlated_latents = (
+                prev_flattened_latents * p
+                +
+                flattened_latents * math.sqrt(1 - p**2)
+            )
+            cloned_latents[:, :, i] = correlated_latents.reshape(
+                cloned_latents[:, :, i].shape
+            )
+
+        return cloned_latents
+
+    def generate_mixed_noise(self, noise, init_noise_correlation):
+        shared_noise = torch.randn_like(noise[0, :, 0])
+        for b in range(noise.shape[0]):
+            for f in range(noise.shape[2]):
+                p = init_noise_correlation
+                flattened_latents = torch.flatten(noise[b, :, f])
+                shared_latents = torch.flatten(shared_noise)
+                correlated_latents = (
+                    shared_latents * math.sqrt(p**2/(1+p**2)) +
+                    flattened_latents * math.sqrt(1/(1+p**2))
+                )
+                noise[b, :, f] = correlated_latents.reshape(noise[b, :, f].shape)
+
+        return noise
+
+    def prepare_correlated_latents(
+        self,
+        init_image,
+        init_image_strength,
+        init_noise_correlation,
+        batch_size,
+        num_channels_latents,
+        video_length,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if init_image is not None:
+            start_image = ((torchvision.transforms.functional.pil_to_tensor(init_image))/ 255 )[:3, :, :].to("cuda").to(dtype).unsqueeze(0)
+            start_image = (
+                self.vae.encode(start_image.mul(2).sub(1))
+                .latent_dist.sample()
+                .view(1, 4, height // 8, width // 8)
+                * 0.18215
+            )
+            init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
+        else:
+            init_latents = None
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            rand_device = "cpu" if device.type == "mps" else device
+            if isinstance(generator, list):
+                shape = shape
+                # shape = (1,) + shape[1:]
+                # ignore init latents for batch model
+                latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)for i in range(batch_size)]
+                latents = torch.cat(latents, dim=0).to(device)
+            else:
+                if init_latents is not None:
+                    offset = int(
+                        init_image_strength * (len(self.scheduler.timesteps) - 1)
+                    )
+                    noise = torch.randn_like(init_latents)
+                    noise = self.generate_correlated_latents(noise, init_noise_correlation)
+
+                    # Eric - some black magic here
+                    # We should be only adding the noise at timestep[offset], but I noticed that
+                    # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
+                    # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
+                    timesteps = self.scheduler.timesteps
+                    average_timestep = None
+                    if offset == 0:
+                        average_timestep = timesteps[0]
+                    elif offset == 1:
+                        average_timestep = (
+                            timesteps[offset - 1] * (1 - init_image_strength)
+                            + timesteps[offset] * init_image_strength
+                        )
+                    else:
+                        average_timestep = timesteps[offset - 1]
+
+                    latents = self.scheduler.add_noise(
+                        init_latents, noise, average_timestep.long()
+                    )
+
+                    latents = self.scheduler.add_noise(
+                        latents, torch.randn_like(init_latents), timesteps[-2]
+                    )
+                else:
+                    latents = torch.randn(
+                        shape, generator=generator, device=rand_device, dtype=dtype
+                    ).to(device)
+                    latents = self.generate_correlated_latents(
+                        latents, init_noise_correlation
+                    )
+        else:
+            if latents.shape != shape:
+                raise ValueError(
+                    f"Unexpected latents shape, got {latents.shape}, expected {shape}"
+                )
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        if init_latents is None:
+            latents = latents * self.scheduler.init_noise_sigma
+        # elif self.unet.trained_initial_frames and init_latents is not None:
+        #     # we only want to use this as the first frame
+        #     init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
+
+        latents = latents.to(device)
+        return latents, init_latents
+
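Two notes on the helpers above (editorial, not part of the commit). `prepare_motion_latents` bakes camera-style motion into the initial noise by `torch.roll`-ing each successive frame a little further along x and y, and optionally rescaling it. `generate_correlated_latents` ties each frame's noise to the previous frame's with the blend prev * p / sqrt(1 + p**2) + cur * sqrt(1 / (1 + p**2)), which keeps the noise at unit variance, so the scheduler's `init_noise_sigma` scaling stays valid, while giving adjacent frames a correlation of p / sqrt(1 + p**2). A quick standalone check of that claim (values are illustrative):

import math
import torch

p = 0.8
prev, cur = torch.randn(1_000_000), torch.randn(1_000_000)
mixed = prev * p / math.sqrt(1 + p**2) + cur * math.sqrt(1 / (1 + p**2))

print(mixed.std())                                       # ~1.0: variance is preserved
print(torch.corrcoef(torch.stack([prev, mixed]))[0, 1])  # ~p / sqrt(1 + p**2), about 0.62

The hunk continues below with the updated `__call__` signature.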
     @torch.no_grad()
+    # @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,

@@ -561,15 +765,22 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
+        output_path: Optional[str] = None,
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
+        x_velocity: Optional[float] = 0,
+        y_velocity: Optional[float] = 0,
+        scale_velocity: Optional[float] = 0,
+        init_image: Optional[PipelineImageInput] = None,
+        init_image_strength: Optional[float] = 1.0,
+        init_noise_correlation: Optional[float] = 0.0,
+        latent_mode: Optional[str] = "normal",
     ):
         r"""
         The call function to the pipeline for generation.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

@@ -626,7 +837,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
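Taken together, the new arguments select one of three ways to build the initial latents (`latent_mode` of "normal", "motion", or "correlated", dispatched in the next hunk), steer the per-frame noise (`x_velocity`, `y_velocity`, `scale_velocity`, `init_image`, `init_image_strength`, `init_noise_correlation`), and optionally stream decoded frames to disk (`output_path`). A hedged usage sketch; the prompt, frame count, and values are illustrative, not recommendations from the commit, and `num_frames` is assumed to still be a `__call__` argument as in stock AnimateDiff:

frames_pattern = pipe(
    prompt="a boat drifting through a misty canyon, best quality",
    num_frames=64,
    num_inference_steps=25,
    latent_mode="motion",              # initial noise comes from prepare_motion_latents
    x_velocity=0.5,                    # pan the noise a little further right each frame
    scale_velocity=0.0,
    output_path="out/frame_####.png",  # frames are written here in small batches
)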
@@ -696,17 +906,48 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
+        if(latent_mode == "normal"):
+            latents = self.prepare_latents(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+        elif(latent_mode == "motion"):
+            latents = self.prepare_motion_latents(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+                x_velocity=x_velocity,
+                y_velocity=y_velocity,
+                scale_velocity=scale_velocity,
+            )
+        elif(latent_mode == "correlated"):
+            latents, init_latents = self.prepare_correlated_latents(
+                init_image,
+                init_image_strength,
+                init_noise_correlation,
+                batch_size,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
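The denoising loop in the next hunk splits the frame axis into overlapping context windows, denoises each window separately, accumulates the results in `latent_sum`, and divides by how many windows touched each frame (`latent_counter`); after each timestep the frame axis is also rotated by `step` so the window seams land somewhere else next time. A standalone sketch of just that bookkeeping, with made-up sizes and a simplified window schedule (the real schedule and per-window update are not shown in this diff):

import torch

num_frames, context_size, overlap, step = 16, 8, 4, 2
latents = torch.randn(1, 4, num_frames, 8, 8)
denoise = lambda window: window * 0.99  # stand-in for the UNet + scheduler update on one window

latent_sum = torch.zeros_like(latents)
latent_counter = torch.zeros(num_frames)

for start in range(0, num_frames, context_size - overlap):
    window = denoise(latents[:, :, start:start + context_size])
    latent_sum[:, :, start:start + context_size] += window
    latent_counter[start:start + context_size] += 1

latents = latent_sum / latent_counter.reshape(1, 1, num_frames, 1, 1)
# rotate frames by `step` so window boundaries move on the next timestep
latents = torch.cat([latents[:, :, -step:], latents[:, :, :-step]], dim=2)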
@@ -719,7 +960,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):

                 latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)

@@ -733,24 +974,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # select the relevent context from the latents
                     current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]

                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents

@@ -772,27 +995,43 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample

                     #add the context current_context_latents back to the latent sum starting from the current context start
                     latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1

+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, None)
+
                 latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                 latents = latent_sum / latent_counter

+                # shuffle rotate latent images by step places, wrapping around the last 2 to the start
                 latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

+        print("Done denoising")
+
         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

+        # save frames
+        if output_path is not None:
+            output_batch_size = 2  # prevents out of memory errors with large videos
+            num_digits = output_path.count('#')  # count the number of '#' characters
+            frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
+            for batch in range((num_frames + output_batch_size - 1) // output_batch_size):
+                start_id = batch * output_batch_size
+                end_id = min((batch + 1) * output_batch_size, num_frames)
+                video_tensor = self.decode_latents(latents[:, :, start_id:end_id, :, :])
+                video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+                for f_id, frame in enumerate(video[0]):
+                    frame.save(frame_format.format(start_id + f_id))
+            return output_path
+
         # Post-processing
         video_tensor = self.decode_latents(latents)

@@ -807,4 +1046,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         if not return_dict:
             return (video,)

+        return AnimateDiffPipelineOutput(frames=video)
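One practical note on the frame-saving branch above: the run of '#' characters in `output_path` is replaced with a zero-padded frame index, frames are decoded two at a time to bound memory use, and when `output_path` is set the call returns the path pattern instead of an `AnimateDiffPipelineOutput`. For example (the path is illustrative, and its directory must already exist, since `PIL.Image.save` will not create it):

pattern = "out/frame_####.png"
num_digits = pattern.count('#')
frame_format = pattern.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
print(frame_format.format(7))   # -> out/frame_0007.png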