smoothieAI committed on
Commit 2aeb2f1 · verified · 1 Parent(s): 47ecd18

Update pipeline.py

Files changed (1)
  1. pipeline.py +64 -304
pipeline.py CHANGED
@@ -20,13 +20,12 @@ import numpy as np
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
- # Updated to use absolute paths
- from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
- from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
- from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
- from diffusers.models.lora import adjust_lora_scale_text_encoder
- from diffusers.models.unet_motion_model import MotionAdapter
- from diffusers.schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,
@@ -34,25 +33,17 @@ from diffusers.schedulers import (
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
- from diffusers.utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
  scale_lora_layers,
  unscale_lora_layers,
  )
- from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
 
- # Added imports based on the working paths
- from diffusers.models import ControlNetModel
- from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
- from diffusers.utils import deprecate
-
- import torchvision
- import PIL
- import PIL.Image
- import math
 
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
@@ -62,6 +53,7 @@ EXAMPLE_DOC_STRING = """
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
@@ -95,13 +87,16 @@ class AnimateDiffPipelineOutput(BaseOutput):
  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
@@ -155,9 +150,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
- def load_motion_adapter(self,motion_adapter):
- self.register_modules(motion_adapter=motion_adapter)
-
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,
@@ -173,6 +165,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded
@@ -424,9 +417,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
  The suffixes after the scaling factors represent the stages where they are being applied.
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
@@ -543,208 +539,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  latents = latents * self.scheduler.init_noise_sigma
  return latents
 
- def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
- latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
- shape = (
- batch_size,
- num_channels_latents,
- num_frames,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
- if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
- )
-
- if latents is None:
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
- else:
- latents = latents.to(device)
-
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
-
- for frame in range(num_frames):
- x_offset = int(frame * x_velocity) # Convert to int
- y_offset = int(frame * y_velocity) # Convert to int
- scale_factor = 1 + frame * scale_velocity
-
- # Apply offsets
- latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(x_offset,), dims=3) # x direction
- latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(y_offset,), dims=2) # y direction
-
- # Apply scaling - This is a simple approach and might not be ideal for all applications
- if scale_factor != 1:
- scaled_size = (
- int(latents.shape[3] * scale_factor),
- int(latents.shape[4] * scale_factor)
- )
- latents[:, :, frame] = torch.nn.functional.interpolate(
- latents[:, :, frame].unsqueeze(0), size=scaled_size, mode='bilinear', align_corners=False
- ).squeeze(0)
-
- return latents
-
- def generate_correlated_noise(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents)
- noise = torch.randn_like(flattened_latents)
- correlated_noise = flattened_latents * p + math.sqrt(1 - p**2) * noise
-
- return correlated_noise.reshape(cloned_latents.shape)
-
- def generate_correlated_latents(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- for i in range(1, cloned_latents.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents[:, :, i])
- prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
- correlated_latents = (prev_flattened_latents * p/math.sqrt((1+p**2))+flattened_latents * math.sqrt(1/(1 + p**2)))
- cloned_latents[:, :, i] = correlated_latents.reshape(cloned_latents[:, :, i].shape)
-
- return cloned_latents
-
- def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- for i in range(1, cloned_latents.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents[:, :, i])
- prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
- correlated_latents = (
- prev_flattened_latents * p
- +
- flattened_latents * math.sqrt(1 - p**2)
- )
- cloned_latents[:, :, i] = correlated_latents.reshape(
- cloned_latents[:, :, i].shape
- )
-
- return cloned_latents
-
- def generate_mixed_noise(self, noise, init_noise_correlation):
- shared_noise = torch.randn_like(noise[0, :, 0])
- for b in range(noise.shape[0]):
- for f in range(noise.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(noise[b, :, f])
- shared_latents = torch.flatten(shared_noise)
- correlated_latents = (
- shared_latents * math.sqrt(p**2/(1+p**2)) +
- flattened_latents * math.sqrt(1/(1+p**2))
- )
- noise[b, :, f] = correlated_latents.reshape(noise[b, :, f].shape)
-
- return noise
-
- def prepare_correlated_latents(
- self,
- init_image,
- init_image_strength,
- init_noise_correlation,
- batch_size,
- num_channels_latents,
- video_length,
- height,
- width,
- dtype,
- device,
- generator,
- latents=None,
- ):
- shape = (
- batch_size,
- num_channels_latents,
- video_length,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
-
- if init_image is not None:
- start_image = ((torchvision.transforms.functional.pil_to_tensor(init_image))/ 255 )[:3, :, :].to("cuda").to(dtype).unsqueeze(0)
- start_image = (
- self.vae.encode(start_image.mul(2).sub(1))
- .latent_dist.sample()
- .view(1, 4, height // 8, width // 8)
- * 0.18215
- )
- init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
- else:
- init_latents = None
-
- if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
- )
- if latents is None:
- rand_device = "cpu" if device.type == "mps" else device
- if isinstance(generator, list):
- shape = shape
- # shape = (1,) + shape[1:]
- # ignore init latents for batch model
- latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)for i in range(batch_size)]
- latents = torch.cat(latents, dim=0).to(device)
- else:
- if init_latents is not None:
- offset = int(
- init_image_strength * (len(self.scheduler.timesteps) - 1)
- )
- noise = torch.randn_like(init_latents)
- noise = self.generate_correlated_latents(noise, init_noise_correlation)
-
- # Eric - some black magic here
- # We should be only adding the noise at timestep[offset], but I noticed that
- # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
- # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
- timesteps = self.scheduler.timesteps
- average_timestep = None
- if offset == 0:
- average_timestep = timesteps[0]
- elif offset == 1:
- average_timestep = (
- timesteps[offset - 1] * (1 - init_image_strength)
- + timesteps[offset] * init_image_strength
- )
- else:
- average_timestep = timesteps[offset - 1]
-
- latents = self.scheduler.add_noise(
- init_latents, noise, average_timestep.long()
- )
-
- latents = self.scheduler.add_noise(
- latents, torch.randn_like(init_latents), timesteps[-2]
- )
- else:
- latents = torch.randn(
- shape, generator=generator, device=rand_device, dtype=dtype
- ).to(device)
- latents = self.generate_correlated_latents(
- latents, init_noise_correlation
- )
- else:
- if latents.shape != shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
- latents = latents.to(device)
-
- # scale the initial noise by the standard deviation required by the scheduler
- if init_latents is None:
- latents = latents * self.scheduler.init_noise_sigma
- # elif self.unet.trained_initial_frames and init_latents is not None:
- # # we only want to use this as the first frame
- # init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
-
- latents = latents.to(device)
- return latents, init_latents
-
-
  @torch.no_grad()
- # @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,
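For reference, the helpers deleted above build their initial video noise with an AR(1)-style recurrence: each frame's noise is a blend of the previous frame's noise (weighted by the correlation p) and fresh Gaussian noise, scaled so the per-frame variance stays at one. A minimal standalone sketch of that idea (the function name, shapes, and default correlation are illustrative assumptions, not code from the diff):

import math
import torch

def correlated_video_noise(batch, channels, frames, height, width, correlation=0.9, generator=None):
    """Sample (B, C, F, H, W) noise whose consecutive frames are correlated.

    Frame 0 is plain Gaussian noise; every later frame mixes the previous frame
    with fresh noise, mirroring the recurrence in the removed
    generate_correlated_latents helper, so each frame keeps unit variance.
    """
    noise = torch.randn(batch, channels, frames, height, width, generator=generator)
    p = correlation
    for f in range(1, frames):
        prev, fresh = noise[:, :, f - 1], noise[:, :, f]
        # p^2/(1+p^2) + 1/(1+p^2) == 1, so the blend preserves unit variance
        noise[:, :, f] = prev * (p / math.sqrt(1 + p**2)) + fresh * math.sqrt(1 / (1 + p**2))
    return noise

# Example: strongly correlated noise for 16 frames of 64x64 latents
latents = correlated_video_noise(1, 4, 16, 64, 64, correlation=0.95)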
@@ -765,22 +561,15 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
- output_path: Optional[str] = None,
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
- x_velocity: Optional[float] = 0,
- y_velocity: Optional[float] = 0,
- scale_velocity: Optional[float] = 0,
- init_image: Optional[PipelineImageInput] = None,
- init_image_strength: Optional[float] = 1.0,
- init_noise_correlation: Optional[float] = 0.0,
- latent_mode: Optional[str] = "normal",
  ):
  r"""
  The call function to the pipeline for generation.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
@@ -837,6 +626,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -906,48 +696,17 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
- if(latent_mode == "normal"):
- latents = self.prepare_latents(
- batch_size * num_videos_per_prompt,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- )
- elif(latent_mode == "motion"):
- latents = self.prepare_motion_latents(
- batch_size * num_videos_per_prompt,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- x_velocity=x_velocity,
- y_velocity=y_velocity,
- scale_velocity=scale_velocity,
- )
- elif(latent_mode == "correlated"):
- latents, init_latents = self.prepare_correlated_latents(
- init_image,
- init_image_strength,
- init_noise_correlation,
- batch_size,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- )
-
 
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -960,20 +719,37 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
- with self.progress_bar(total=len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):
 
  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
  latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
 
- # foreach context group seperately denoise the current timestep
  for context_group in range(num_context_groups):
- # calculate to current indexes, considering overlap
- if context_group == 0:current_context_start = 0
- else:current_context_start = context_group * (context_size - overlap)
 
- # select the relevent context from the latents
- current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
 
  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
@@ -995,43 +771,27 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
 
  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1
 
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
- progress_bar.update()
- if callback is not None and i % callback_steps == 0:
- callback(i, t, None)
-
  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter
 
- # shuffle rotate latent images by step places, wrapping around the last 2 to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
 
- print("Done denoising")
-
  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)
 
- # save frames
- if output_path is not None:
- output_batch_size = 2 # prevents out of memory errors with large videos
- num_digits = output_path.count('#') # count the number of '#' characters
- frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
- for batch in range((num_frames + output_batch_size - 1) // output_batch_size):
- start_id = batch * output_batch_size
- end_id = min((batch + 1) * output_batch_size, num_frames)
- video_tensor = self.decode_latents(latents[:, :, start_id:end_id, :, :])
- video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
- for f_id, frame in enumerate(video[0]):
- frame.save(frame_format.format(start_id + f_id))
- return output_path
-
  # Post-processing
  video_tensor = self.decode_latents(latents)
 
@@ -1046,4 +806,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  if not return_dict:
  return (video,)
 
- return AnimateDiffPipelineOutput(frames=video)
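The output_path branch removed above derives a zero-padded filename template from a run of '#' placeholders before saving frames in small batches. A tiny illustration of that string trick (the pattern below is a made-up example, not a value from the pipeline):

output_path = "frames/frame_####.png"  # hypothetical pattern; each '#' stands for one digit
num_digits = output_path.count('#')    # 4
frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
print(frame_format)             # frames/frame_{:04d}.png
print(frame_format.format(7))   # frames/frame_0007.png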
 
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
+ from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+ from ...models.lora import adjust_lora_scale_text_encoder
+ from ...models.unet_motion_model import MotionAdapter
+ from ...schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
+ from ...utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
+ replace_example_docstring,
  scale_lora_layers,
  unscale_lora_layers,
  )
+ from ...utils.torch_utils import randn_tensor
+ from ..pipeline_utils import DiffusionPipeline
 
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
+
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
 
  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
+
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
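The loader mixins listed above are the standard diffusers entry points for attaching extra weights to an assembled pipeline. An illustrative sketch of typical calls on a constructed pipe (the paths and repository IDs are placeholders, not part of this commit):

# Placeholders only: point these at real files / repositories for your setup.
pipe.load_textual_inversion("path/to/learned_embeds.bin")   # textual inversion embedding
pipe.load_lora_weights("path/to/lora_weights")              # LoRA weights
pipe.load_ip_adapter(
    "h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin"
)  # IP-Adapter image-prompt weights, following the documented IPAdapterMixin usage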
 
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,
 
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
+
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded
 
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
  The suffixes after the scaling factors represent the stages where they are being applied.
+
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
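As a usage note, FreeU is switched on with a single call on the pipeline; the factors below are in the range the FreeU authors report for Stable Diffusion v1.5-style UNets and are a starting point, not values taken from this commit:

# Tune s1/s2/b1/b2 per the FreeU repository's recommendations for your base model.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
# Pipelines that also expose disable_freeu() can switch the mechanism off again.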
 
  latents = latents * self.scheduler.init_noise_sigma
  return latents
 
  @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,
 
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
  ):
  r"""
  The call function to the pipeline for generation.
+
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
 
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
+
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
 
 
  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
 
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
 
  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_context_groups * len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):
 
  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
  latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
 
+ # # foreach context group seperately denoise the current timestep
+ # for context_group in range(num_context_groups):
+ # # calculate to current indexes, considering overlap
+ # if context_group == 0:current_context_start = 0
+ # else:current_context_start = context_group * (context_size - overlap)
+
+ # # select the relevent context from the latents
+ # current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
+ # # if the context extends past the end of the latents, wrap around to the start
+ # if current_context_start + context_size > num_frames:
+ # current_context_latents = torch.cat([current_context_latents, latents[:, :, :current_context_start + context_size - num_frames, :, :]], dim=2)
+
  for context_group in range(num_context_groups):
+ # Calculate the current start index, considering overlap
+ current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
+
+ # Calculate the end index and adjust if it exceeds num_frames
+ current_context_end = (current_context_start + context_size) % num_frames
+
+ # Select the relevant context from the latents with wrap-around handling
+ current_context_latents = torch.cat([
+ latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
+ latents[:, :, :max(current_context_end - num_frames, 0), :, :]
+ ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
 
  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
 
  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, current_context_latents)
 
  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1
 
  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter
 
+ # shuffle rotate latent images by step places, wrapping around the last n steps to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
 
  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)
 
  # Post-processing
  video_tensor = self.decode_latents(latents)
 
  if not return_dict:
  return (video,)
 
+ return AnimateDiffPipelineOutput(frames=video)
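The new denoising loop above processes the video in overlapping temporal windows (context groups) and averages the overlapping results per frame, wrapping windows that run past the last frame back to the start. A self-contained sketch of just that indexing-and-averaging idea, with illustrative shapes and window sizes (the helper name and the identity stand-in for the per-window denoise are assumptions for the example):

import torch

def sliding_window_average(latents, context_size=16, overlap=4):
    """Average per-window results over overlapping frame windows with wrap-around.

    latents: (B, C, F, H, W). Each window of context_size frames is taken with
    wrap-around indexing, accumulated into a running sum, and finally divided
    by how many windows touched each frame.
    """
    b, c, num_frames, h, w = latents.shape
    stride = context_size - overlap
    num_groups = (num_frames + stride - 1) // stride

    latent_sum = torch.zeros_like(latents)
    counter = torch.zeros(num_frames)

    for g in range(num_groups):
        start = g * stride
        idx = [(start + k) % num_frames for k in range(context_size)]  # wrap past the end
        window = latents[:, :, idx]  # (B, C, context_size, H, W); a real loop denoises this window here
        latent_sum[:, :, idx] += window
        counter[idx] += 1

    return latent_sum / counter.view(1, 1, num_frames, 1, 1)

# Example: a 24-frame latent video, 16-frame windows overlapping by 4 frames
out = sliding_window_average(torch.randn(1, 4, 24, 8, 8))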