smoothieAI committed · Commit 1e17894 · verified · Parent(s): fcf6eb4

Update pipeline.py

Files changed (1):
  1. pipeline.py +298 -59
pipeline.py CHANGED
@@ -20,12 +20,13 @@ import numpy as np
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

- from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
- from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
- from ...models.lora import adjust_lora_scale_text_encoder
- from ...models.unet_motion_model import MotionAdapter
- from ...schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,

@@ -33,17 +34,25 @@ from ...schedulers import (
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
- from ...utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
- replace_example_docstring,
  scale_lora_layers,
  unscale_lora_layers,
  )
- from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import DiffusionPipeline

  logger = logging.get_logger(__name__) # pylint: disable=invalid-name

@@ -53,7 +62,6 @@ EXAMPLE_DOC_STRING = """
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
-
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

@@ -87,16 +95,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
-
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

@@ -150,6 +155,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,

@@ -165,7 +173,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
-
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded

@@ -417,12 +424,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
  The suffixes after the scaling factors represent the stages where they are being applied.
-
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to

@@ -539,8 +543,208 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  latents = latents * self.scheduler.init_noise_sigma
  return latents

  @torch.no_grad()
- @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,

@@ -561,15 +765,22 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
  ):
  r"""
  The call function to the pipeline for generation.
-
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

@@ -626,7 +837,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
-
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is

@@ -696,17 +906,48 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- batch_size * num_videos_per_prompt,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- )

  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

@@ -719,7 +960,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
- with self.progress_bar(total=num_context_groups * len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):

  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)

@@ -733,24 +974,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

  # select the relevant context from the latents
  current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
- # if the context extends past the end of the latents, wrap around to the start
- if current_context_start + context_size > num_frames:
- current_context_latents = torch.cat([current_context_latents, latents[:, :, :current_context_start + context_size - num_frames, :, :]], dim=2)
-
- # for context_group in range(num_context_groups):
- # # Calculate the current start index, considering overlap
- # current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
-
- # # Calculate the end index and adjust if it exceeds num_frames
- # current_context_end = (current_context_start + context_size) % num_frames
-
- # # Select the relevant context from the latents with wrap-around handling
- # current_context_latents = torch.cat([
- # latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
- # latents[:, :, :max(current_context_end - num_frames, 0), :, :]
- # ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
-
-

  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
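
The deleted branch above let a context window that runs past the last frame wrap back to frame 0. A minimal, standalone illustration of that wrap-around indexing (hypothetical tensor sizes, not code from this commit):

```python
import torch

num_frames, context_size = 16, 6
latents = torch.randn(1, 4, num_frames, 8, 8)  # (batch, channels, frames, height, width)

current_context_start = 13  # a window starting at frame 13 runs past frame 15
frame_ids = (torch.arange(context_size) + current_context_start) % num_frames
current_context_latents = latents[:, :, frame_ids]  # picks frames 13, 14, 15, 0, 1, 2
print(frame_ids.tolist())  # [13, 14, 15, 0, 1, 2]
```
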
@@ -772,27 +995,43 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
-
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
- progress_bar.update()
- if callback is not None and i % callback_steps == 0:
- callback(i, t, current_context_latents)

  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1

  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter

- # shuffle rotate latent images by step places, wrapping around the last n steps to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)

  # Post-processing
  video_tensor = self.decode_latents(latents)

@@ -807,4 +1046,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  if not return_dict:
  return (video,)

- return AnimateDiffPipelineOutput(frames=video)

  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+ # Updated to use absolute paths
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+ from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
+ from diffusers.models.unet_motion_model import MotionAdapter
+ from diffusers.schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
+ from diffusers.utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
  scale_lora_layers,
  unscale_lora_layers,
  )
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor

+ # Added imports based on the working paths
+ from diffusers.models import ControlNetModel
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+ from diffusers.utils import deprecate
+
+ import torchvision
+ import PIL
+ import PIL.Image
+ import math

  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
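
Switching from the relative `from ...` imports to absolute `diffusers.*` imports lets this file be used outside the diffusers source tree, for example as a standalone custom pipeline. A loading sketch under that assumption (the directory name is a placeholder; the checkpoints are the ones from the example docstring):

```python
import torch
from diffusers import DiffusionPipeline, MotionAdapter

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = DiffusionPipeline.from_pretrained(
    "frankjoshua/toonyou_beta6",
    motion_adapter=adapter,
    custom_pipeline="./my_custom_pipeline",  # hypothetical local folder containing this pipeline.py
    torch_dtype=torch.float16,
).to("cuda")
```
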
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

+ def load_motion_adapter(self,motion_adapter):
+ self.register_modules(motion_adapter=motion_adapter)
+
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,

  ):
  r"""
  Encodes the prompt into text encoder hidden states.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
  The suffixes after the scaling factors represent the stages where they are being applied.
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to

  latents = latents * self.scheduler.init_noise_sigma
  return latents

+ def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
+ latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
+ shape = (
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ for frame in range(num_frames):
+ x_offset = int(frame * x_velocity) # Convert to int
+ y_offset = int(frame * y_velocity) # Convert to int
+ scale_factor = 1 + frame * scale_velocity
+
+ # Apply offsets
+ latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(x_offset,), dims=3) # x direction
+ latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(y_offset,), dims=2) # y direction
+
+ # Apply scaling - This is a simple approach and might not be ideal for all applications
+ if scale_factor != 1:
+ scaled_size = (
+ int(latents.shape[3] * scale_factor),
+ int(latents.shape[4] * scale_factor)
+ )
+ latents[:, :, frame] = torch.nn.functional.interpolate(
+ latents[:, :, frame].unsqueeze(0), size=scaled_size, mode='bilinear', align_corners=False
+ ).squeeze(0)
+
+ return latents
+
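
One caveat with the scaling branch of `prepare_motion_latents`: `interpolate` returns a frame of `scaled_size`, which no longer matches the fixed spatial size of `latents[:, :, frame]`. A shape-preserving variant would resize and then center-crop back to the original latent height and width; a hypothetical helper sketch (not part of this commit):

```python
import torch
import torch.nn.functional as F

def zoom_frame_latents(frame_latents: torch.Tensor, scale_factor: float) -> torch.Tensor:
    """Zoom a single frame's latents (B, C, H, W) and center-crop back to (H, W)."""
    b, c, h, w = frame_latents.shape
    if scale_factor <= 1.0:
        return frame_latents
    scaled = F.interpolate(frame_latents, scale_factor=scale_factor, mode="bilinear", align_corners=False)
    top = (scaled.shape[2] - h) // 2
    left = (scaled.shape[3] - w) // 2
    return scaled[:, :, top : top + h, left : left + w]

# e.g. inside the frame loop:
# latents[:, :, frame] = zoom_frame_latents(latents[:, :, frame], 1 + frame * scale_velocity)
```
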
+ def generate_correlated_noise(self, latents, init_noise_correlation):
+ cloned_latents = latents.clone()
+ p = init_noise_correlation
+ flattened_latents = torch.flatten(cloned_latents)
+ noise = torch.randn_like(flattened_latents)
+ correlated_noise = flattened_latents * p + math.sqrt(1 - p**2) * noise
+
+ return correlated_noise.reshape(cloned_latents.shape)
+
+ def generate_correlated_latents(self, latents, init_noise_correlation):
+ cloned_latents = latents.clone()
+ for i in range(1, cloned_latents.shape[2]):
+ p = init_noise_correlation
+ flattened_latents = torch.flatten(cloned_latents[:, :, i])
+ prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+ correlated_latents = (prev_flattened_latents * p/math.sqrt((1+p**2))+flattened_latents * math.sqrt(1/(1 + p**2)))
+ cloned_latents[:, :, i] = correlated_latents.reshape(cloned_latents[:, :, i].shape)
+
+ return cloned_latents
+
+ def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
+ cloned_latents = latents.clone()
+ for i in range(1, cloned_latents.shape[2]):
+ p = init_noise_correlation
+ flattened_latents = torch.flatten(cloned_latents[:, :, i])
+ prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+ correlated_latents = (
+ prev_flattened_latents * p
+ +
+ flattened_latents * math.sqrt(1 - p**2)
+ )
+ cloned_latents[:, :, i] = correlated_latents.reshape(
+ cloned_latents[:, :, i].shape
+ )
+
+ return cloned_latents
+
+ def generate_mixed_noise(self, noise, init_noise_correlation):
+ shared_noise = torch.randn_like(noise[0, :, 0])
+ for b in range(noise.shape[0]):
+ for f in range(noise.shape[2]):
+ p = init_noise_correlation
+ flattened_latents = torch.flatten(noise[b, :, f])
+ shared_latents = torch.flatten(shared_noise)
+ correlated_latents = (
+ shared_latents * math.sqrt(p**2/(1+p**2)) +
+ flattened_latents * math.sqrt(1/(1+p**2))
+ )
+ noise[b, :, f] = correlated_latents.reshape(noise[b, :, f].shape)
+
+ return noise
+
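
The blending weights in `generate_correlated_latents` and `generate_mixed_noise`, `p/sqrt(1+p**2)` and `sqrt(1/(1+p**2))`, have squares that sum to 1, so mixing two independent unit-variance noise tensors keeps the result at roughly unit variance while correlating it with its neighbour. A quick standalone check of that property:

```python
import math
import torch

p = 0.7
prev = torch.randn(1_000_000)  # stand-in for the previous frame's flattened noise
new = torch.randn(1_000_000)   # stand-in for the current frame's flattened noise

mixed = prev * p / math.sqrt(1 + p**2) + new * math.sqrt(1 / (1 + p**2))

print(p**2 / (1 + p**2) + 1 / (1 + p**2))  # 1.0 -> variance is preserved
print(round(mixed.std().item(), 3))        # ~1.0
print(round(torch.corrcoef(torch.stack([mixed, prev]))[0, 1].item(), 3))  # ~p/sqrt(1+p**2), i.e. ~0.573
```
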
+ def prepare_correlated_latents(
+ self,
+ init_image,
+ init_image_strength,
+ init_noise_correlation,
+ batch_size,
+ num_channels_latents,
+ video_length,
+ height,
+ width,
+ dtype,
+ device,
+ generator,
+ latents=None,
+ ):
+ shape = (
+ batch_size,
+ num_channels_latents,
+ video_length,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+
+ if init_image is not None:
+ start_image = ((torchvision.transforms.functional.pil_to_tensor(init_image))/ 255 )[:3, :, :].to("cuda").to(dtype).unsqueeze(0)
+ start_image = (
+ self.vae.encode(start_image.mul(2).sub(1))
+ .latent_dist.sample()
+ .view(1, 4, height // 8, width // 8)
+ * 0.18215
+ )
+ init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
+ else:
+ init_latents = None
+
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+ if latents is None:
+ rand_device = "cpu" if device.type == "mps" else device
+ if isinstance(generator, list):
+ shape = shape
+ # shape = (1,) + shape[1:]
+ # ignore init latents for batch model
+ latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)for i in range(batch_size)]
+ latents = torch.cat(latents, dim=0).to(device)
+ else:
+ if init_latents is not None:
+ offset = int(
+ init_image_strength * (len(self.scheduler.timesteps) - 1)
+ )
+ noise = torch.randn_like(init_latents)
+ noise = self.generate_correlated_latents(noise, init_noise_correlation)
+
+ # Eric - some black magic here
+ # We should be only adding the noise at timestep[offset], but I noticed that
+ # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
+ # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
+ timesteps = self.scheduler.timesteps
+ average_timestep = None
+ if offset == 0:
+ average_timestep = timesteps[0]
+ elif offset == 1:
+ average_timestep = (
+ timesteps[offset - 1] * (1 - init_image_strength)
+ + timesteps[offset] * init_image_strength
+ )
+ else:
+ average_timestep = timesteps[offset - 1]
+
+ latents = self.scheduler.add_noise(
+ init_latents, noise, average_timestep.long()
+ )
+
+ latents = self.scheduler.add_noise(
+ latents, torch.randn_like(init_latents), timesteps[-2]
+ )
+ else:
+ latents = torch.randn(
+ shape, generator=generator, device=rand_device, dtype=dtype
+ ).to(device)
+ latents = self.generate_correlated_latents(
+ latents, init_noise_correlation
+ )
+ else:
+ if latents.shape != shape:
+ raise ValueError(
+ f"Unexpected latents shape, got {latents.shape}, expected {shape}"
+ )
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ if init_latents is None:
+ latents = latents * self.scheduler.init_noise_sigma
+ # elif self.unet.trained_initial_frames and init_latents is not None:
+ # # we only want to use this as the first frame
+ # init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
+
+ latents = latents.to(device)
+ return latents, init_latents
+
+
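
In `prepare_correlated_latents`, `init_image_strength` is turned into an index into the (descending) timestep schedule: `offset = int(init_image_strength * (len(timesteps) - 1))`, and noise is then added at `timesteps[offset - 1]` (or an interpolated value near the start, per the comment above). Larger offsets select lower-noise timesteps, so higher strengths keep the result closer to the init image. A small arithmetic sketch, assuming a 25-step schedule:

```python
def init_offset(init_image_strength: float, num_timesteps: int = 25) -> int:
    # same formula as in prepare_correlated_latents
    return int(init_image_strength * (num_timesteps - 1))

for strength in (0.0, 0.25, 0.5, 1.0):
    print(strength, init_offset(strength))
# 0.0 -> 0, 0.25 -> 6, 0.5 -> 12, 1.0 -> 24
```
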
  @torch.no_grad()
+ # @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,

  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
+ output_path: Optional[str] = None,
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
+ x_velocity: Optional[float] = 0,
+ y_velocity: Optional[float] = 0,
+ scale_velocity: Optional[float] = 0,
+ init_image: Optional[PipelineImageInput] = None,
+ init_image_strength: Optional[float] = 1.0,
+ init_noise_correlation: Optional[float] = 0.0,
+ latent_mode: Optional[str] = "normal",
  ):
  r"""
  The call function to the pipeline for generation.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is


  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
+ if(latent_mode == "normal"):
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+ elif(latent_mode == "motion"):
+ latents = self.prepare_motion_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ x_velocity=x_velocity,
+ y_velocity=y_velocity,
+ scale_velocity=scale_velocity,
+ )
+ elif(latent_mode == "correlated"):
+ latents, init_latents = self.prepare_correlated_latents(
+ init_image,
+ init_image_strength,
+ init_noise_correlation,
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
+

  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
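
Putting the new arguments together, the three `latent_mode` branches above change only how the initial latents are built. A hedged usage sketch (prompt, velocities, and file name are placeholders, and `pipe` is assumed to be an already-loaded instance of this pipeline):

```python
import PIL.Image

# "normal": same behaviour as the stock pipeline
output = pipe(prompt="a corgi running on the beach", latent_mode="normal")

# "motion": pan/zoom the initial noise a little more each frame
output = pipe(
    prompt="a corgi running on the beach",
    latent_mode="motion",
    x_velocity=2,        # horizontal roll of the latents, in latent pixels per frame
    y_velocity=0,
    scale_velocity=0.0,
)

# "correlated": start from an init image with frame-to-frame correlated noise
output = pipe(
    prompt="a corgi running on the beach",
    latent_mode="correlated",
    init_image=PIL.Image.open("start_frame.png"),  # placeholder path
    init_image_strength=0.5,
    init_noise_correlation=0.9,
)
```
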
 

  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):

  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)


  # select the relevant context from the latents
  current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]

  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents


  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample

  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1

+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, None)
+
  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter

+ # shuffle rotate latent images by step places, wrapping around the last 2 to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

+ print("Done denoising")
+
  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)
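
The `torch.cat` above rotates the frame axis by `step` positions so that the context windows begin from different frames on the next denoising step; it is equivalent to `torch.roll` along the frame dimension. A tiny standalone check:

```python
import torch

step = 2
latents = torch.randn(1, 4, 8, 4, 4)  # (batch, channels, frames, height, width)

rotated_cat = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
rotated_roll = torch.roll(latents, shifts=step, dims=2)

print(torch.equal(rotated_cat, rotated_roll))  # True
```
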
 
+ # save frames
+ if output_path is not None:
+ output_batch_size = 2 # prevents out of memory errors with large videos
+ num_digits = output_path.count('#') # count the number of '#' characters
+ frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
+ for batch in range((num_frames + output_batch_size - 1) // output_batch_size):
+ start_id = batch * output_batch_size
+ end_id = min((batch + 1) * output_batch_size, num_frames)
+ video_tensor = self.decode_latents(latents[:, :, start_id:end_id, :, :])
+ video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+ for f_id, frame in enumerate(video[0]):
+ frame.save(frame_format.format(start_id + f_id))
+ return output_path
+
  # Post-processing
  video_tensor = self.decode_latents(latents)
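
With `output_path` set, frames are decoded in small batches and written straight to disk instead of being returned; the run of `#` characters in the path is replaced with a zero-padded frame index. For example (hypothetical path, and the target directory must already exist because `PIL.Image.Image.save` will not create it):

```python
output_path = "frames/frame_####.png"
num_digits = output_path.count('#')
frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
print(frame_format.format(0))   # frames/frame_0000.png
print(frame_format.format(17))  # frames/frame_0017.png
```
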
 
 
  if not return_dict:
  return (video,)

+ return AnimateDiffPipelineOutput(frames=video)
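
When `output_path` is not given, the pipeline still returns an `AnimateDiffPipelineOutput`, so the frames can be exported as before; a short sketch using the `export_to_gif` helper imported in the example docstring (prompt and file name are placeholders):

```python
from diffusers.utils import export_to_gif

output = pipe(prompt="a corgi running on the beach")
export_to_gif(output.frames[0], "animation.gif")
```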