smoothieAI committed on
Commit 2aeb2f1 · verified · 1 Parent(s): 47ecd18

Update pipeline.py

Files changed (1)
  1. pipeline.py +64 -304
pipeline.py CHANGED
@@ -20,13 +20,12 @@ import numpy as np
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
- # Updated to use absolute paths
- from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
- from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
- from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
- from diffusers.models.lora import adjust_lora_scale_text_encoder
- from diffusers.models.unet_motion_model import MotionAdapter
- from diffusers.schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,
@@ -34,25 +33,17 @@ from diffusers.schedulers import (
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
- from diffusers.utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
  scale_lora_layers,
  unscale_lora_layers,
  )
- from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
 
- # Added imports based on the working paths
- from diffusers.models import ControlNetModel
- from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
- from diffusers.utils import deprecate
-
- import torchvision
- import PIL
- import PIL.Image
- import math
 
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
@@ -62,6 +53,7 @@ EXAMPLE_DOC_STRING = """
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
@@ -95,13 +87,16 @@ class AnimateDiffPipelineOutput(BaseOutput):
  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
@@ -155,9 +150,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
- def load_motion_adapter(self,motion_adapter):
- self.register_modules(motion_adapter=motion_adapter)
-
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,
@@ -173,6 +165,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded
@@ -424,9 +417,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
  The suffixes after the scaling factors represent the stages where they are being applied.
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
@@ -543,208 +539,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  latents = latents * self.scheduler.init_noise_sigma
  return latents
 
- def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
- latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
- shape = (
- batch_size,
- num_channels_latents,
- num_frames,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
- if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
- )
-
- if latents is None:
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
- else:
- latents = latents.to(device)
-
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
-
- for frame in range(num_frames):
- x_offset = int(frame * x_velocity) # Convert to int
- y_offset = int(frame * y_velocity) # Convert to int
- scale_factor = 1 + frame * scale_velocity
-
- # Apply offsets
- latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(x_offset,), dims=3) # x direction
- latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(y_offset,), dims=2) # y direction
-
- # Apply scaling - This is a simple approach and might not be ideal for all applications
- if scale_factor != 1:
- scaled_size = (
- int(latents.shape[3] * scale_factor),
- int(latents.shape[4] * scale_factor)
- )
- latents[:, :, frame] = torch.nn.functional.interpolate(
- latents[:, :, frame].unsqueeze(0), size=scaled_size, mode='bilinear', align_corners=False
- ).squeeze(0)
-
- return latents
-
- def generate_correlated_noise(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents)
- noise = torch.randn_like(flattened_latents)
- correlated_noise = flattened_latents * p + math.sqrt(1 - p**2) * noise
-
- return correlated_noise.reshape(cloned_latents.shape)
-
- def generate_correlated_latents(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- for i in range(1, cloned_latents.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents[:, :, i])
- prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
- correlated_latents = (prev_flattened_latents * p/math.sqrt((1+p**2))+flattened_latents * math.sqrt(1/(1 + p**2)))
- cloned_latents[:, :, i] = correlated_latents.reshape(cloned_latents[:, :, i].shape)
-
- return cloned_latents
-
- def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
- cloned_latents = latents.clone()
- for i in range(1, cloned_latents.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(cloned_latents[:, :, i])
- prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
- correlated_latents = (
- prev_flattened_latents * p
- +
- flattened_latents * math.sqrt(1 - p**2)
- )
- cloned_latents[:, :, i] = correlated_latents.reshape(
- cloned_latents[:, :, i].shape
- )
-
- return cloned_latents
-
- def generate_mixed_noise(self, noise, init_noise_correlation):
- shared_noise = torch.randn_like(noise[0, :, 0])
- for b in range(noise.shape[0]):
- for f in range(noise.shape[2]):
- p = init_noise_correlation
- flattened_latents = torch.flatten(noise[b, :, f])
- shared_latents = torch.flatten(shared_noise)
- correlated_latents = (
- shared_latents * math.sqrt(p**2/(1+p**2)) +
- flattened_latents * math.sqrt(1/(1+p**2))
- )
- noise[b, :, f] = correlated_latents.reshape(noise[b, :, f].shape)
-
- return noise
-
- def prepare_correlated_latents(
- self,
- init_image,
- init_image_strength,
- init_noise_correlation,
- batch_size,
- num_channels_latents,
- video_length,
- height,
- width,
- dtype,
- device,
- generator,
- latents=None,
- ):
- shape = (
- batch_size,
- num_channels_latents,
- video_length,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
-
- if init_image is not None:
- start_image = ((torchvision.transforms.functional.pil_to_tensor(init_image))/ 255 )[:3, :, :].to("cuda").to(dtype).unsqueeze(0)
- start_image = (
- self.vae.encode(start_image.mul(2).sub(1))
- .latent_dist.sample()
- .view(1, 4, height // 8, width // 8)
- * 0.18215
- )
- init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
- else:
- init_latents = None
-
- if isinstance(generator, list) and len(generator) != batch_size:
- raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
- )
- if latents is None:
- rand_device = "cpu" if device.type == "mps" else device
- if isinstance(generator, list):
- shape = shape
- # shape = (1,) + shape[1:]
- # ignore init latents for batch model
- latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)for i in range(batch_size)]
- latents = torch.cat(latents, dim=0).to(device)
- else:
- if init_latents is not None:
- offset = int(
- init_image_strength * (len(self.scheduler.timesteps) - 1)
- )
- noise = torch.randn_like(init_latents)
- noise = self.generate_correlated_latents(noise, init_noise_correlation)
-
- # Eric - some black magic here
- # We should be only adding the noise at timestep[offset], but I noticed that
- # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
- # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
- timesteps = self.scheduler.timesteps
- average_timestep = None
- if offset == 0:
- average_timestep = timesteps[0]
- elif offset == 1:
- average_timestep = (
- timesteps[offset - 1] * (1 - init_image_strength)
- + timesteps[offset] * init_image_strength
- )
- else:
- average_timestep = timesteps[offset - 1]
-
- latents = self.scheduler.add_noise(
- init_latents, noise, average_timestep.long()
- )
-
- latents = self.scheduler.add_noise(
- latents, torch.randn_like(init_latents), timesteps[-2]
- )
- else:
- latents = torch.randn(
- shape, generator=generator, device=rand_device, dtype=dtype
- ).to(device)
- latents = self.generate_correlated_latents(
- latents, init_noise_correlation
- )
- else:
- if latents.shape != shape:
- raise ValueError(
- f"Unexpected latents shape, got {latents.shape}, expected {shape}"
- )
- latents = latents.to(device)
-
- # scale the initial noise by the standard deviation required by the scheduler
- if init_latents is None:
- latents = latents * self.scheduler.init_noise_sigma
- # elif self.unet.trained_initial_frames and init_latents is not None:
- # # we only want to use this as the first frame
- # init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
-
- latents = latents.to(device)
- return latents, init_latents
-
-
  @torch.no_grad()
- # @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,
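For reference, the helpers deleted above build their initial video noise with an AR(1)-style recurrence: each frame's noise is a blend of the previous frame's noise (weighted by the correlation p) and fresh Gaussian noise, scaled so the per-frame variance stays at one. A minimal standalone sketch of that idea (the function name, shapes, and default correlation are illustrative assumptions, not code from the diff):

import math
import torch

def correlated_video_noise(batch, channels, frames, height, width, correlation=0.9, generator=None):
    """Sample (B, C, F, H, W) noise whose consecutive frames are correlated.

    Frame 0 is plain Gaussian noise; every later frame mixes the previous frame
    with fresh noise, mirroring the recurrence in the removed
    generate_correlated_latents helper, so each frame keeps unit variance.
    """
    noise = torch.randn(batch, channels, frames, height, width, generator=generator)
    p = correlation
    for f in range(1, frames):
        prev, fresh = noise[:, :, f - 1], noise[:, :, f]
        # p^2/(1+p^2) + 1/(1+p^2) == 1, so the blend preserves unit variance
        noise[:, :, f] = prev * (p / math.sqrt(1 + p**2)) + fresh * math.sqrt(1 / (1 + p**2))
    return noise

# Example: strongly correlated noise for 16 frames of 64x64 latents
latents = correlated_video_noise(1, 4, 16, 64, 64, correlation=0.95)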
@@ -765,22 +561,15 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
- output_path: Optional[str] = None,
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
- x_velocity: Optional[float] = 0,
- y_velocity: Optional[float] = 0,
- scale_velocity: Optional[float] = 0,
- init_image: Optional[PipelineImageInput] = None,
- init_image_strength: Optional[float] = 1.0,
- init_noise_correlation: Optional[float] = 0.0,
- latent_mode: Optional[str] = "normal",
  ):
  r"""
  The call function to the pipeline for generation.
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
@@ -837,6 +626,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -906,48 +696,17 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
- if(latent_mode == "normal"):
- latents = self.prepare_latents(
- batch_size * num_videos_per_prompt,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- )
- elif(latent_mode == "motion"):
- latents = self.prepare_motion_latents(
- batch_size * num_videos_per_prompt,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- latents,
- x_velocity=x_velocity,
- y_velocity=y_velocity,
- scale_velocity=scale_velocity,
- )
- elif(latent_mode == "correlated"):
- latents, init_latents = self.prepare_correlated_latents(
- init_image,
- init_image_strength,
- init_noise_correlation,
- batch_size,
- num_channels_latents,
- num_frames,
- height,
- width,
- prompt_embeds.dtype,
- device,
- generator,
- )
-
 
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -960,20 +719,37 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
- with self.progress_bar(total=len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):
 
  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
  latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
 
- # foreach context group seperately denoise the current timestep
  for context_group in range(num_context_groups):
- # calculate to current indexes, considering overlap
- if context_group == 0:current_context_start = 0
- else:current_context_start = context_group * (context_size - overlap)
 
- # select the relevent context from the latents
- current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
 
  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
@@ -995,43 +771,27 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
 
  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1
 
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
- progress_bar.update()
- if callback is not None and i % callback_steps == 0:
- callback(i, t, None)
-
  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter
 
- # shuffle rotate latent images by step places, wrapping around the last 2 to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
 
- print("Done denoising")
-
  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)
 
- # save frames
- if output_path is not None:
- output_batch_size = 2 # prevents out of memory errors with large videos
- num_digits = output_path.count('#') # count the number of '#' characters
- frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
- for batch in range((num_frames + output_batch_size - 1) // output_batch_size):
- start_id = batch * output_batch_size
- end_id = min((batch + 1) * output_batch_size, num_frames)
- video_tensor = self.decode_latents(latents[:, :, start_id:end_id, :, :])
- video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
- for f_id, frame in enumerate(video[0]):
- frame.save(frame_format.format(start_id + f_id))
- return output_path
-
  # Post-processing
  video_tensor = self.decode_latents(latents)
 
@@ -1046,4 +806,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
  if not return_dict:
  return (video,)
 
- return AnimateDiffPipelineOutput(frames=video)
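The output_path branch removed above derives a zero-padded filename template from a run of '#' placeholders before saving frames in small batches. A tiny illustration of that string trick (the pattern below is a made-up example, not a value from the pipeline):

output_path = "frames/frame_####.png"  # hypothetical pattern; each '#' stands for one digit
num_digits = output_path.count('#')    # 4
frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
print(frame_format)             # frames/frame_{:04d}.png
print(frame_format.format(7))   # frames/frame_0007.png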
 
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
+ from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+ from ...models.lora import adjust_lora_scale_text_encoder
+ from ...models.unet_motion_model import MotionAdapter
+ from ...schedulers import (
  DDIMScheduler,
  DPMSolverMultistepScheduler,
  EulerAncestralDiscreteScheduler,
  LMSDiscreteScheduler,
  PNDMScheduler,
  )
+ from ...utils import (
  USE_PEFT_BACKEND,
  BaseOutput,
  logging,
+ replace_example_docstring,
  scale_lora_layers,
  unscale_lora_layers,
  )
+ from ...utils.torch_utils import randn_tensor
+ from ..pipeline_utils import DiffusionPipeline
 
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 
  >>> import torch
  >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
  >>> from diffusers.utils import export_to_gif
+
  >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
  >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
  >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
 
  class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
  r"""
  Pipeline for text-to-video generation.
+
  This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
  implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
  - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
  - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
  Args:
  vae ([`AutoencoderKL`]):
  Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
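The loader mixins listed above are the standard diffusers entry points for attaching extra weights to an assembled pipeline. An illustrative sketch of typical calls on a constructed pipe (the paths and repository IDs are placeholders, not part of this commit):

# Placeholders only: point these at real files / repositories for your setup.
pipe.load_textual_inversion("path/to/learned_embeds.bin")   # textual inversion embedding
pipe.load_lora_weights("path/to/lora_weights")              # LoRA weights
pipe.load_ip_adapter(
    "h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin"
)  # IP-Adapter image-prompt weights, following the documented IPAdapterMixin usage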
 
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
  def encode_prompt(
  self,
 
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
+
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded
 
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
  def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
  r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
  The suffixes after the scaling factors represent the stages where they are being applied.
+
  Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
  that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
  Args:
  s1 (`float`):
  Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
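As a usage note, FreeU is switched on with a single call on the pipeline; the factors below are in the range the FreeU authors report for Stable Diffusion v1.5-style UNets and are a starting point, not values taken from this commit:

# Tune s1/s2/b1/b2 per the FreeU repository's recommendations for your base model.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
# Pipelines that also expose disable_freeu() can switch the mechanism off again.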
 
  latents = latents * self.scheduler.init_noise_sigma
  return latents
 
  @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,
 
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
  ):
  r"""
  The call function to the pipeline for generation.
+
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
 
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  Examples:
+
  Returns:
  [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
  If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
 
 
  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
 
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
 
  # Denoising loop
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ with self.progress_bar(total=num_context_groups * len(timesteps)) as progress_bar:
  for i, t in enumerate(timesteps):
 
  latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
  latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
 
+ # # foreach context group seperately denoise the current timestep
+ # for context_group in range(num_context_groups):
+ # # calculate to current indexes, considering overlap
+ # if context_group == 0:current_context_start = 0
+ # else:current_context_start = context_group * (context_size - overlap)
+
+ # # select the relevent context from the latents
+ # current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
+ # # if the context extends past the end of the latents, wrap around to the start
+ # if current_context_start + context_size > num_frames:
+ # current_context_latents = torch.cat([current_context_latents, latents[:, :, :current_context_start + context_size - num_frames, :, :]], dim=2)
+
  for context_group in range(num_context_groups):
+ # Calculate the current start index, considering overlap
+ current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
+
+ # Calculate the end index and adjust if it exceeds num_frames
+ current_context_end = (current_context_start + context_size) % num_frames
+
+ # Select the relevant context from the latents with wrap-around handling
+ current_context_latents = torch.cat([
+ latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
+ latents[:, :, :max(current_context_end - num_frames, 0), :, :]
+ ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
 
  # expand the latents if we are doing classifier free guidance
  latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
 
  # compute the previous noisy sample x_t -> x_t-1
  current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, current_context_latents)
 
  #add the context current_context_latents back to the latent sum starting from the current context start
  latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
  # add one to the counter for each timestep in the context
  latent_counter[current_context_start : current_context_start + context_size] += 1
 
  latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
  latents = latent_sum / latent_counter
 
+ # shuffle rotate latent images by step places, wrapping around the last n steps to the start
  latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
 
  if output_type == "latent":
  return AnimateDiffPipelineOutput(frames=latents)
 
  # Post-processing
  video_tensor = self.decode_latents(latents)
 
  if not return_dict:
  return (video,)
 
+ return AnimateDiffPipelineOutput(frames=video)
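The new denoising loop above processes the video in overlapping temporal windows (context groups) and averages the overlapping results per frame, wrapping windows that run past the last frame back to the start. A self-contained sketch of just that indexing-and-averaging idea, with illustrative shapes and window sizes (the helper name and the identity stand-in for the per-window denoise are assumptions for the example):

import torch

def sliding_window_average(latents, context_size=16, overlap=4):
    """Average per-window results over overlapping frame windows with wrap-around.

    latents: (B, C, F, H, W). Each window of context_size frames is taken with
    wrap-around indexing, accumulated into a running sum, and finally divided
    by how many windows touched each frame.
    """
    b, c, num_frames, h, w = latents.shape
    stride = context_size - overlap
    num_groups = (num_frames + stride - 1) // stride

    latent_sum = torch.zeros_like(latents)
    counter = torch.zeros(num_frames)

    for g in range(num_groups):
        start = g * stride
        idx = [(start + k) % num_frames for k in range(context_size)]  # wrap past the end
        window = latents[:, :, idx]  # (B, C, context_size, H, W); a real loop denoises this window here
        latent_sum[:, :, idx] += window
        counter[idx] += 1

    return latent_sum / counter.view(1, 1, num_frames, 1, 1)

# Example: a 24-frame latent video, 16-frame windows overlapping by 4 frames
out = sliding_window_average(torch.randn(1, 4, 24, 8, 8))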