Update pipeline.py

pipeline.py  CHANGED  (+298 -59)
Old version of the changed regions (lines marked "-" are removed by this commit):

@@ -20,12 +20,13 @@ import numpy as np
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
-from ...models.lora import adjust_lora_scale_text_encoder
-from ...models.unet_motion_model import MotionAdapter
-from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
@@ -33,17 +34,25 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     logging,
-    replace_example_docstring,
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -53,7 +62,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
-
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

@@ -87,16 +95,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
-
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

@@ -150,6 +155,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
         self,

@@ -165,7 +173,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded

@@ -417,12 +424,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
         The suffixes after the scaling factors represent the stages where they are being applied.
-
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to

@@ -539,8 +543,208 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         latents = latents * self.scheduler.init_noise_sigma
         return latents

     @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,

@@ -561,15 +765,22 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
     ):
         r"""
         The call function to the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

@@ -626,7 +837,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
-
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
@@ -696,17 +906,48 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            num_frames,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )

         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

@@ -719,7 +960,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):

                 latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)

@@ -733,24 +974,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # select the relevent context from the latents
                     current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
-                    # if the context extends past the end of the latents, wrap around to the start
-                    if current_context_start + context_size > num_frames:
-                        current_context_latents = torch.cat([current_context_latents, latents[:, :, :current_context_start + context_size - num_frames, :, :]], dim=2)
-
-                # for context_group in range(num_context_groups):
-                #     # Calculate the current start index, considering overlap
-                #     current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
-
-                #     # Calculate the end index and adjust if it exceeds num_frames
-                #     current_context_end = (current_context_start + context_size) % num_frames
-
-                #     # Select the relevant context from the latents with wrap-around handling
-                #     current_context_latents = torch.cat([
-                #         latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
-                #         latents[:, :, :max(current_context_end - num_frames, 0), :, :]
-                #     ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
-
-

                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents

@@ -772,27 +995,43 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
-
-                    # call the callback, if provided
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-                        if callback is not None and i % callback_steps == 0:
-                            callback(i, t, current_context_latents)

                     #add the context current_context_latents back to the latent sum starting from the current context start
                     latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1

                 latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                 latents = latent_sum / latent_counter

-                # shuffle rotate latent images by step places, wrapping around the last
                 latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

         # Post-processing
         video_tensor = self.decode_latents(latents)

@@ -807,4 +1046,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         if not return_dict:
             return (video,)

-        return AnimateDiffPipelineOutput(frames=video)
New version of the changed regions (lines marked "+" are added by this commit):

@@ -20,12 +20,13 @@ import numpy as np
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+# Updated to use absolute paths
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.models.unet_motion_model import MotionAdapter
+from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,

@@ -33,17 +34,25 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
+from diffusers.utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     logging,
     scale_lora_layers,
     unscale_lora_layers,
 )
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor

+# Added imports based on the working paths
+from diffusers.models import ControlNetModel
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import deprecate
+
+import torchvision
+import PIL
+import PIL.Image
+import math

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
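Switching from package-relative imports (`from ...schedulers import ...`) to absolute `diffusers.*` imports means the file no longer has to live inside the diffusers source tree; it can sit next to a script and be imported directly. A minimal sketch of that usage, assuming the modified file is saved locally as `pipeline.py` (the file name and layout are an assumption, not part of the commit; the checkpoint and scheduler setup are taken from the example docstring below):

import torch
from diffusers import MotionAdapter, DDIMScheduler

from pipeline import AnimateDiffPipeline  # local copy of this modified file

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
pipe = pipe.to("cuda")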
@@ -53,7 +62,6 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

@@ -87,16 +95,13 @@ class AnimateDiffPipelineOutput(BaseOutput):
 class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.

@@ -150,6 +155,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

+    def load_motion_adapter(self,motion_adapter):
+        self.register_modules(motion_adapter=motion_adapter)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
         self,

@@ -165,7 +173,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded

@@ -417,12 +424,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
         The suffixes after the scaling factors represent the stages where they are being applied.
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
@@ -539,8 +543,208 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         latents = latents * self.scheduler.init_noise_sigma
         return latents

+    def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
+                               latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        for frame in range(num_frames):
+            x_offset = int(frame * x_velocity)  # Convert to int
+            y_offset = int(frame * y_velocity)  # Convert to int
+            scale_factor = 1 + frame * scale_velocity
+
+            # Apply offsets
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(x_offset,), dims=3)  # x direction
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=(y_offset,), dims=2)  # y direction
+
+            # Apply scaling - This is a simple approach and might not be ideal for all applications
+            if scale_factor != 1:
+                scaled_size = (
+                    int(latents.shape[3] * scale_factor),
+                    int(latents.shape[4] * scale_factor)
+                )
+                latents[:, :, frame] = torch.nn.functional.interpolate(
+                    latents[:, :, frame].unsqueeze(0), size=scaled_size, mode='bilinear', align_corners=False
+                ).squeeze(0)
+
+        return latents
+
+    def generate_correlated_noise(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        p = init_noise_correlation
+        flattened_latents = torch.flatten(cloned_latents)
+        noise = torch.randn_like(flattened_latents)
+        correlated_noise = flattened_latents * p + math.sqrt(1 - p**2) * noise
+
+        return correlated_noise.reshape(cloned_latents.shape)
+
+    def generate_correlated_latents(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        for i in range(1, cloned_latents.shape[2]):
+            p = init_noise_correlation
+            flattened_latents = torch.flatten(cloned_latents[:, :, i])
+            prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+            correlated_latents = (prev_flattened_latents * p/math.sqrt((1+p**2))+flattened_latents * math.sqrt(1/(1 + p**2)))
+            cloned_latents[:, :, i] = correlated_latents.reshape(cloned_latents[:, :, i].shape)
+
+        return cloned_latents
+
+    def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
+        cloned_latents = latents.clone()
+        for i in range(1, cloned_latents.shape[2]):
+            p = init_noise_correlation
+            flattened_latents = torch.flatten(cloned_latents[:, :, i])
+            prev_flattened_latents = torch.flatten(cloned_latents[:, :, i - 1])
+            correlated_latents = (
+                prev_flattened_latents * p
+                +
+                flattened_latents * math.sqrt(1 - p**2)
+            )
+            cloned_latents[:, :, i] = correlated_latents.reshape(
+                cloned_latents[:, :, i].shape
+            )
+
+        return cloned_latents
+
+    def generate_mixed_noise(self, noise, init_noise_correlation):
+        shared_noise = torch.randn_like(noise[0, :, 0])
+        for b in range(noise.shape[0]):
+            for f in range(noise.shape[2]):
+                p = init_noise_correlation
+                flattened_latents = torch.flatten(noise[b, :, f])
+                shared_latents = torch.flatten(shared_noise)
+                correlated_latents = (
+                    shared_latents * math.sqrt(p**2/(1+p**2)) +
+                    flattened_latents * math.sqrt(1/(1+p**2))
+                )
+                noise[b, :, f] = correlated_latents.reshape(noise[b, :, f].shape)
+
+        return noise
+
+    def prepare_correlated_latents(
+        self,
+        init_image,
+        init_image_strength,
+        init_noise_correlation,
+        batch_size,
+        num_channels_latents,
+        video_length,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+
+        if init_image is not None:
+            start_image = ((torchvision.transforms.functional.pil_to_tensor(init_image))/ 255 )[:3, :, :].to("cuda").to(dtype).unsqueeze(0)
+            start_image = (
+                self.vae.encode(start_image.mul(2).sub(1))
+                .latent_dist.sample()
+                .view(1, 4, height // 8, width // 8)
+                * 0.18215
+            )
+            init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
+        else:
+            init_latents = None
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            rand_device = "cpu" if device.type == "mps" else device
+            if isinstance(generator, list):
+                shape = shape
+                # shape = (1,) + shape[1:]
+                # ignore init latents for batch model
+                latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)for i in range(batch_size)]
+                latents = torch.cat(latents, dim=0).to(device)
+            else:
+                if init_latents is not None:
+                    offset = int(
+                        init_image_strength * (len(self.scheduler.timesteps) - 1)
+                    )
+                    noise = torch.randn_like(init_latents)
+                    noise = self.generate_correlated_latents(noise, init_noise_correlation)
+
+                    # Eric - some black magic here
+                    # We should be only adding the noise at timestep[offset], but I noticed that
+                    # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
+                    # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
+                    timesteps = self.scheduler.timesteps
+                    average_timestep = None
+                    if offset == 0:
+                        average_timestep = timesteps[0]
+                    elif offset == 1:
+                        average_timestep = (
+                            timesteps[offset - 1] * (1 - init_image_strength)
+                            + timesteps[offset] * init_image_strength
+                        )
+                    else:
+                        average_timestep = timesteps[offset - 1]
+
+                    latents = self.scheduler.add_noise(
+                        init_latents, noise, average_timestep.long()
+                    )
+
+                    latents = self.scheduler.add_noise(
+                        latents, torch.randn_like(init_latents), timesteps[-2]
+                    )
+                else:
+                    latents = torch.randn(
+                        shape, generator=generator, device=rand_device, dtype=dtype
+                    ).to(device)
+                    latents = self.generate_correlated_latents(
+                        latents, init_noise_correlation
+                    )
+        else:
+            if latents.shape != shape:
+                raise ValueError(
+                    f"Unexpected latents shape, got {latents.shape}, expected {shape}"
+                )
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        if init_latents is None:
+            latents = latents * self.scheduler.init_noise_sigma
+        # elif self.unet.trained_initial_frames and init_latents is not None:
+        #     # we only want to use this as the first frame
+        #     init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])
+
+        latents = latents.to(device)
+        return latents, init_latents
+
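Two notes on the helpers above (editorial, not part of the commit). `prepare_motion_latents` bakes camera-style motion into the initial noise by `torch.roll`-ing each successive frame a little further along x and y, and optionally rescaling it. `generate_correlated_latents` ties each frame's noise to the previous frame's with the blend prev * p / sqrt(1 + p**2) + cur * sqrt(1 / (1 + p**2)), which keeps the noise at unit variance, so the scheduler's `init_noise_sigma` scaling stays valid, while giving adjacent frames a correlation of p / sqrt(1 + p**2). A quick standalone check of that claim (values are illustrative):

import math
import torch

p = 0.8
prev, cur = torch.randn(1_000_000), torch.randn(1_000_000)
mixed = prev * p / math.sqrt(1 + p**2) + cur * math.sqrt(1 / (1 + p**2))

print(mixed.std())                                       # ~1.0: variance is preserved
print(torch.corrcoef(torch.stack([prev, mixed]))[0, 1])  # ~p / sqrt(1 + p**2), about 0.62

The hunk continues below with the updated `__call__` signature.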
     @torch.no_grad()
+    # @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,

@@ -561,15 +765,22 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
+        output_path: Optional[str] = None,
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
+        x_velocity: Optional[float] = 0,
+        y_velocity: Optional[float] = 0,
+        scale_velocity: Optional[float] = 0,
+        init_image: Optional[PipelineImageInput] = None,
+        init_image_strength: Optional[float] = 1.0,
+        init_noise_correlation: Optional[float] = 0.0,
+        latent_mode: Optional[str] = "normal",
     ):
         r"""
         The call function to the pipeline for generation.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.

@@ -626,7 +837,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
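Taken together, the new arguments select one of three ways to build the initial latents (`latent_mode` of "normal", "motion", or "correlated", dispatched in the next hunk), steer the per-frame noise (`x_velocity`, `y_velocity`, `scale_velocity`, `init_image`, `init_image_strength`, `init_noise_correlation`), and optionally stream decoded frames to disk (`output_path`). A hedged usage sketch; the prompt, frame count, and values are illustrative, not recommendations from the commit, and `num_frames` is assumed to still be a `__call__` argument as in stock AnimateDiff:

frames_pattern = pipe(
    prompt="a boat drifting through a misty canyon, best quality",
    num_frames=64,
    num_inference_steps=25,
    latent_mode="motion",              # initial noise comes from prepare_motion_latents
    x_velocity=0.5,                    # pan the noise a little further right each frame
    scale_velocity=0.0,
    output_path="out/frame_####.png",  # frames are written here in small batches
)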
@@ -696,17 +906,48 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
+        if(latent_mode == "normal"):
+            latents = self.prepare_latents(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+        elif(latent_mode == "motion"):
+            latents = self.prepare_motion_latents(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+                x_velocity=x_velocity,
+                y_velocity=y_velocity,
+                scale_velocity=scale_velocity,
+            )
+        elif(latent_mode == "correlated"):
+            latents, init_latents = self.prepare_correlated_latents(
+                init_image,
+                init_image_strength,
+                init_noise_correlation,
+                batch_size,
+                num_channels_latents,
+                num_frames,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+            )
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
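The denoising loop in the next hunk splits the frame axis into overlapping context windows, denoises each window separately, accumulates the results in `latent_sum`, and divides by how many windows touched each frame (`latent_counter`); after each timestep the frame axis is also rotated by `step` so the window seams land somewhere else next time. A standalone sketch of just that bookkeeping, with made-up sizes and a simplified window schedule (the real schedule and per-window update are not shown in this diff):

import torch

num_frames, context_size, overlap, step = 16, 8, 4, 2
latents = torch.randn(1, 4, num_frames, 8, 8)
denoise = lambda window: window * 0.99  # stand-in for the UNet + scheduler update on one window

latent_sum = torch.zeros_like(latents)
latent_counter = torch.zeros(num_frames)

for start in range(0, num_frames, context_size - overlap):
    window = denoise(latents[:, :, start:start + context_size])
    latent_sum[:, :, start:start + context_size] += window
    latent_counter[start:start + context_size] += 1

latents = latent_sum / latent_counter.reshape(1, 1, num_frames, 1, 1)
# rotate frames by `step` so window boundaries move on the next timestep
latents = torch.cat([latents[:, :, -step:], latents[:, :, :-step]], dim=2)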
@@ -719,7 +960,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):

                 latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)

@@ -733,24 +974,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # select the relevent context from the latents
                     current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]

                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents

@@ -772,27 +995,43 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):

                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample

                     #add the context current_context_latents back to the latent sum starting from the current context start
                     latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1

+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, None)
+
                 latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                 latents = latent_sum / latent_counter

+                # shuffle rotate latent images by step places, wrapping around the last 2 to the start
                 latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

+        print("Done denoising")
+
         if output_type == "latent":
             return AnimateDiffPipelineOutput(frames=latents)

+        # save frames
+        if output_path is not None:
+            output_batch_size = 2  # prevents out of memory errors with large videos
+            num_digits = output_path.count('#')  # count the number of '#' characters
+            frame_format = output_path.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
+            for batch in range((num_frames + output_batch_size - 1) // output_batch_size):
+                start_id = batch * output_batch_size
+                end_id = min((batch + 1) * output_batch_size, num_frames)
+                video_tensor = self.decode_latents(latents[:, :, start_id:end_id, :, :])
+                video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+                for f_id, frame in enumerate(video[0]):
+                    frame.save(frame_format.format(start_id + f_id))
+            return output_path
+
         # Post-processing
         video_tensor = self.decode_latents(latents)

@@ -807,4 +1046,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
         if not return_dict:
             return (video,)

+        return AnimateDiffPipelineOutput(frames=video)
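One practical note on the frame-saving branch above: the run of '#' characters in `output_path` is replaced with a zero-padded frame index, frames are decoded two at a time to bound memory use, and when `output_path` is set the call returns the path pattern instead of an `AnimateDiffPipelineOutput`. For example (the path is illustrative, and its directory must already exist, since `PIL.Image.save` will not create it):

pattern = "out/frame_####.png"
num_digits = pattern.count('#')
frame_format = pattern.replace('#' * num_digits, '{:0' + str(num_digits) + 'd}')
print(frame_format.format(7))   # -> out/frame_0007.png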