Update pipeline.py

pipeline.py  CHANGED  (+59 -1)
@@ -539,6 +539,51 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         latents = latents * self.scheduler.init_noise_sigma
         return latents
 
+    def prepare_motion_latents(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
+                               latents=None, x_velocity=0, y_velocity=0, scale_velocity=0):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # Apply motion and scale dynamics
+        for frame in range(num_frames):
+            x_offset = int(frame * x_velocity)  # torch.roll requires integer shifts
+            y_offset = int(frame * y_velocity)
+            scale_factor = 1 + frame * scale_velocity
+
+            # Apply offsets
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=x_offset, dims=3)  # x direction
+            latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=y_offset, dims=2)  # y direction
+
+            # Apply scaling (zoom-in only) - a simple approach that might not be ideal for all applications
+            if scale_factor > 1:
+                scaled_size = (int(latents.shape[3] * scale_factor), int(latents.shape[4] * scale_factor))
+                scaled = torch.nn.functional.interpolate(  # the per-frame slice is already 4D: (batch, channels, H, W)
+                    latents[:, :, frame], size=scaled_size, mode='bilinear', align_corners=False
+                )
+                top = (scaled_size[0] - latents.shape[3]) // 2   # center-crop the zoomed frame back
+                left = (scaled_size[1] - latents.shape[4]) // 2  # to the original latent resolution
+                latents[:, :, frame] = scaled[:, :, top:top + latents.shape[3], left:left + latents.shape[4]]
+
+        return latents
+
     @torch.no_grad()
     # @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
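For intuition, the motion that prepare_motion_latents injects is a per-frame circular shift of the initial noise: frame f is rolled by int(f * x_velocity) latent pixels horizontally and int(f * y_velocity) vertically, so later frames start from progressively displaced noise. Below is a minimal standalone sketch of that drift (the tensor shape and velocity values are toy choices for illustration, not taken from the commit):

    import torch

    # toy latent video: (batch, channels, frames, height, width)
    latents = torch.randn(1, 4, 16, 64, 64)
    x_velocity, y_velocity = 2, 1  # latent pixels of drift per frame (illustrative values)

    for frame in range(latents.shape[2]):
        x_offset = int(frame * x_velocity)
        y_offset = int(frame * y_velocity)
        # roll this frame's noise; wrap-around at the borders is inherent to torch.roll
        latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=x_offset, dims=3)  # x direction
        latents[:, :, frame] = torch.roll(latents[:, :, frame], shifts=y_offset, dims=2)  # y direction

    # frame 0 is unshifted; frame 15 has drifted 30 px right and 15 px down in latent space

The idea is that the denoiser tends to preserve this correlated drift across frames, which would appear as a gentle panning motion in the decoded video.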
@@ -695,7 +740,18 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
+        # latents = self.prepare_latents(
+        #     batch_size * num_videos_per_prompt,
+        #     num_channels_latents,
+        #     num_frames,
+        #     height,
+        #     width,
+        #     prompt_embeds.dtype,
+        #     device,
+        #     generator,
+        #     latents,
+        # )
+        latents = self.prepare_motion_latents(
             batch_size * num_videos_per_prompt,
             num_channels_latents,
             num_frames,
@@ -705,6 +761,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             device,
             generator,
             latents,
+            x_velocity=0.1,
+            y_velocity=0.1,
         )
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
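For reference, here is a minimal sketch of how the patched pipeline might be run, assuming this commit's pipeline.py is importable locally as a drop-in replacement for diffusers' AnimateDiffPipeline; the checkpoint names are illustrative. In this commit the motion parameters are hardcoded at the call site (x_velocity=0.1, y_velocity=0.1) rather than exposed as __call__ arguments, so the calling code is unchanged:

    import torch
    from diffusers import MotionAdapter, DDIMScheduler
    from diffusers.utils import export_to_gif

    # assumption: the modified AnimateDiffPipeline from this commit's pipeline.py is on the path
    from pipeline import AnimateDiffPipeline

    # illustrative checkpoints; any AnimateDiff-compatible base model / motion adapter pair should work
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
    pipe = AnimateDiffPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", motion_adapter=adapter, torch_dtype=torch.float16
    )
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.to("cuda")

    # the per-frame drift is applied internally by prepare_motion_latents during __call__
    output = pipe(prompt="a rocket lifting off, cinematic", num_frames=16, num_inference_steps=25)
    export_to_gif(output.frames[0], "animation.gif")

Note that with x_velocity = y_velocity = 0.1 and integer-rounded shifts, the drift over 16 frames is at most one latent pixel, so larger velocities (or exposing them as __call__ arguments) may be needed to see pronounced motion.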