Update pipeline.py
Browse files- pipeline.py +51 -7
pipeline.py
CHANGED
@@ -540,6 +540,36 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
540 |
# scale the initial noise by the standard deviation required by the scheduler
|
541 |
latents = latents * self.scheduler.init_noise_sigma
|
542 |
return latents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
|
544 |
def prepare_latents_consistent(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None,smooth_weight=0.5,smooth_steps=3):
|
545 |
shape = (
|
@@ -954,6 +984,20 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
954 |
generator,
|
955 |
latents,
|
956 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
957 |
elif(latent_mode == "motion"):
|
958 |
latents = self.prepare_motion_latents(
|
959 |
batch_size * num_videos_per_prompt,
|
@@ -1022,15 +1066,15 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1022 |
# Iterate over each index in the context group
|
1023 |
local_context_size = context_size
|
1024 |
if timestep <= 1:
|
1025 |
-
local_context_size = context_size *
|
1026 |
for index in range(local_context_size):
|
1027 |
# if its the first timestep, spread the indexes out evenly over the full frame range, offset by the group index
|
1028 |
-
|
1029 |
-
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
# If frame index exceeds total frames, wrap around
|
1035 |
if frame_index >= total_frames:
|
1036 |
frame_index %= total_frames
|
|
|
540 |
# scale the initial noise by the standard deviation required by the scheduler
|
541 |
latents = latents * self.scheduler.init_noise_sigma
|
542 |
return latents
|
543 |
+
|
544 |
+
def prepare_latents_same_start(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None, context_size=16, blend_frames=4):
    """Prepare initial video latents where every context-window start frame shares frame 0's noise.

    Frames at indices ``i * (context_size - blend_frames)`` are overwritten with
    the noise of frame 0, so each sliding context window begins from identical
    noise — the comment in the original reads "make every
    (context_size-blend_frames) frames have the same noise".

    Args:
        batch_size: Number of latent samples to generate.
        num_channels_latents: Channel count of the latent tensor.
        num_frames: Number of video frames in the latent.
        height: Target image height in pixels (divided by ``self.vae_scale_factor``).
        width: Target image width in pixels (divided by ``self.vae_scale_factor``).
        dtype: dtype used when sampling fresh noise.
        device: Device the latents are created on / moved to.
        generator: ``torch.Generator`` or a list of per-sample generators.
        latents: Optional pre-made latents; when given they are only moved to
            ``device`` (dtype is left unchanged, matching the sibling
            ``prepare_latents`` behavior visible in this file).
        context_size: Sliding-window length in frames.
        blend_frames: Overlap between consecutive windows.

    Returns:
        Latents scaled by ``self.scheduler.init_noise_sigma``.

    Raises:
        ValueError: If a generator list length does not match ``batch_size``,
            or if ``blend_frames >= context_size`` (window stride would be <= 0;
            the original code raised a bare ``ZeroDivisionError`` here).
    """
    shape = (
        batch_size,
        num_channels_latents,
        num_frames,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is None:
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
    else:
        latents = latents.to(device)

    # Stride between consecutive window start frames. Guard against a
    # non-positive stride, which previously caused ZeroDivisionError below.
    loop_size = context_size - blend_frames
    if loop_size <= 0:
        raise ValueError(
            f"context_size ({context_size}) must be greater than blend_frames ({blend_frames})."
        )

    # Copy frame 0's noise onto the start frame of every subsequent window.
    # Start at i = 1: the original's i = 0 iteration assigned frame 0 to
    # itself, a no-op.
    loop_count = num_frames // loop_size
    for i in range(1, loop_count):
        latents[:, :, i * loop_size, :, :] = latents[:, :, 0, :, :]

    # scale the initial noise by the standard deviation required by the scheduler
    latents = latents * self.scheduler.init_noise_sigma
    return latents
|
573 |
|
574 |
def prepare_latents_consistent(self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None,smooth_weight=0.5,smooth_steps=3):
|
575 |
shape = (
|
|
|
984 |
generator,
|
985 |
latents,
|
986 |
)
|
987 |
+
if(latent_mode == "same_start"):
|
988 |
+
latents = self.prepare_latents_same_start(
|
989 |
+
batch_size * num_videos_per_prompt,
|
990 |
+
num_channels_latents,
|
991 |
+
num_frames,
|
992 |
+
height,
|
993 |
+
width,
|
994 |
+
prompt_embeds.dtype,
|
995 |
+
device,
|
996 |
+
generator,
|
997 |
+
latents,
|
998 |
+
context_size=context_size,
|
999 |
+
blend_frames=overlap,
|
1000 |
+
)
|
1001 |
elif(latent_mode == "motion"):
|
1002 |
latents = self.prepare_motion_latents(
|
1003 |
batch_size * num_videos_per_prompt,
|
|
|
1066 |
# Iterate over each index in the context group
|
1067 |
local_context_size = context_size
|
1068 |
if timestep <= 1:
|
1069 |
+
local_context_size = context_size * 1.5
|
1070 |
for index in range(local_context_size):
|
1071 |
# if its the first timestep, spread the indexes out evenly over the full frame range, offset by the group index
|
1072 |
+
if timestep <= 1:
|
1073 |
+
step_size = 2
|
1074 |
+
# make the context group stretch
|
1075 |
+
else:
|
1076 |
+
# Calculate the frame index
|
1077 |
+
frame_index = (group_index * (local_context_size - overlap)) + (offset * timestep) + index
|
1078 |
# If frame index exceeds total frames, wrap around
|
1079 |
if frame_index >= total_frames:
|
1080 |
frame_index %= total_frames
|