smoothieAI
/

pipeline_animatediff_context

Model card Files Files and versions Community

smoothieAI committed on Jan 16, 2024

Commit

0e91da2

verified ·

1 Parent(s): d73178b

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +11 -19

pipeline.py CHANGED Viewed

@@ -982,23 +982,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                         print(f"Appending {max(current_context_start + context_size - num_frames, 0)} frames from the start of the latents")
                         current_context_latents = torch.cat([current_context_latents, latents[:, :, :max(current_context_start + context_size - num_frames, 0), :, :]], dim=2)
-                # for context_group in range(num_context_groups):
-                #     # Calculate the current start index, considering overlap
-                #     current_context_start = 0 if context_group == 0 else context_group * (context_size - overlap)
-                #     # Calculate the end index and adjust if it exceeds num_frames
-                #     current_context_end = (current_context_start + context_size) % num_frames
-                #     # Select the relevant context from the latents with wrap-around handling
-                #     current_context_latents = torch.cat([
-                #         latents[:, :, current_context_start:min(current_context_start + context_size, num_frames), :, :],
-                #         latents[:, :, :max(current_context_end - num_frames, 0), :, :]
-                #     ], dim=2) if current_context_start + context_size > num_frames else latents[:, :, current_context_start:current_context_start + context_size, :, :]
                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
                     latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
@@ -1019,9 +1002,18 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
                     #add the context current_context_latents back to the latent sum starting from the current context start
-                    latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1

                         print(f"Appending {max(current_context_start + context_size - num_frames, 0)} frames from the start of the latents")
                         current_context_latents = torch.cat([current_context_latents, latents[:, :, :max(current_context_start + context_size - num_frames, 0), :, :]], dim=2)
                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
                     latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                     # compute the previous noisy sample x_t -> x_t-1
                     current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample
+                    # if context_start + context_size > num_frames: remove the appended frames from the end of the current_context_latents
+                    if current_context_start + context_size > num_frames:
+                        # add the ending frames from current_context_latents to the start of the latent_sum
+                        latent_sum[:, :, -max(current_context_start + context_size - num_frames, 0):, :, :] += current_context_latents[:, :, -max(current_context_start + context_size - num_frames, 0):, :, :]
+                        # increase the counter for the ending frames
+                        latent_counter[-max(current_context_start + context_size - num_frames, 0):] += 1
+                        # remove the ending frames from current_context_latents
+                        current_context_latents = current_context_latents[:, :, :-max(current_context_start + context_size - num_frames, 0), :, :]
                     #add the context current_context_latents back to the latent sum starting from the current context start
+                    latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
                     # add one to the counter for each timestep in the context
                     latent_counter[current_context_start : current_context_start + context_size] += 1