Update pipeline.py

pipeline.py  +30 −27  CHANGED
@@ -1005,7 +1005,27 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # divide the initial latents into context groups
-        num_context_groups = (num_frames // (context_size - overlap)) + 1
+
+        def context_scheduler(context_size, overlap, num_frames, num_timesteps):
+            num_context_groups = (num_frames // (context_size - overlap)) + 1
+            context_indexes = []
+            for t in range(num_timesteps):
+                context_groups = []
+                for context_group_index in range(num_context_groups):
+                    context_group = []
+                    for i in range(context_size):
+                        # calculate the frame index; the (t + 1) factor rotates the window starts between timesteps
+                        frame_index = ((t + 1) * context_group_index * (context_size - overlap)) + i
+                        # wrap around at the end of the clip
+                        if frame_index >= num_frames: frame_index %= num_frames
+                        context_group.append(frame_index)
+                    context_groups.append(context_group)
+                context_indexes.append(context_groups)
+            return context_indexes
+
+        context_indexes = context_scheduler(context_size, overlap, num_frames, len(timesteps))
+
+        print(f"Context indexes: {context_indexes}")

        # Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
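For intuition about what the new scheduler returns, the snippet below runs a standalone copy of `context_scheduler` with toy parameters (`context_size=4`, `overlap=2`, `num_frames=8`, two timesteps; these values are illustrative, not the pipeline's defaults). Each timestep gets its own list of context groups, and the `(t + 1)` factor moves the window starts between timesteps, so the seams between windows do not sit on the same frames every step; this per-timestep rotation is also what lets the final hunk drop the explicit latent rotation at the end of each denoising step.

    # Standalone copy of context_scheduler from the diff above, for experimentation.
    def context_scheduler(context_size, overlap, num_frames, num_timesteps):
        num_context_groups = (num_frames // (context_size - overlap)) + 1
        context_indexes = []
        for t in range(num_timesteps):
            context_groups = []
            for context_group_index in range(num_context_groups):
                context_group = []
                for i in range(context_size):
                    frame_index = ((t + 1) * context_group_index * (context_size - overlap)) + i
                    context_group.append(frame_index % num_frames)  # wrap at the clip end
                context_groups.append(context_group)
            context_indexes.append(context_groups)
        return context_indexes

    # Toy values, chosen only for readability of the output.
    for t, groups in enumerate(context_scheduler(context_size=4, overlap=2, num_frames=8, num_timesteps=2)):
        print(t, groups)
    # 0 [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 0, 1], [0, 1, 2, 3]]
    # 1 [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 2, 3]]

One consequence worth flagging: whenever `(t + 1) * (context_size - overlap)` is a multiple of `num_frames`, every window starts at frame 0 (with the toy values above this happens at `t = 3`), so the remaining frames get no prediction at that timestep and their `latent_counter` entries stay zero; any later division by the counter would need to guard against that.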
@@ -1013,24 +1033,16 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
            for i, t in enumerate(timesteps):
                noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                noise_pred_text_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
-                # latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)

                # for each context group, separately denoise the current timestep
-                for context_group in range(num_context_groups):
+                for context_group in range(len(context_indexes[i])):
                    # calculate the current indexes, considering overlap
-
-                    else: current_context_start = context_group * (context_size - overlap)
+                    current_context_indexes = context_indexes[i][context_group]

                    # select the relevant context from the latents
-                    current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]
-
-                    wrap_count = max(current_context_start + context_size - num_frames, 0)
+                    current_context_latents = latents[:, :, current_context_indexes, :, :]

-                    # if context_start + context_size > num_frames: append the remaining frames from the start of the latents
-                    if wrap_count > 0:
-                        current_context_latents = torch.cat([current_context_latents, latents[:, :, :wrap_count, :, :]], dim=2)
-
                    # expand the latents if we are doing classifier free guidance
                    latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
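The replacement for the old `wrap_count` logic relies on PyTorch advanced indexing: indexing the frame dimension with a Python list gathers the listed frames in order, wrapped windows such as `[6, 7, 0, 1]` included, in a single call. A minimal check (the tensor shape is made up; the real latents are batch x channels x frames x height x width):

    import torch

    # Dummy latents: batch=1, channels=4, frames=8, height=2, width=2 (shape is illustrative).
    latents = torch.randn(1, 4, 8, 2, 2)
    ctx = [6, 7, 0, 1]  # a wrapped context group from the toy schedule above

    window = latents[:, :, ctx, :, :]  # gathers frames 6, 7, 0, 1 in that order
    assert window.shape == (1, 4, 4, 2, 2)
    assert torch.equal(window[:, :, 2], latents[:, :, 0])  # third slot is frame 0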
@@ -1047,18 +1059,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                    # sum the noise predictions for the unconditional and text conditioned noise
                    if do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-
-
-
-
-
-
-                        # remove the ending frames from noise_pred_uncond
-                        noise_pred_uncond = noise_pred_uncond[:, :, :-wrap_count, :, :]
-                        noise_pred_text = noise_pred_text[:, :, :-wrap_count, :, :]
-                        noise_pred_uncond_sum[:, :, current_context_start : current_context_start + context_size, :, :] += noise_pred_uncond
-                        noise_pred_text_sum[:, :, current_context_start : current_context_start + context_size, :, :] += noise_pred_text
-                        latent_counter[current_context_start : current_context_start + context_size] += 1
+
+                        # accumulate the noise predictions for the frames in this context group
+                        noise_pred_uncond_sum[:, :, current_context_indexes, :, :] += noise_pred_uncond
+                        noise_pred_text_sum[:, :, current_context_indexes, :, :] += noise_pred_text
+                        # increment the per-frame counter for this context group
+                        latent_counter[current_context_indexes] += 1

                # set the step index to the current batch
                self.scheduler._step_index = i
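The code that turns these accumulators back into a denoising step falls between the hunks shown, so it is not part of this diff. Below is a minimal sketch of what that step presumably looks like, assuming the standard diffusers loop; `guidance_scale` and `extra_step_kwargs` are assumed to exist in the surrounding `__call__`, and the broadcast shape for `latent_counter` is an assumption:

    # Sketch only: the combine step is not shown in this diff.
    # Average each frame's accumulated predictions by its visit count
    # (frames never visited at this timestep would need a guard; see the
    # coverage note after the first hunk).
    counter = latent_counter.reshape(1, 1, -1, 1, 1)
    noise_pred_uncond = noise_pred_uncond_sum / counter
    noise_pred_text = noise_pred_text_sum / counter
    # classifier-free guidance on the averaged predictions
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    # one scheduler step over the full-length latents
    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample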
@@ -1078,9 +1084,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, None)
-
-                # offset latent images by step places, wrapping around the last frames to the start
-                latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)

        if output_type == "latent":
            return AnimateDiffPipelineOutput(frames=latents)
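One caveat on the accumulation pattern, as a side note rather than part of the commit: advanced-index `+=` in PyTorch does not accumulate over duplicate indexes, so if a wrapped context group ever lists the same frame twice (possible once `context_size` approaches `num_frames`), that frame's contribution is written once rather than twice. `Tensor.index_add_` is the accumulating alternative:

    import torch

    counter = torch.zeros(4)
    counter[torch.tensor([0, 0, 1])] += 1  # duplicate index 0 collapses to one write
    print(counter)  # tensor([1., 1., 0., 0.])

    counter = torch.zeros(4)
    counter.index_add_(0, torch.tensor([0, 0, 1]), torch.ones(3))
    print(counter)  # tensor([2., 1., 0., 0.])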