Update pipeline.py
Browse files- pipeline.py +7 -3
pipeline.py
CHANGED
@@ -843,6 +843,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
843 |
):
|
844 |
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
|
845 |
image_batch_size = image.shape[0]
|
|
|
846 |
|
847 |
if image_batch_size == 1:
|
848 |
repeat_by = batch_size
|
@@ -856,6 +857,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
856 |
|
857 |
if do_classifier_free_guidance and not guess_mode:
|
858 |
image = torch.cat([image] * 2)
|
|
|
|
|
859 |
|
860 |
return image
|
861 |
|
@@ -1266,6 +1269,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1266 |
else:
|
1267 |
# select the relevant context from the conditioning frames of shape (frame_number, channel, height, width)
|
1268 |
current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
|
|
|
1269 |
else:
|
1270 |
current_context_conditioning_frames = None
|
1271 |
|
@@ -1300,9 +1304,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1300 |
)
|
1301 |
|
1302 |
down_block_res_samples, mid_block_res_sample = self.controlnet(
|
1303 |
-
control_model_input
|
1304 |
-
t
|
1305 |
-
encoder_hidden_states=controlnet_prompt_embeds
|
1306 |
controlnet_cond=current_context_conditioning_frames,
|
1307 |
conditioning_scale=cond_scale,
|
1308 |
guess_mode=guess_mode,
|
|
|
843 |
):
|
844 |
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
|
845 |
image_batch_size = image.shape[0]
|
846 |
+
print("prepared control image_batch_size", image_batch_size)
|
847 |
|
848 |
if image_batch_size == 1:
|
849 |
repeat_by = batch_size
|
|
|
857 |
|
858 |
if do_classifier_free_guidance and not guess_mode:
|
859 |
image = torch.cat([image] * 2)
|
860 |
+
|
861 |
+
print("prepared control image_batch_size", image.shape)
|
862 |
|
863 |
return image
|
864 |
|
|
|
1269 |
else:
|
1270 |
# select the relevant context from the conditioning frames of shape (frame_number, channel, height, width)
|
1271 |
current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
|
1272 |
+
current_context_conditioning_frames = current_context_conditioning_frames.to(device)
|
1273 |
else:
|
1274 |
current_context_conditioning_frames = None
|
1275 |
|
|
|
1304 |
)
|
1305 |
|
1306 |
down_block_res_samples, mid_block_res_sample = self.controlnet(
|
1307 |
+
control_model_input,
|
1308 |
+
t,
|
1309 |
+
encoder_hidden_states=controlnet_prompt_embeds,
|
1310 |
controlnet_cond=current_context_conditioning_frames,
|
1311 |
conditioning_scale=cond_scale,
|
1312 |
guess_mode=guess_mode,
|