Update pipeline.py
pipeline.py  CHANGED  (+7 -13)
@@ -1139,8 +1139,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         else:
             batch_size = 1

-        # print promtp embed shape
-        print(prompt_embeds.shape)

         device = self._execution_device

@@ -1176,9 +1174,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             lora_scale=text_encoder_lora_scale,
             clip_skip=clip_skip,
         )
-        # print promtp embed shape
-        print("prompt_embeds shape after encoding")
-        print(prompt_embeds.shape)

         # For classifier free guidance, we need to do two forward passes.
         # Here we concatenate the unconditional and text embeddings into a single batch
@@ -1187,9 +1182,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             # concatenate negative prompt embeddings with prompt embeddings on a new dimension after the first batch dimension
             prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)

-        print("prompt_embeds shape after stacking")
-        print(prompt_embeds.shape)
-
         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
             image_embeds, negative_image_embeds = self.encode_image(
@@ -1433,6 +1425,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 # get the current prompt index based on the current context position (for blending between multiple prompts)
                 context_position = current_context_indexes[0] % context_size
                 current_prompt_index = int(context_position / (context_size / num_prompts))
+
+                print("current_prompt_index", current_prompt_index)
+                print("current prompt embed shape", prompt_embeds[current_prompt_index].shape)
+                # print min and max values of the current prompt embed
+                print("min", torch.min(prompt_embeds[current_prompt_index]))
+                print("max", torch.max(prompt_embeds[current_prompt_index]))

                 if self.controlnet != None and i < int(control_end*num_inference_steps):

@@ -1467,7 +1465,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     down_block_res_samples, mid_block_res_sample = self.controlnet(
                         control_model_input,
                         t,
-                        encoder_hidden_states=controlnet_prompt_embeds,
+                        encoder_hidden_states=controlnet_prompt_embeds[current_prompt_index],
                         controlnet_cond=current_context_conditioning_frames,
                         conditioning_scale=cond_scale,
                         guess_mode=guess_mode,
@@ -1486,10 +1484,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     ).sample

                 else:
-                    # predict the noise residual without contorlnet
-                    # print current context embeding shape
-                    print("current context embeding shape")
-                    print(prompt_embeds[current_prompt_index].shape)
                     noise_pred = self.unet(
                         latent_model_input,
                         t,