Update pipeline.py
pipeline.py  CHANGED  (+13 -41)
@@ -136,16 +136,13 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
             EulerAncestralDiscreteScheduler,
             DPMSolverMultistepScheduler,
         ],
-        controlnet: Union[ControlNetModel,
-        feature_extractor: CLIPImageProcessor = None,
-        image_encoder: CLIPVisionModelWithProjection = None,
+        controlnet: Union[ControlNetModel, MultiControlNetModel],
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
     ):
         super().__init__()
         unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
-
-        if isinstance(controlnet, (list, tuple)):
-            controlnet = MultiControlNetModel(controlnet)
-
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
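With the list/tuple auto-wrap removed from the constructor, callers that use several ControlNets now have to wrap them in a MultiControlNetModel themselves before building the pipeline. A minimal sketch of both call patterns, assuming standard diffusers imports; the checkpoint names are illustrative only:

    from diffusers import ControlNetModel
    from diffusers.pipelines.controlnet import MultiControlNetModel

    # Single ControlNet: pass the model directly.
    controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")

    # Several ControlNets: wrap the list yourself, since the pipeline
    # no longer converts a plain list/tuple into a MultiControlNetModel.
    controlnet = MultiControlNetModel([
        ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose"),
        ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny"),
    ])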
@@ -1260,31 +1257,15 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
                 # select the relevant context from the latents
                 current_context_latents = latents[:, :, current_context_indexes, :, :]

-                if self.controlnet != None:
-                    # if we are using multiple controlnets, select the context window for each controlnet
-                    if isinstance(controlnet, MultiControlNetModel):
-                        print("length of conditioning_frames", len(conditioning_frames))
-                        current_context_conditioning_frames = [conditioning_frames[c][current_context_indexes, :, :, :] for c in range(len(controlnet.nets))]
-                        # move to device
-                        current_context_conditioning_frames = [c.to(device) for c in current_context_conditioning_frames]
-                        # print shape of current context conditioning frames [0]
-                        print("shape of current context conditioning frames", current_context_conditioning_frames[0].shape)
-                        # print device
-                        print("device of current context conditioning frames", current_context_conditioning_frames[0].device)
-                    else:
-                        # select the relevant context from the conditioning frames of shape (frame_number, channel, height, width)
-                        current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
-                        current_context_conditioning_frames = current_context_conditioning_frames.to(device)
-                else:
-                    current_context_conditioning_frames = None
-
-
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)


                 if self.controlnet != None:
+
+                    current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
+
                     if guess_mode and self.do_classifier_free_guidance:
                         # Infer ControlNet only for the conditional batch.
                         control_model_input = latents
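The replacement branch indexes the conditioning frames with the same current_context_indexes used for the latents, so the ControlNet only sees the frames of the current context window. A self-contained sketch of that indexing, with hypothetical shapes (16 conditioning frames of 3x512x512 and a 4-frame window):

    import torch

    # conditioning frames laid out as (frame_number, channel, height, width)
    conditioning_frames = torch.randn(16, 3, 512, 512)
    current_context_indexes = [3, 4, 5, 6]  # sliding context window

    # same fancy indexing as the pipeline: select along the frame axis
    current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
    print(current_context_conditioning_frames.shape)  # torch.Size([4, 3, 512, 512])

Note that the deleted branch also moved the selection to the target device and handled the MultiControlNetModel case with one frame stack per net; the simplified code assumes a single stack that already lives on the right device.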
@@ -1302,21 +1283,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
                     if isinstance(controlnet_cond_scale, list):
                         controlnet_cond_scale = controlnet_cond_scale[0]
                     cond_scale = controlnet_cond_scale * controlnet_keep[i]
-
+
                     control_model_input = torch.transpose(control_model_input, 1, 2)
-                    control_model_input = control_model_input.reshape(
-
-
-
-                    print("device of control_model_input", control_model_input.device)
-                    print("device of controlnet_prompt_embeds", controlnet_prompt_embeds.device)
-                    print("device of current_context_conditioning_frames", current_context_conditioning_frames.device)
-                    print("shape of control_model_input", current_context_conditioning_frames.shape)
-                    print("device of cond_scale", cond_scale.device)
-                    # print error
-                    except Exception as e:
-                        print("error", e)
-
+                    control_model_input = control_model_input.reshape(
+                        (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
+                    )
+
                     down_block_res_samples, mid_block_res_sample = self.controlnet(
                         control_model_input,
                         t,
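The transpose/reshape pair folds the frame axis into the batch so the 2D ControlNet can consume a 5D video latent: (batch, channels, frames, height, width) becomes (batch * frames, channels, height, width). A runnable sketch with hypothetical sizes (batch 2, 4 latent channels, 8 frames, 64x64 spatial):

    import torch

    control_model_input = torch.randn(2, 4, 8, 64, 64)  # (B, C, F, H, W)

    # swap the channel and frame axes: (B, C, F, H, W) -> (B, F, C, H, W)
    control_model_input = torch.transpose(control_model_input, 1, 2)
    # fold frames into the batch: (B, F, C, H, W) -> (B * F, C, H, W)
    control_model_input = control_model_input.reshape(
        (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
    )
    print(control_model_input.shape)  # torch.Size([16, 4, 64, 64])

reshape rather than view is the right call here: the transposed tensor is no longer contiguous, and reshape copies when it has to.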
@@ -1339,7 +1311,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
                     ).sample

                 else:
-                    # predict the noise residual
+                    # predict the noise residual without controlnet
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
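For orientation, the else branch is the no-ControlNet path: the motion UNet predicts the noise residual from the latents and prompt embeddings alone, while the ControlNet path additionally passes in the down/mid block residuals. A hedged sketch of the two calls, assuming the residual keyword names used by diffusers UNets (these kwargs are not shown in this diff):

    # ControlNet path: residuals from self.controlnet steer the UNet blocks
    noise_pred = self.unet(
        latent_model_input,
        t,
        encoder_hidden_states=prompt_embeds,
        down_block_additional_residuals=down_block_res_samples,
        mid_block_additional_residual=mid_block_res_sample,
    ).sample

    # plain path: predict the noise residual without controlnet
    noise_pred = self.unet(
        latent_model_input,
        t,
        encoder_hidden_states=prompt_embeds,
    ).sample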