Update pipeline.py
pipeline.py (changed): +141 -10

@@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV
 # Updated to use absolute paths
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel, ControlNetModel, MultiControlNetModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.models.unet_motion_model import MotionAdapter
 from diffusers.schedulers import (
@@ -136,6 +136,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             EulerAncestralDiscreteScheduler,
             DPMSolverMultistepScheduler,
         ],
+        controlnet: Optional[Union[ControlNetModel, MultiControlNetModel]] = None,
         feature_extractor: CLIPImageProcessor = None,
         image_encoder: CLIPVisionModelWithProjection = None,
     ):
@@ -148,12 +149,16 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             tokenizer=tokenizer,
             unet=unet,
             motion_adapter=motion_adapter,
+            controlnet=controlnet,
             scheduler=scheduler,
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+        )
 
     def load_motion_adapter(self,motion_adapter):
         self.register_modules(motion_adapter=motion_adapter)
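
The new `control_image_processor` mirrors the one used by diffusers' Stable Diffusion ControlNet pipelines: `do_normalize=False` keeps control images in [0, 1] rather than [-1, 1], and `do_convert_rgb=True` tolerates single-channel inputs such as canny maps. For context, below is a minimal wiring sketch (not part of this commit) showing how a caller would supply the optional `controlnet` component; the checkpoint names are placeholders, and the sketch assumes the standard diffusers `from_pretrained` path, which forwards extra components like `motion_adapter` and `controlnet` to the modified `__init__` above.

import torch
from diffusers import ControlNetModel, MotionAdapter

# Illustrative checkpoints only; any SD-1.5-compatible ControlNet and motion adapter work the same way.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)

# AnimateDiffPipeline here is the modified class defined in this pipeline.py.
pipe = AnimateDiffPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    motion_adapter=adapter,
    controlnet=controlnet,
    torch_dtype=torch.float16,
)

Because `controlnet` defaults to `None`, existing callers that never pass one are unaffected; `register_modules(controlnet=None)` simply records the component as absent.
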
@@ -846,6 +851,10 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         smooth_weight: Optional[float] = 0.5,
         smooth_steps: Optional[int] = 3,
         initial_context_scale: Optional[float] = 1.0,
+        conditioning_frames: Optional[List[PipelineImageInput]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
     ):
         r"""
         The call function to the pipeline for generation.
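
An illustrative call using the four new arguments (again not part of the diff; `pipe` and `pose_frames` are placeholders, and pre-existing arguments such as `prompt`, `num_frames`, and `num_inference_steps` are assumed from the unchanged portion of the signature):

# pose_frames: one control image per generated frame, e.g. 16 openpose maps as PIL images.
result = pipe(
    prompt="a dancer spinning, studio lighting",
    num_frames=16,
    num_inference_steps=25,
    conditioning_frames=pose_frames,
    controlnet_conditioning_scale=1.0,  # float, or one value per ControlNet
    control_guidance_start=0.0,         # fraction of steps at which ControlNet starts
    control_guidance_end=1.0,           # fraction of steps after which it is dropped
)
video_frames = result.frames
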
@@ -910,6 +919,23 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
             returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
         """
+        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+        if controlnet is not None:
+            # align format for control guidance
+            if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+                control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+            elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+                control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+            elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+                mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+                control_guidance_start, control_guidance_end = (
+                    mult * [control_guidance_start],
+                    mult * [control_guidance_end],
+                )
+
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
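
Note that this block relies on `is_compiled_module`, which is assumed to already be imported in this file (in current diffusers it lives in `diffusers.utils.torch_utils`); the diff itself does not add that import. The alignment logic only broadcasts scalar `control_guidance_start`/`control_guidance_end` values to lists so that later per-ControlNet indexing is uniform, as in this standalone illustration of the common case (both passed as floats, two stacked ControlNets):

control_guidance_start, control_guidance_end = 0.0, 0.8
mult = 2  # e.g. a MultiControlNetModel wrapping two nets
if not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
    control_guidance_start, control_guidance_end = (
        mult * [control_guidance_start],
        mult * [control_guidance_end],
    )
print(control_guidance_start, control_guidance_end)  # [0.0, 0.0] [0.8, 0.8]
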
@@ -930,6 +956,19 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             batch_size = prompt_embeds.shape[0]
 
         device = self._execution_device
+
+        if controlnet is not None:
+            if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+                controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+            global_pool_conditions = (
+                controlnet.config.global_pool_conditions
+                if isinstance(controlnet, ControlNetModel)
+                else controlnet.nets[0].config.global_pool_conditions
+            )
+            guess_mode = guess_mode or global_pool_conditions
+
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
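
Two normalizations happen here: a single float `controlnet_conditioning_scale` is expanded to one entry per ControlNet, and `guess_mode` (assumed to be an existing `__call__` argument, since this diff does not add it) is forced on when the ControlNet was trained with `global_pool_conditions`. For example:

controlnet_conditioning_scale = 0.7
num_nets = 3  # a MultiControlNetModel wrapping three nets
if isinstance(controlnet_conditioning_scale, float):
    controlnet_conditioning_scale = [controlnet_conditioning_scale] * num_nets
print(controlnet_conditioning_scale)  # [0.7, 0.7, 0.7]
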
@@ -964,6 +1003,40 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             if do_classifier_free_guidance:
                 image_embeds = torch.cat([negative_image_embeds, image_embeds])
 
+        if controlnet is not None:
+            if isinstance(controlnet, ControlNetModel):
+                conditioning_frames = self.prepare_image(
+                    image=conditioning_frames,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_videos_per_prompt * num_frames,
+                    num_images_per_prompt=num_videos_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=self.do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+            elif isinstance(controlnet, MultiControlNetModel):
+                cond_prepared_frames = []
+                for frame_ in conditioning_frames:
+                    prepared_frame = self.prepare_image(
+                        image=frame_,
+                        width=width,
+                        height=height,
+                        batch_size=batch_size * num_videos_per_prompt * num_frames,
+                        num_images_per_prompt=num_videos_per_prompt,
+                        device=device,
+                        dtype=controlnet.dtype,
+                        do_classifier_free_guidance=self.do_classifier_free_guidance,
+                        guess_mode=guess_mode,
+                    )
+                    cond_prepared_frames.append(prepared_frame)
+                conditioning_frames = cond_prepared_frames
+            else:
+                assert False
+
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
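
The frame preparation defers to a `self.prepare_image` helper that is not shown in this diff; it is assumed to follow the same contract as the helper of the same name in diffusers' Stable Diffusion ControlNet pipelines (resize to `height`/`width`, convert to a float tensor via `control_image_processor`, move to the ControlNet's device and dtype, and duplicate the batch for classifier-free guidance unless guess mode is active). The sketch below is only a rough approximation of that behaviour for the single-ControlNet branch.

import torch

def prepare_control_frames_sketch(control_image_processor, frames, width, height, device, dtype, do_cfg, guess_mode):
    # Rough approximation; the real prepare_image also repeats the batch to
    # batch_size * num_videos_per_prompt * num_frames before duplicating for CFG.
    image = control_image_processor.preprocess(frames, height=height, width=width)  # (F, 3, H, W), values in [0, 1]
    image = image.to(device=device, dtype=dtype)
    if do_cfg and not guess_mode:
        image = torch.cat([image] * 2)  # unconditional + conditional copies
    return image
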
@@ -1051,6 +1124,16 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         # 7 Add image embeds for IP-Adapter
         added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
 
+        # 7.1 Create tensor stating which controlnets to keep
+        if controlnet is not None:
+            controlnet_keep = []
+            for i in range(len(timesteps)):
+                keeps = [
+                    1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                    for s, e in zip(control_guidance_start, control_guidance_end)
+                ]
+                controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
         # divide the initial latents into context groups
 
         def context_scheduler(context_size, overlap, offset, total_frames, total_timesteps):
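
`controlnet_keep` is a per-timestep on/off mask derived from `control_guidance_start`/`control_guidance_end` (one float per step for a single ControlNet, a list per step for a MultiControlNetModel). A worked example with 10 steps and control guidance ending at 50% of the schedule:

timesteps = list(range(10))
control_guidance_start, control_guidance_end = [0.0], [0.5]
controlnet_keep = []
for i in range(len(timesteps)):
    keeps = [
        1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
        for s, e in zip(control_guidance_start, control_guidance_end)
    ]
    controlnet_keep.append(keeps[0])  # single-ControlNet case
print(controlnet_keep)  # [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
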
@@ -1105,15 +1188,63 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if controlnet is not None:
+                    if guess_mode and self.do_classifier_free_guidance:
+                        # Infer ControlNet only for the conditional batch.
+                        control_model_input = latents
+                        control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+                    else:
+                        control_model_input = latent_model_input
+                        controlnet_prompt_embeds = prompt_embeds
+                    controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0)
+
+                    if isinstance(controlnet_keep[i], list):
+                        cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                    else:
+                        controlnet_cond_scale = controlnet_conditioning_scale
+                        if isinstance(controlnet_cond_scale, list):
+                            controlnet_cond_scale = controlnet_cond_scale[0]
+                        cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                    # fold the frame axis into the batch axis: (B, C, F, H, W) -> (B * F, C, H, W)
+                    control_model_input = torch.transpose(control_model_input, 1, 2)
+                    control_model_input = control_model_input.reshape(
+                        (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4])
+                    )
+
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        control_model_input,
+                        t,
+                        encoder_hidden_states=controlnet_prompt_embeds,
+                        controlnet_cond=conditioning_frames,
+                        conditioning_scale=cond_scale,
+                        guess_mode=guess_mode,
+                        return_dict=False,
+                    )
+
+                    # predict the noise residual with the added controlnet residuals
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                    ).sample
+                else:
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                    ).sample
 
                 # sum the noise predictions for the unconditional and text conditioned noise
                 if do_classifier_free_guidance:
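
The only video-specific work before calling the 2D ControlNet is folding the frame axis into the batch axis; the down/mid-block residuals it returns are then passed to the motion UNet unchanged. A shape walk-through with illustrative sizes (2 videos, 4 latent channels, 16 frames, 64x64 latents):

import torch

latents = torch.randn(2, 4, 16, 64, 64)                  # (batch, channels, frames, height, width)
x = torch.transpose(latents, 1, 2)                       # (2, 16, 4, 64, 64)
x = x.reshape((-1, x.shape[2], x.shape[3], x.shape[4]))  # (32, 4, 64, 64) == (batch * frames, channels, H, W)
assert x.shape == (2 * 16, 4, 64, 64)
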
@@ -1176,4 +1307,4 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         if not return_dict:
             return (video,)
 
-        return AnimateDiffPipelineOutput(frames=video)
+        return AnimateDiffPipelineOutput(frames=video)