Update pipeline.py

pipeline.py CHANGED (+13, -19)
@@ -1176,11 +1176,16 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             lora_scale=text_encoder_lora_scale,
             clip_skip=clip_skip,
         )
+        # print prompt_embeds shape
+        print("prompt_embeds shape after encoding")
+        print(prompt_embeds.shape)
+
         # For classifier free guidance, we need to do two forward passes.
         # Here we concatenate the unconditional and text embeddings into a single batch
         # to avoid doing two forward passes
         if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            # stack negative and positive prompt embeddings on a new dimension after the first (batch) dimension
+            prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)

         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
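The stack replaces what was presumably the stock batch-dimension `torch.cat` (the removed line's content did not survive extraction and is reconstructed above from upstream diffusers). A minimal sketch of the resulting shapes, with all sizes (2 prompts, CLIP-style 77x768 embeddings) assumed for illustration:

    import torch

    # hypothetical sizes: N prompts, sequence length L, hidden size D
    N, L, D = 2, 77, 768
    negative_prompt_embeds = torch.randn(N, L, D)
    prompt_embeds = torch.randn(N, L, D)

    # dim=1 keeps each prompt's unconditional/conditional pair together
    stacked = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)
    print(stacked.shape)     # torch.Size([2, 2, 77, 768])

    # indexing one prompt later recovers the batch-of-2 pair the UNet
    # receives under classifier-free guidance
    print(stacked[0].shape)  # torch.Size([2, 77, 768])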
@@ -1402,6 +1407,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        # get the number of prompts from the first dimension of prompt_embeds
+        num_prompts = prompt_embeds.shape[0]
         with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):
                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
@@ -1421,9 +1428,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                 if self.controlnet != None and i < int(control_end*num_inference_steps):
-
-                    torch.cuda.synchronize() # Synchronize GPU
-                    control_start = time.time()

                     current_context_conditioning_frames = conditioning_frames[current_context_indexes, :, :, :]
                     current_context_conditioning_frames = torch.cat([current_context_conditioning_frames] * 2) if do_classifier_free_guidance else current_context_conditioning_frames
@@ -1454,6 +1458,10 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     )


+                    # get the current prompt index based on the current context position (for blending between multiple prompts)
+                    context_position = current_context_indexes[0] % context_size
+                    current_prompt_index = int(context_position / (context_size / num_prompts))
+
                     down_block_res_samples, mid_block_res_sample = self.controlnet(
                         control_model_input,
                         t,
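The two new lines map a context window's first frame index to a prompt index, so successive windows cycle through the available prompts. A toy walk-through with assumed values (a context_size of 16 and two prompts; the real values come from the pipeline's call arguments):

    context_size = 16
    num_prompts = 2

    for first_frame_index in (0, 4, 8, 12):
        context_position = first_frame_index % context_size
        current_prompt_index = int(context_position / (context_size / num_prompts))
        print(first_frame_index, "->", current_prompt_index)

    # prints: 0 -> 0, 4 -> 0, 8 -> 1, 12 -> 1
    # windows starting in the first half of the cycle use prompt 0,
    # those starting in the second half use prompt 1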
@@ -1464,12 +1472,11 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                         return_dict=False,
                     )

-                    unet_start = time.time()
                     # predict the noise residual with the added controlnet residuals
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
-                        encoder_hidden_states=prompt_embeds,
+                        encoder_hidden_states=prompt_embeds[current_prompt_index],
                         cross_attention_kwargs=cross_attention_kwargs,
                         added_cond_kwargs=added_cond_kwargs,
                         down_block_additional_residuals=down_block_res_samples,
@@ -1478,8 +1485,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

                 else:
                     # predict the noise residual without controlnet
-                    torch.cuda.synchronize()
-                    unet_start = time.time()
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
@@ -1489,19 +1494,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     ).sample

                 if do_classifier_free_guidance:
-                    # Start timing for overall guidance process
-                    torch.cuda.synchronize() # Synchronize GPU before starting timing
-                    start_guidance_time = time.time()
-
-                    # Timing for chunk operation
-                    torch.cuda.synchronize() # Synchronize GPU before chunking
-                    time_chunk_start = time.time()

                     noise_pred_uncond, noise_pred_text = torch.chunk(noise_pred, 2, dim=0)
-
-                    # Timing for batch addition and latent counter increment
-                    torch.cuda.synchronize() # Synchronize GPU before batch addition
-                    time_batch_addition_start = time.time()

                     # Perform batch addition
                     noise_pred_uncond_sum[..., current_context_indexes, :, :] += noise_pred_uncond
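With the timing scaffolding removed, what remains is the sliding-window accumulation: each window's unconditional prediction is added into a running per-frame sum. A self-contained sketch of that indexing pattern, with every shape assumed (the real tensor mirrors the pipeline's latents; any averaging counter lives outside this diff):

    import torch

    # assumed latent layout: [batch, channels, frames, height, width]
    B, C, F, H, W = 1, 4, 16, 8, 8
    noise_pred_uncond_sum = torch.zeros(B, C, F, H, W)

    current_context_indexes = [4, 5, 6, 7]  # one window of 4 frames
    noise_pred_uncond = torch.randn(B, C, len(current_context_indexes), H, W)

    # `...` spans the batch and channel dims; the index list selects the frame dim
    noise_pred_uncond_sum[..., current_context_indexes, :, :] += noise_pred_uncond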