Update pipeline.py
pipeline.py  +29 -22

pipeline.py  CHANGED
@@ -1163,24 +1163,34 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         text_encoder_lora_scale = (
             cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
         )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_videos_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=clip_skip,
-        )

-        #
-
-        #
-
-
-
+        # number of prompts to travel through, taken from the first dimension of prompt_embeds
+        num_prompts = prompt_embeds.size(0) if prompt_embeds is not None else 0
+        # re-encode each prompt embedding separately and collect the results
+
+        prompt_embeds_list = []
+        for p in range(num_prompts):
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds[p].unsqueeze(0),
+                negative_prompt_embeds=negative_prompt_embeds[p].unsqueeze(0),
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if do_classifier_free_guidance:
+                # concatenate negative prompt embeddings with prompt embeddings on a new dimension after the first batch dimension
+                prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)
+
+            prompt_embeds_list.append(prompt_embeds)
+

         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
@@ -1403,8 +1413,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        # get the number of prompt from the 1st dimension of prompt_embeds
-        num_prompts = prompt_embeds.shape[0]
         with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):
                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
@@ -1428,8 +1436,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     current_prompt_index = int(context_position / (context_size / num_prompts))

                     # print min and max values of the current prompt embed
-                    print("avg", torch.mean(
-                    print("max", torch.max(prompt_embeds[current_prompt_index][0]))
+                    print("avg", torch.mean(prompt_embeds_list[current_prompt_index][1]))

                     # 7 Add image embeds for IP-Adapter
                     added_cond_kwargs = {"image_embeds": image_embeds[min(current_prompt_index, len(image_embeds) - 1)]} if ip_adapter_image is not None else None
@@ -1489,7 +1496,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
-                        encoder_hidden_states=
+                        encoder_hidden_states=prompt_embeds_list[current_prompt_index],
                         cross_attention_kwargs=cross_attention_kwargs,
                         added_cond_kwargs=added_cond_kwargs,
                     ).sample
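
The first hunk swaps the single `encode_prompt` call for a loop that encodes each entry of `prompt_embeds` on its own and keeps the unconditional/conditional pair for every prompt in `prompt_embeds_list`. Below is a minimal sketch of that pattern using random tensors in place of the text encoder's output; `encode_one_prompt`, the shapes, and the seeding are illustrative stand-ins, not the pipeline's actual API.

```python
import torch

# Stand-in shapes for CLIP-style text embeddings (assumed: 77 tokens, 768 channels).
SEQ_LEN, EMBED_DIM = 77, 768


def encode_one_prompt(seed: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Hypothetical stand-in for encode_prompt() on a single prompt.

    Returns (negative, positive) embeddings, each of shape [1, SEQ_LEN, EMBED_DIM].
    """
    g = torch.Generator().manual_seed(seed)
    negative = torch.randn(1, SEQ_LEN, EMBED_DIM, generator=g)
    positive = torch.randn(1, SEQ_LEN, EMBED_DIM, generator=g)
    return negative, positive


num_prompts = 3
prompt_embeds_list = []
for p in range(num_prompts):
    negative, positive = encode_one_prompt(seed=p)
    # Mirror the diff's torch.stack([negative_prompt_embeds, prompt_embeds], dim=1):
    # the unconditional and conditional embeddings stay together per prompt.
    pair = torch.stack([negative, positive], dim=1)  # [1, 2, SEQ_LEN, EMBED_DIM]
    prompt_embeds_list.append(pair)

print(len(prompt_embeds_list), prompt_embeds_list[0].shape)
# -> 3 torch.Size([1, 2, 77, 768])
```

Keeping the pairs in a plain Python list, rather than re-concatenating them into one tensor, is what lets the denoising loop pick a different prompt's embeddings for each context window.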
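
The later hunks decide which entry of `prompt_embeds_list` feeds the UNet for a given context window via `current_prompt_index = int(context_position / (context_size / num_prompts))`. The snippet below only walks that arithmetic with made-up numbers; the surrounding loop is not visible in the diff, so the assumption that `context_position` runs from 0 to `context_size - 1` is hypothetical.

```python
# Hypothetical values: a 16-step context range split across 3 prompts.
context_size = 16
num_prompts = 3

for context_position in range(context_size):
    # Same formula as in the diff: evenly partition positions among prompts.
    current_prompt_index = int(context_position / (context_size / num_prompts))
    print(context_position, "->", current_prompt_index)

# Positions 0-5 map to prompt 0, 6-10 to prompt 1, and 11-15 to prompt 2,
# so encoder_hidden_states "travels" through the prompt list as the
# denoising loop advances across context windows.
```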