Update pipeline.py
pipeline.py  +29 -22

pipeline.py  CHANGED
@@ -1163,24 +1163,34 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         text_encoder_lora_scale = (
             cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
         )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_videos_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=clip_skip,
-        )

-        #
-
-        #
-
-
-
+        # number of prompts to travel through, taken from the first dimension of prompt_embeds
+        num_prompts = prompt_embeds.size(0) if prompt_embeds is not None else 0
+        # re-encode each prompt embedding separately and collect the results
+
+        prompt_embeds_list = []
+        for p in range(num_prompts):
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds[p].unsqueeze(0),
+                negative_prompt_embeds=negative_prompt_embeds[p].unsqueeze(0),
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if do_classifier_free_guidance:
+                # concatenate negative prompt embeddings with prompt embeddings on a new dimension after the first batch dimension
+                prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)
+
+            prompt_embeds_list.append(prompt_embeds)
+

         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
@@ -1403,8 +1413,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap

         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        # get the number of prompt from the 1st dimension of prompt_embeds
-        num_prompts = prompt_embeds.shape[0]
         with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):
                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
@@ -1428,8 +1436,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     current_prompt_index = int(context_position / (context_size / num_prompts))

                     # print min and max values of the current prompt embed
-                    print("avg", torch.mean(
-                    print("max", torch.max(prompt_embeds[current_prompt_index][0]))
+                    print("avg", torch.mean(prompt_embeds_list[current_prompt_index][1]))

                     # 7 Add image embeds for IP-Adapter
                     added_cond_kwargs = {"image_embeds": image_embeds[min(current_prompt_index, len(image_embeds) - 1)]} if ip_adapter_image is not None else None
@@ -1489,7 +1496,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
-                        encoder_hidden_states=
+                        encoder_hidden_states=prompt_embeds_list[current_prompt_index],
                         cross_attention_kwargs=cross_attention_kwargs,
                         added_cond_kwargs=added_cond_kwargs,
                     ).sample
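
The first hunk swaps the single `encode_prompt` call for a loop that encodes each entry of `prompt_embeds` on its own and keeps the unconditional/conditional pair for every prompt in `prompt_embeds_list`. Below is a minimal sketch of that pattern using random tensors in place of the text encoder's output; `encode_one_prompt`, the shapes, and the seeding are illustrative stand-ins, not the pipeline's actual API.

```python
import torch

# Stand-in shapes for CLIP-style text embeddings (assumed: 77 tokens, 768 channels).
SEQ_LEN, EMBED_DIM = 77, 768


def encode_one_prompt(seed: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Hypothetical stand-in for encode_prompt() on a single prompt.

    Returns (negative, positive) embeddings, each of shape [1, SEQ_LEN, EMBED_DIM].
    """
    g = torch.Generator().manual_seed(seed)
    negative = torch.randn(1, SEQ_LEN, EMBED_DIM, generator=g)
    positive = torch.randn(1, SEQ_LEN, EMBED_DIM, generator=g)
    return negative, positive


num_prompts = 3
prompt_embeds_list = []
for p in range(num_prompts):
    negative, positive = encode_one_prompt(seed=p)
    # Mirror the diff's torch.stack([negative_prompt_embeds, prompt_embeds], dim=1):
    # the unconditional and conditional embeddings stay together per prompt.
    pair = torch.stack([negative, positive], dim=1)  # [1, 2, SEQ_LEN, EMBED_DIM]
    prompt_embeds_list.append(pair)

print(len(prompt_embeds_list), prompt_embeds_list[0].shape)
# -> 3 torch.Size([1, 2, 77, 768])
```

Keeping the pairs in a plain Python list, rather than re-concatenating them into one tensor, is what lets the denoising loop pick a different prompt's embeddings for each context window.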
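
The later hunks decide which entry of `prompt_embeds_list` feeds the UNet for a given context window via `current_prompt_index = int(context_position / (context_size / num_prompts))`. The snippet below only walks that arithmetic with made-up numbers; the surrounding loop is not visible in the diff, so the assumption that `context_position` runs from 0 to `context_size - 1` is hypothetical.

```python
# Hypothetical values: a 16-step context range split across 3 prompts.
context_size = 16
num_prompts = 3

for context_position in range(context_size):
    # Same formula as in the diff: evenly partition positions among prompts.
    current_prompt_index = int(context_position / (context_size / num_prompts))
    print(context_position, "->", current_prompt_index)

# Positions 0-5 map to prompt 0, 6-10 to prompt 1, and 11-15 to prompt 2,
# so encoder_hidden_states "travels" through the prompt list as the
# denoising loop advances across context windows.
```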