Update pipeline.py
pipeline.py  CHANGED  (+7 -13)
@@ -1139,8 +1139,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         else:
             batch_size = 1

-        # print promtp embed shape
-        print(prompt_embeds.shape)

         device = self._execution_device

@@ -1176,9 +1174,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             lora_scale=text_encoder_lora_scale,
             clip_skip=clip_skip,
         )
-        # print promtp embed shape
-        print("prompt_embeds shape after encoding")
-        print(prompt_embeds.shape)

         # For classifier free guidance, we need to do two forward passes.
         # Here we concatenate the unconditional and text embeddings into a single batch
@@ -1187,9 +1182,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
             # concatenate negative prompt embeddings with prompt embeddings on a new dimension after the first batch dimension
             prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)

-        print("prompt_embeds shape after stacking")
-        print(prompt_embeds.shape)
-
         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
             image_embeds, negative_image_embeds = self.encode_image(
@@ -1433,6 +1425,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 # get the current prompt index based on the current context position (for blending between multiple prompts)
                 context_position = current_context_indexes[0] % context_size
                 current_prompt_index = int(context_position / (context_size / num_prompts))
+
+                print("current_prompt_index", current_prompt_index)
+                print("current prompt embed shape", prompt_embeds[current_prompt_index].shape)
+                # print min and max values of the current prompt embed
+                print("min", torch.min(prompt_embeds[current_prompt_index]))
+                print("max", torch.max(prompt_embeds[current_prompt_index]))

                 if self.controlnet != None and i < int(control_end*num_inference_steps):

@@ -1467,7 +1465,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     down_block_res_samples, mid_block_res_sample = self.controlnet(
                         control_model_input,
                         t,
-                        encoder_hidden_states=controlnet_prompt_embeds,
+                        encoder_hidden_states=controlnet_prompt_embeds[current_prompt_index],
                         controlnet_cond=current_context_conditioning_frames,
                         conditioning_scale=cond_scale,
                         guess_mode=guess_mode,
@@ -1486,10 +1484,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     ).sample

                 else:
-                    # predict the noise residual without contorlnet
-                    # print current context embeding shape
-                    print("current context embeding shape")
-                    print(prompt_embeds[current_prompt_index].shape)
                     noise_pred = self.unet(
                         latent_model_input,
                         t,