smoothieAI committed (verified)
Commit e558fee · Parent(s): 42e8f87

Update pipeline.py

Files changed (1)
  1. pipeline.py +29 -22
pipeline.py CHANGED
@@ -1163,24 +1163,34 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
         text_encoder_lora_scale = (
             cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
         )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_videos_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=clip_skip,
-        )
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if do_classifier_free_guidance:
-            # concatenate negative prompt embeddings with prompt embeddings on a new dimension after the first batch dimension
-            prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)
+        # number of prompts, taken from the first dimension of prompt_embeds
+        num_prompts = prompt_embeds.size(0) if prompt_embeds is not None else 0
+        # encode each prompt embedding separately
+
+        prompt_embeds_list = []
+        for p in range(num_prompts):
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds[p].unsqueeze(0),
+                negative_prompt_embeds=negative_prompt_embeds[p].unsqueeze(0),
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=clip_skip,
+            )
+
+            # For classifier-free guidance we need an unconditional and a conditional
+            # embedding for each prompt; keep the pair together rather than running
+            # two separate passes later.
+            if do_classifier_free_guidance:
+                # stack negative prompt embeddings with prompt embeddings on a new dimension after the batch dimension
+                prompt_embeds = torch.stack([negative_prompt_embeds, prompt_embeds], dim=1)
+
+            prompt_embeds_list.append(prompt_embeds)
+
 
         if ip_adapter_image is not None:
             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
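
The loop above swaps the single encode_prompt call for one call per row of prompt_embeds, and stacks each negative/positive pair on a new dimension instead of concatenating along the batch dimension as stock diffusers pipelines do. A minimal standalone sketch of that encode-and-stack pattern, with a stand-in encoder and made-up shapes (only encode_prompt and the stacking come from the diff; everything else here is illustrative):

    import torch

    def encode_stub(embed):
        # stand-in for self.encode_prompt: returns (positive, negative) embeddings
        return embed, torch.zeros_like(embed)

    prompt_embeds = torch.randn(3, 77, 768)        # 3 prompts, CLIP-sized embeddings (assumed)
    prompt_embeds_list = []
    for p in range(prompt_embeds.size(0)):
        pos, neg = encode_stub(prompt_embeds[p].unsqueeze(0))  # batch of 1 per prompt
        pair = torch.stack([neg, pos], dim=1)      # (1, 2, 77, 768): uncond/cond kept together
        prompt_embeds_list.append(pair)

    print(prompt_embeds_list[0].shape)             # torch.Size([1, 2, 77, 768])

One caveat worth noting: in the committed loop, prompt_embeds is reassigned by encode_prompt on every iteration, so prompt_embeds[p] on the second and later iterations indexes the freshly encoded tensor rather than the original batch; the sketch avoids that by binding the results to separate names.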
@@ -1403,8 +1413,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
 
         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        # get the number of prompt from the 1st dimension of prompt_embeds
-        num_prompts = prompt_embeds.shape[0]
         with self.progress_bar(total=len(timesteps)) as progress_bar:
             for i, t in enumerate(timesteps):
                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
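
The per-timestep accumulators above suggest the loop runs the UNet over sliding context windows and averages overlapping predictions. A toy sketch of that accumulate-then-average pattern, with hypothetical window size, stride, and tensor shapes (none of these values come from the diff):

    import torch

    latents = torch.zeros(1, 4, 16, 32, 32)            # (batch, channels, frames, H, W), toy sizes
    noise_pred_uncond_sum = torch.zeros_like(latents)  # running sum, as in the loop above
    counts = torch.zeros(1, 1, 16, 1, 1)               # how many windows covered each frame

    for start in range(0, 16 - 8 + 1, 4):              # hypothetical size-8, stride-4 windows
        window = slice(start, start + 8)
        pred = torch.randn(1, 4, 8, 32, 32)            # stand-in for one UNet call on the window
        noise_pred_uncond_sum[:, :, window] += pred
        counts[:, :, window] += 1

    noise_pred_uncond = noise_pred_uncond_sum / counts # average where windows overlapped

The exact windowing and overlap handling live outside the hunks shown here, so treat this only as the general shape of the computation.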
@@ -1428,8 +1436,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     current_prompt_index = int(context_position / (context_size / num_prompts))
 
                     # print min and max values of the current prompt embed
-                    print("avg", torch.mean(prompt_embeds[current_prompt_index][0]))
-                    print("max", torch.max(prompt_embeds[current_prompt_index][0]))
+                    print("avg", torch.mean(prompt_embeds_list[current_prompt_index][1]))
 
                     # 7 Add image embeds for IP-Adapter
                     added_cond_kwargs = {"image_embeds": image_embeds[min(current_prompt_index, len(image_embeds) - 1)]} if ip_adapter_image is not None else None
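
The prompt lookup divides the context range into num_prompts equal segments, so each window position picks the prompt that owns its slice of the animation. A quick worked check of that arithmetic with assumed values (context_size and context_position are pipeline variables not shown in these hunks):

    context_size = 16      # assumed
    num_prompts = 2        # assumed

    for context_position in range(context_size):
        current_prompt_index = int(context_position / (context_size / num_prompts))
        print(context_position, "->", current_prompt_index)
    # positions 0..7 map to prompt 0, positions 8..15 to prompt 1

Note that int() truncates toward zero here, so the boundary between prompts falls exactly at context_size / num_prompts.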
@@ -1489,7 +1496,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     noise_pred = self.unet(
                         latent_model_input,
                         t,
-                        encoder_hidden_states=prompt_embeds[current_prompt_index],
+                        encoder_hidden_states=prompt_embeds_list[current_prompt_index],
                         cross_attention_kwargs=cross_attention_kwargs,
                         added_cond_kwargs=added_cond_kwargs,
                     ).sample
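
Each UNet call now receives the stacked embedding pair for the window's prompt, and the unconditional and conditional predictions are later combined in the usual classifier-free-guidance form. A one-line reminder of that combination with stand-in tensors (the actual combination happens outside the hunks shown):

    import torch

    guidance_scale = 7.5                               # assumed value
    noise_pred_uncond = torch.randn(1, 4, 8, 32, 32)   # stand-ins for the averaged window sums
    noise_pred_text = torch.randn(1, 4, 8, 32, 32)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)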
 