Update pipeline.py
Browse files- pipeline.py +0 -33
pipeline.py
CHANGED
@@ -982,7 +982,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
982 |
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
|
983 |
# image_batch_size = image.shape[0]
|
984 |
image_batch_size = len(image)
|
985 |
-
print("prepared control image_batch_size", image_batch_size)
|
986 |
|
987 |
# if image_batch_size == 1:
|
988 |
# repeat_by = batch_size
|
@@ -996,9 +995,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
996 |
|
997 |
# if do_classifier_free_guidance and not guess_mode:
|
998 |
# image = torch.cat([image] * 2)
|
999 |
-
|
1000 |
-
print("prepared control image_batch_size", image.shape)
|
1001 |
-
print("prepared control device", image.device)
|
1002 |
|
1003 |
return image
|
1004 |
|
@@ -1258,7 +1254,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1258 |
|
1259 |
# round num frames down to a multiple of (context size - overlap)
|
1260 |
num_frames = (num_frames // (context_size - overlap)) * (context_size - overlap)
|
1261 |
-
print(f"Num frames: {num_frames}")
|
1262 |
|
1263 |
# 5. Prepare latent variables
|
1264 |
num_channels_latents = self.unet.config.in_channels
|
@@ -1408,15 +1403,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1408 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
1409 |
with self.progress_bar(total=len(timesteps)) as progress_bar:
|
1410 |
for i, t in enumerate(timesteps):
|
1411 |
-
print("i", i)
|
1412 |
-
print("t", t)
|
1413 |
noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
|
1414 |
noise_pred_text_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
|
1415 |
latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
|
1416 |
|
1417 |
# for each context group, separately denoise the current timestep
|
1418 |
for context_group in range(len(context_indexes[i])):
|
1419 |
-
print("Denoising context group", context_group, "of", len(context_indexes[i]))
|
1420 |
# calculate the current indexes, considering overlap
|
1421 |
current_context_indexes = context_indexes[i][context_group]
|
1422 |
|
@@ -1428,10 +1420,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1428 |
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
1429 |
|
1430 |
control_end_step = int(control_end*num_inference_steps)
|
1431 |
-
print(i, control_end_step)
|
1432 |
-
print("control_end_step", control_end_step)
|
1433 |
-
if self.controlnet != None and i < control_end_step:
|
1434 |
-
print("adding controlnet")
|
1435 |
|
1436 |
if self.controlnet != None and i < int(control_end*num_inference_steps):
|
1437 |
|
@@ -1477,9 +1465,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1477 |
return_dict=False,
|
1478 |
)
|
1479 |
|
1480 |
-
torch.cuda.synchronize() # Synchronize GPU
|
1481 |
-
print("controlnet time", time.time() - control_start)
|
1482 |
-
torch.cuda.synchronize()
|
1483 |
unet_start = time.time()
|
1484 |
# predict the noise residual with the added controlnet residuals
|
1485 |
noise_pred = self.unet(
|
@@ -1491,8 +1476,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1491 |
down_block_additional_residuals=down_block_res_samples,
|
1492 |
mid_block_additional_residual=mid_block_res_sample,
|
1493 |
).sample
|
1494 |
-
torch.cuda.synchronize()
|
1495 |
-
print("unet time", time.time() - unet_start)
|
1496 |
|
1497 |
else:
|
1498 |
# predict the noise residual without controlnet
|
@@ -1505,9 +1488,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1505 |
cross_attention_kwargs=cross_attention_kwargs,
|
1506 |
added_cond_kwargs=added_cond_kwargs,
|
1507 |
).sample
|
1508 |
-
torch.cuda.synchronize()
|
1509 |
-
print("unet time", time.time() - unet_start)
|
1510 |
-
|
1511 |
|
1512 |
if do_classifier_free_guidance:
|
1513 |
# Start timing for overall guidance process
|
@@ -1520,10 +1500,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1520 |
|
1521 |
noise_pred_uncond, noise_pred_text = torch.chunk(noise_pred, 2, dim=0)
|
1522 |
|
1523 |
-
torch.cuda.synchronize() # Synchronize GPU after chunking
|
1524 |
-
time_chunk_end = time.time()
|
1525 |
-
print("Chunk time: {:.6f} seconds".format(time_chunk_end - time_chunk_start))
|
1526 |
-
|
1527 |
# Timing for batch addition and latent counter increment
|
1528 |
torch.cuda.synchronize() # Synchronize GPU before batch addition
|
1529 |
time_batch_addition_start = time.time()
|
@@ -1533,15 +1509,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
1533 |
noise_pred_text_sum[..., current_context_indexes, :, :] += noise_pred_text
|
1534 |
latent_counter[current_context_indexes] += 1
|
1535 |
|
1536 |
-
torch.cuda.synchronize() # Synchronize GPU after batch addition
|
1537 |
-
time_batch_addition_end = time.time()
|
1538 |
-
print("Batch addition and counter increment time: {:.6f} seconds".format(time_batch_addition_end - time_batch_addition_start))
|
1539 |
-
|
1540 |
-
# End timing for overall guidance process
|
1541 |
-
torch.cuda.synchronize() # Synchronize GPU after overall guidance process
|
1542 |
-
end_guidance_time = time.time()
|
1543 |
-
print("Total guidance time: {:.6f} seconds".format(end_guidance_time - start_guidance_time))
|
1544 |
-
|
1545 |
# set the step index to the current batch
|
1546 |
self.scheduler._step_index = i
|
1547 |
|
|
|
982 |
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
|
983 |
# image_batch_size = image.shape[0]
|
984 |
image_batch_size = len(image)
|
|
|
985 |
|
986 |
# if image_batch_size == 1:
|
987 |
# repeat_by = batch_size
|
|
|
995 |
|
996 |
# if do_classifier_free_guidance and not guess_mode:
|
997 |
# image = torch.cat([image] * 2)
|
|
|
|
|
|
|
998 |
|
999 |
return image
|
1000 |
|
|
|
1254 |
|
1255 |
# round num frames down to a multiple of (context size - overlap)
|
1256 |
num_frames = (num_frames // (context_size - overlap)) * (context_size - overlap)
|
|
|
1257 |
|
1258 |
# 5. Prepare latent variables
|
1259 |
num_channels_latents = self.unet.config.in_channels
|
|
|
1403 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
1404 |
with self.progress_bar(total=len(timesteps)) as progress_bar:
|
1405 |
for i, t in enumerate(timesteps):
|
|
|
|
|
1406 |
noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
|
1407 |
noise_pred_text_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
|
1408 |
latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
|
1409 |
|
1410 |
# for each context group, separately denoise the current timestep
|
1411 |
for context_group in range(len(context_indexes[i])):
|
|
|
1412 |
# calculate the current indexes, considering overlap
|
1413 |
current_context_indexes = context_indexes[i][context_group]
|
1414 |
|
|
|
1420 |
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
1421 |
|
1422 |
control_end_step = int(control_end*num_inference_steps)
|
|
|
|
|
|
|
|
|
1423 |
|
1424 |
if self.controlnet != None and i < int(control_end*num_inference_steps):
|
1425 |
|
|
|
1465 |
return_dict=False,
|
1466 |
)
|
1467 |
|
|
|
|
|
|
|
1468 |
unet_start = time.time()
|
1469 |
# predict the noise residual with the added controlnet residuals
|
1470 |
noise_pred = self.unet(
|
|
|
1476 |
down_block_additional_residuals=down_block_res_samples,
|
1477 |
mid_block_additional_residual=mid_block_res_sample,
|
1478 |
).sample
|
|
|
|
|
1479 |
|
1480 |
else:
|
1481 |
# predict the noise residual without controlnet
|
|
|
1488 |
cross_attention_kwargs=cross_attention_kwargs,
|
1489 |
added_cond_kwargs=added_cond_kwargs,
|
1490 |
).sample
|
|
|
|
|
|
|
1491 |
|
1492 |
if do_classifier_free_guidance:
|
1493 |
# Start timing for overall guidance process
|
|
|
1500 |
|
1501 |
noise_pred_uncond, noise_pred_text = torch.chunk(noise_pred, 2, dim=0)
|
1502 |
|
|
|
|
|
|
|
|
|
1503 |
# Timing for batch addition and latent counter increment
|
1504 |
torch.cuda.synchronize() # Synchronize GPU before batch addition
|
1505 |
time_batch_addition_start = time.time()
|
|
|
1509 |
noise_pred_text_sum[..., current_context_indexes, :, :] += noise_pred_text
|
1510 |
latent_counter[current_context_indexes] += 1
|
1511 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1512 |
# set the step index to the current batch
|
1513 |
self.scheduler._step_index = i
|
1514 |
|