smoothieAI
/

pipeline_animatediff_context

Model card Files Files and versions

xet

Community

smoothieAI committed on Jan 18, 2024

Commit

1ee3c52

verified ·

1 Parent(s): 788e3dc

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +20 -0

pipeline.py CHANGED Viewed

@@ -1094,6 +1094,10 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                 noise_pred_text_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                 latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
                 # for each context group, separately denoise the current timestep
                 for context_group in range(len(context_indexes[i])):
@@ -1123,6 +1127,11 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                         # add the ending frames from noise_pred_uncond to the start of the noise_pred_uncond_sum
                         noise_pred_uncond_sum[:, :,current_context_indexes, :, :] += noise_pred_uncond
                         noise_pred_text_sum[:, :,current_context_indexes, :, :] += noise_pred_text
                         #increase the counter for the ending frames
                         latent_counter[current_context_indexes] += 1
@@ -1137,6 +1146,17 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                     latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                     noise_pred_uncond = noise_pred_uncond_sum / latent_counter
                     noise_pred_text = noise_pred_text_sum / latent_counter
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                 # print min and max

                 noise_pred_uncond_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                 noise_pred_text_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
                 latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)
+                max_sum = 0
+                min_sum = 0
+                max_sum_cond = 0
+                min_sum_cond = 0
                 # for each context group, separately denoise the current timestep
                 for context_group in range(len(context_indexes[i])):
                         # add the ending frames from noise_pred_uncond to the start of the noise_pred_uncond_sum
                         noise_pred_uncond_sum[:, :,current_context_indexes, :, :] += noise_pred_uncond
                         noise_pred_text_sum[:, :,current_context_indexes, :, :] += noise_pred_text
+                        # track the average min and max for normalization
+                        max_sum += noise_pred_uncond.max()
+                        min_sum += noise_pred_uncond.min()
+                        max_sum_cond += noise_pred_text.max()
+                        min_sum_cond += noise_pred_text.min()
                         #increase the counter for the ending frames
                         latent_counter[current_context_indexes] += 1
                     latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
                     noise_pred_uncond = noise_pred_uncond_sum / latent_counter
                     noise_pred_text = noise_pred_text_sum / latent_counter
+                    # calculate the average min and max for normalization
+                    avg_max = max_sum / latent_counter.sum()
+                    avg_min = min_sum / latent_counter.sum()
+                    avg_max_cond = max_sum_cond / latent_counter.sum()
+                    avg_min_cond = min_sum_cond / latent_counter.sum()
+                    # scale the noise predictions to the range of the avg min and max
+                    noise_pred_uncond = (noise_pred_uncond - avg_min) / (avg_max - avg_min)
+                    noise_pred_text = (noise_pred_text - avg_min_cond) / (avg_max_cond - avg_min_cond)
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                 # print min and max