Update pipeline.py
pipeline.py CHANGED (+67 -68)
@@ -1009,77 +1009,76 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
# Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=len(timesteps)) as progress_bar:
    for i, t in enumerate(timesteps):
        latent_sum = torch.zeros_like(latents).to(device).to(dtype=torch.float16)
        latent_counter = torch.zeros(num_frames).to(device).to(dtype=torch.float16)

        # denoise the current timestep for each context group separately
        for context_group in range(num_context_groups):
            # calculate the current window start, accounting for overlap
            if context_group == 0:
                current_context_start = 0
            else:
                current_context_start = context_group * (context_size - overlap)

            # select the relevant context frames from the latents
            current_context_latents = latents[:, :, current_context_start : current_context_start + context_size, :, :]

            wrap_count = max(current_context_start + context_size - num_frames, 0)

            # if context_start + context_size > num_frames: append the remaining frames from the start of the latents
            if wrap_count > 0:
                current_context_latents = torch.cat([current_context_latents, latents[:, :, :wrap_count, :, :]], dim=2)

            # expand the latents if we are doing classifier-free guidance
            latent_model_input = torch.cat([current_context_latents] * 2) if do_classifier_free_guidance else current_context_latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
            ).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # pin the scheduler's step index to the current timestep, since
            # step() is called once per context group within the same timestep
            self.scheduler._step_index = i

            # compute the previous noisy sample x_t -> x_t-1
            current_context_latents = self.scheduler.step(noise_pred, t, current_context_latents, **extra_step_kwargs).prev_sample

            # if context_start + context_size > num_frames: remove the appended frames from the end of current_context_latents
            if wrap_count > 0:
                # add the ending frames from current_context_latents to the start of latent_sum
                latent_sum[:, :, 0:wrap_count, :, :] += current_context_latents[:, :, -wrap_count:, :, :]
                # increase the counter for the ending frames
                latent_counter[0:wrap_count] += 1
                # remove the ending frames from current_context_latents
                current_context_latents = current_context_latents[:, :, :-wrap_count, :, :]

            # add current_context_latents back into the latent sum, starting from the current context start
            latent_sum[:, :, current_context_start : current_context_start + context_size, :, :] += current_context_latents
            # add one to the counter for each frame in the context
            latent_counter[current_context_start : current_context_start + context_size] += 1

        # call the callback, if provided
        if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
            progress_bar.update()
            if callback is not None and i % callback_steps == 0:
                callback(i, t, None)

        # average the accumulated latents by how many windows covered each frame
        latent_counter = latent_counter.reshape(1, 1, num_frames, 1, 1)
        latents = latent_sum / latent_counter

        # rotate the latent frames by `step` places, wrapping the last `step` frames around to the start
        latents = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
print("Done denoising")
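A note on the windowing in this hunk: each group's start index advances by context_size - overlap. A minimal standalone sketch of the index math, using hypothetical values for num_frames, context_size, and overlap; the diff does not show how num_context_groups is computed, so the ceil below is an assumption:

import math

# Hypothetical values for illustration only; the real pipeline takes these as arguments.
num_frames = 16
context_size = 6
overlap = 2
stride = context_size - overlap  # each window advances by 4 frames

# Assumption: enough windows to cover every frame at least once.
num_context_groups = math.ceil(num_frames / stride)

for context_group in range(num_context_groups):
    current_context_start = 0 if context_group == 0 else context_group * stride
    end = current_context_start + context_size
    wrap_count = max(end - num_frames, 0)
    print(f"group {context_group}: frames {current_context_start}..{min(end, num_frames) - 1}, wrap {wrap_count}")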
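The wrap-around branch (torch.cat of the window with the head of latents) selects the same frames as indexing modulo num_frames; a quick equivalence check on a toy tensor:

import torch

num_frames, context_size = 16, 6
# (batch, channels, frames, height, width) with toy sizes
latents = torch.randn(1, 4, num_frames, 8, 8)

current_context_start = 12  # the wrapping window from the sketch above
window = latents[:, :, current_context_start : current_context_start + context_size]
wrap_count = max(current_context_start + context_size - num_frames, 0)
if wrap_count > 0:
    window = torch.cat([window, latents[:, :, :wrap_count]], dim=2)

# the same frames, selected with indices taken modulo num_frames
idx = torch.arange(current_context_start, current_context_start + context_size) % num_frames
assert torch.equal(window, latents[:, :, idx])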
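Because overlapping windows are denoised independently and summed into latent_sum, dividing by latent_counter averages them. The sketch below checks, under the same assumed sizes as above, that every frame is covered at least once, so the division never hits zero:

import math
import torch

num_frames, context_size, overlap = 16, 6, 2
stride = context_size - overlap
num_context_groups = math.ceil(num_frames / stride)  # same assumption as above

latent_counter = torch.zeros(num_frames)
for g in range(num_context_groups):
    start = 0 if g == 0 else g * stride
    wrap = max(start + context_size - num_frames, 0)
    if wrap > 0:
        latent_counter[:wrap] += 1
    latent_counter[start : start + context_size] += 1  # slice clamps at num_frames

assert (latent_counter > 0).all()
print(latent_counter)  # frames inside an overlap have counts > 1 and get averaged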
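Finally, the "shuffle rotate" concatenation at the end of the loop is a circular shift along the frame axis, which torch.roll expresses directly; `step = 2` below is hypothetical, since `step` is defined outside this hunk:

import torch

step = 2  # hypothetical value; `step` comes from elsewhere in the pipeline
latents = torch.randn(1, 4, 16, 8, 8)

rotated_cat = torch.cat([latents[:, :, -step:, :, :], latents[:, :, :-step, :, :]], dim=2)
rotated_roll = torch.roll(latents, shifts=step, dims=2)
assert torch.equal(rotated_cat, rotated_roll)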