Update pipeline.py
Browse files- pipeline.py +203 -6
pipeline.py
CHANGED
@@ -583,6 +583,185 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
583 |
|
584 |
return latents
|
585 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
586 |
@torch.no_grad()
|
587 |
# @replace_example_docstring(EXAMPLE_DOC_STRING)
|
588 |
def __call__(
|
@@ -614,6 +793,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
614 |
x_velocity: Optional[float] = 0,
|
615 |
y_velocity: Optional[float] = 0,
|
616 |
scale_velocity: Optional[float] = 0,
|
|
|
|
|
|
|
617 |
):
|
618 |
r"""
|
619 |
The call function to the pipeline for generation.
|
@@ -753,8 +935,26 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
753 |
# generator,
|
754 |
# latents,
|
755 |
# )
|
756 |
-
latents = self.prepare_motion_latents(
|
757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
758 |
num_channels_latents,
|
759 |
num_frames,
|
760 |
height,
|
@@ -762,10 +962,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
|
|
762 |
prompt_embeds.dtype,
|
763 |
device,
|
764 |
generator,
|
765 |
-
latents,
|
766 |
-
x_velocity=x_velocity,
|
767 |
-
y_velocity=y_velocity,
|
768 |
-
scale_velocity=scale_velocity,
|
769 |
)
|
770 |
|
771 |
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
|
|
583 |
|
584 |
return latents
|
585 |
|
586 |
+
def generate_correlated_noise(self, latents, init_noise_correlation):
    """Blend a latent tensor with fresh Gaussian noise.

    Produces ``p * latents + sqrt(1 - p^2) * N(0, 1)`` over the flattened
    tensor, reshaped back to the input's shape.  The weights keep unit
    variance when the input is unit-variance noise.  The input tensor is
    cloned first and never modified.
    """
    rho = init_noise_correlation
    source = latents.clone()
    flat = source.flatten()
    fresh = torch.randn_like(flat)
    # Variance-preserving mix: rho^2 + (sqrt(1 - rho^2))^2 == 1.
    mixed = flat * rho + math.sqrt(1 - rho**2) * fresh
    return mixed.reshape(source.shape)
|
594 |
+
|
595 |
+
def generate_correlated_latents(self, latents, init_noise_correlation):
    """Correlate each video frame with its predecessor.

    Sweeps the frame axis (dim 2) front to back, replacing frame ``i``
    with the variance-preserving mix
    ``p/sqrt(1+p^2) * frame_{i-1} + sqrt(1/(1+p^2)) * frame_i``.
    Because the sweep reads the already-updated previous frame, the
    correlation propagates down the whole clip.  Works on a clone; the
    input tensor is untouched.
    """
    rho = init_noise_correlation
    out = latents.clone()
    # Mixing weights are loop-invariant; compute them once.
    w_prev = rho / math.sqrt((1 + rho**2))
    w_cur = math.sqrt(1 / (1 + rho**2))
    for frame in range(1, out.shape[2]):
        prev_flat = torch.flatten(out[:, :, frame - 1])
        cur_flat = torch.flatten(out[:, :, frame])
        blended = prev_flat * w_prev + cur_flat * w_cur
        out[:, :, frame] = blended.reshape(out[:, :, frame].shape)
    return out
|
611 |
+
|
612 |
+
def generate_correlated_latents_legacy(self, latents, init_noise_correlation):
    """Legacy frame-to-frame correlation sweep.

    Same front-to-back sweep as ``generate_correlated_latents`` but with
    the older AR(1)-style rule
    ``frame_i = p * frame_{i-1} + sqrt(1 - p^2) * frame_i``, which is
    variance-preserving only when frames are independent unit-variance
    noise.  Kept for backward compatibility.  Operates on a clone.
    """
    rho = init_noise_correlation
    result = latents.clone()
    decay = math.sqrt(1 - rho**2)
    for idx in range(1, result.shape[2]):
        previous = torch.flatten(result[:, :, idx - 1])
        current = torch.flatten(result[:, :, idx])
        blended = previous * rho + current * decay
        result[:, :, idx] = blended.reshape(result[:, :, idx].shape)
    return result
|
628 |
+
|
629 |
+
def generate_mixed_noise(self, noise, init_noise_correlation):
    """Blend one shared Gaussian frame into every (batch, frame) slice.

    Draws a single noise frame shaped like ``noise[0, :, 0]`` and mixes
    it into each slice with variance-preserving weights:
    ``sqrt(p^2/(1+p^2)) * shared + sqrt(1/(1+p^2)) * slice``.
    NOTE: mutates ``noise`` in place and returns the same tensor.
    """
    rho = init_noise_correlation
    common = torch.randn_like(noise[0, :, 0])
    common_flat = torch.flatten(common)
    # Weights do not depend on the loop indices; hoist them.
    w_shared = math.sqrt(rho**2 / (1 + rho**2))
    w_local = math.sqrt(1 / (1 + rho**2))
    for b in range(noise.shape[0]):
        for f in range(noise.shape[2]):
            local_flat = torch.flatten(noise[b, :, f])
            blended = common_flat * w_shared + local_flat * w_local
            noise[b, :, f] = blended.reshape(noise[b, :, f].shape)
    return noise
|
643 |
+
|
644 |
+
def prepare_correlated_latents(
    self,
    init_image,          # path/file of an optional seed image; opened via PIL below
    init_image_strength, # in [0, 1]; scales how far into the timestep schedule noise is added
    init_noise_correlation,
    batch_size,
    num_channels_latents,
    video_length,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
):
    """Build the initial latent tensor for correlated-noise video generation.

    Optionally encodes ``init_image`` through the VAE and repeats it across
    all ``video_length`` frames, then noises it with frame-correlated noise
    via ``self.generate_correlated_latents``.  Returns ``(latents,
    init_latents)`` where ``init_latents`` is ``None`` when no seed image
    was given.

    NOTE(review): the image path hard-codes ``"cuda"`` and ``bfloat16`` and
    assumes 4 latent channels with an 8x VAE downsample (``height // 8``);
    it will not run on CPU/MPS as written — confirm against deployment.
    """
    # Standard 5-D video-latent shape: (B, C, F, H/vsf, W/vsf).
    shape = (
        batch_size,
        num_channels_latents,
        video_length,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )

    if init_image is not None:
        # Load -> [0, 1] tensor, keep RGB only (drops any alpha channel).
        start_image = (
            (
                torchvision.transforms.functional.pil_to_tensor(
                    PIL.Image.open(init_image).resize((width, height))
                )
                / 255
            )[:3, :, :]
            .to("cuda")
            .to(torch.bfloat16)
            .unsqueeze(0)
        )
        # VAE-encode in [-1, 1]; 0.18215 is the SD VAE latent scaling factor.
        start_image = (
            self.vae.encode(start_image.mul(2).sub(1))
            .latent_dist.sample()
            .view(1, 4, height // 8, width // 8)
            * 0.18215
        )
        # Tile the single encoded frame across the whole clip.
        init_latents = start_image.unsqueeze(2).repeat(1, 1, video_length, 1, 1)
    else:
        init_latents = None

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )
    if latents is None:
        # torch.randn with a generator is unsupported on MPS; sample on CPU there.
        rand_device = "cpu" if device.type == "mps" else device
        if isinstance(generator, list):
            shape = shape
            # shape = (1,) + shape[1:]
            # ignore init latents for batch model
            # Per-generator sampling: init_latents is intentionally ignored here.
            latents = [
                torch.randn(
                    shape, generator=generator[i], device=rand_device, dtype=dtype
                )
                for i in range(batch_size)
            ]
            latents = torch.cat(latents, dim=0).to(device)
        else:
            if init_latents is not None:
                # How many scheduler steps of noise the seed image receives.
                offset = int(
                    init_image_strength * (len(self.scheduler.timesteps) - 1)
                )
                noise = torch.randn_like(init_latents)
                noise = self.generate_correlated_latents(
                    noise, init_noise_correlation
                )

                # Eric - some black magic here
                # We should be only adding the noise at timestep[offset], but I noticed that
                # we get more motion and cooler motion if we add the noise at timestep[offset - 1]
                # or offset - 2. However, this breaks the fewer timesteps there are, so let's interpolate
                timesteps = self.scheduler.timesteps
                average_timestep = None
                if offset == 0:
                    average_timestep = timesteps[0]
                elif offset == 1:
                    # Linear interpolation between the two adjacent timesteps.
                    average_timestep = (
                        timesteps[offset - 1] * (1 - init_image_strength)
                        + timesteps[offset] * init_image_strength
                    )
                else:
                    average_timestep = timesteps[offset - 1]

                latents = self.scheduler.add_noise(
                    init_latents, noise, average_timestep.long()
                )

                # Second, uncorrelated noise injection near the end of the
                # schedule (timesteps[-2]) — deliberate, see note above.
                latents = self.scheduler.add_noise(
                    latents, torch.randn_like(init_latents), timesteps[-2]
                )
            else:
                # Pure-noise start: sample then frame-correlate.
                latents = torch.randn(
                    shape, generator=generator, device=rand_device, dtype=dtype
                ).to(device)
                latents = self.generate_correlated_latents(
                    latents, init_noise_correlation
                )
    else:
        if latents.shape != shape:
            raise ValueError(
                f"Unexpected latents shape, got {latents.shape}, expected {shape}"
            )
        latents = latents.to(device)

    # scale the initial noise by the standard deviation required by the scheduler
    if init_latents is None:
        latents = latents * self.scheduler.init_noise_sigma
    elif self.unet.trained_initial_frames and init_latents is not None:
        # we only want to use this as the first frame
        init_latents[:, :, 1:] = torch.zeros_like(init_latents[:, :, 1:])

    latents = latents.to(device)
    return latents, init_latents
|
763 |
+
|
764 |
+
|
765 |
@torch.no_grad()
|
766 |
# @replace_example_docstring(EXAMPLE_DOC_STRING)
|
767 |
def __call__(
|
|
|
793 |
x_velocity: Optional[float] = 0,
|
794 |
y_velocity: Optional[float] = 0,
|
795 |
scale_velocity: Optional[float] = 0,
|
796 |
+
init_image: Optional[str] = None,
|
797 |
+
init_image_strength: Optional[float] = 1.0,
|
798 |
+
init_noise_correlation: Optional[float] = 0.0,
|
799 |
):
|
800 |
r"""
|
801 |
The call function to the pipeline for generation.
|
|
|
935 |
# generator,
|
936 |
# latents,
|
937 |
# )
|
938 |
+
# latents = self.prepare_motion_latents(
|
939 |
+
# batch_size * num_videos_per_prompt,
|
940 |
+
# num_channels_latents,
|
941 |
+
# num_frames,
|
942 |
+
# height,
|
943 |
+
# width,
|
944 |
+
# prompt_embeds.dtype,
|
945 |
+
# device,
|
946 |
+
# generator,
|
947 |
+
# latents,
|
948 |
+
# x_velocity=x_velocity,
|
949 |
+
# y_velocity=y_velocity,
|
950 |
+
# scale_velocity=scale_velocity,
|
951 |
+
# )
|
952 |
+
latents = self.prepare_correlated_latents(
|
953 |
+
self,
|
954 |
+
init_image,
|
955 |
+
init_image_strength,
|
956 |
+
init_noise_correlation,
|
957 |
+
batch_size,
|
958 |
num_channels_latents,
|
959 |
num_frames,
|
960 |
height,
|
|
|
962 |
prompt_embeds.dtype,
|
963 |
device,
|
964 |
generator,
|
965 |
+
latents=None,
|
|
|
|
|
|
|
966 |
)
|
967 |
|
968 |
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|