Updates from diffusers
pipeline.py CHANGED (+21 -53)
@@ -3,19 +3,19 @@ import re
 from typing import Callable, List, Optional, Union

 import numpy as np
+import PIL
 import torch
+from packaging import version
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 import random
 import sys
 from tqdm.auto import tqdm

 import diffusers
-import PIL
 from diffusers import SchedulerMixin, StableDiffusionPipeline
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
-from diffusers.utils import deprecate, logging
-from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from diffusers.utils import logging


 try:
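For reference, the retained `logging` import is the standard diffusers module-level logger; a minimal sketch of how it is typically used (the message is a placeholder):

    from diffusers.utils import logging

    logger = logging.get_logger(__name__)
    logger.info("pipeline loaded")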
@@ -255,7 +255,6 @@ def get_weighted_text_embeddings(
     no_boseos_middle: Optional[bool] = False,
     skip_parsing: Optional[bool] = False,
     skip_weighting: Optional[bool] = False,
-    **kwargs,
 ):
     r"""
     Prompts can be assigned with local weights using brackets. For example,
@@ -603,7 +602,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         latents = 1 / 0.18215 * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image

@@ -684,8 +683,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
+        callback_steps: int = 1,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
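`callback_steps` now takes a plain `int` (default 1) and the `**kwargs` catch-all is gone. A minimal sketch of driving the callback, assuming this file is loaded as the `lpw_stable_diffusion` community pipeline (checkpoint name and prompt are placeholders):

    import torch
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # placeholder checkpoint
        custom_pipeline="lpw_stable_diffusion",
        torch_dtype=torch.float16,
    ).to("cuda")

    def on_step(step: int, timestep: int, latents: torch.FloatTensor) -> None:
        # invoked every `callback_steps` denoising steps
        print(f"step {step}: latents {tuple(latents.shape)}")

    image = pipe("a (red:1.2) rose", callback=on_step, callback_steps=5).images[0]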
@@ -761,10 +759,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.14.0", message, take_from=kwargs)
-        image = init_image or image
-
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
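With the `init_image` deprecation shim removed, callers must pass `image` directly; a sketch, assuming `pipe` is an instance of this pipeline and `init` is a `PIL.Image`:

    # previously accepted via the deprecation shim:
    #     pipe.img2img(init_image=init, prompt="...")
    # now only the `image` argument is recognized:
    result = pipe.img2img(image=init, prompt="a watercolor landscape", strength=0.75)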
@@ -886,8 +880,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
+        callback_steps: int = 1,
     ):
         r"""
         Function for text-to-image generation.
@@ -963,7 +956,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             callback=callback,
             is_cancelled_callback=is_cancelled_callback,
             callback_steps=callback_steps,
-            **kwargs,
         )

     def img2img(
@@ -982,8 +974,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
+        callback_steps: int = 1,
     ):
         r"""
         Function for image-to-image generation.
@@ -1059,7 +1050,6 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             callback=callback,
             is_cancelled_callback=is_cancelled_callback,
             callback_steps=callback_steps,
-            **kwargs,
         )

     def inpaint(
@@ -1079,8 +1069,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
+        callback_steps: int = 1,
     ):
         r"""
         Function for inpaint.
@@ -1161,13 +1150,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             callback=callback,
             is_cancelled_callback=is_cancelled_callback,
             callback_steps=callback_steps,
-            **kwargs,
         )


     # Borrowed from https://github.com/csaluski/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
     def get_text_latent_space(self, prompt, guidance_scale = 7.5):
-
         # get prompt text embeddings
         text_input = self.tokenizer(
             prompt,
@@ -1177,7 +1164,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             return_tensors="pt",
         )
         text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-
+
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
@@ -1196,7 +1183,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

         return text_embeddings
-
+
     def slerp(self, t, v0, v1, DOT_THRESHOLD=0.9995):
         """ helper function to spherically interpolate two arrays v1 v2
         from https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355
@@ -1293,11 +1280,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
-        save_n_steps: Optional[int] = None,
         **kwargs,):
+
         from diffusers.schedulers import LMSDiscreteScheduler
         batch_size = 1
-
+
         if generator == None:
             generator = torch.Generator("cuda")
         generator_state = generator.get_state()
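Snapshotting the generator state before sampling is what lets `variation` (further down) replay a run from identical noise. The pattern in isolation:

    import torch

    generator = torch.Generator("cuda")
    state = generator.get_state()  # snapshot before any draws

    x1 = torch.randn(4, generator=generator, device="cuda")
    generator.set_state(state)     # rewind to the snapshot
    x2 = torch.randn(4, generator=generator, device="cuda")
    assert torch.equal(x1, x2)     # identical draws after restoring the state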
@@ -1331,27 +1318,8 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
-
-            mid_latents = []
-            mid_images = []
-        else:
-            mid_latents = None
-            mid_images = None
+
         for i, t in tqdm(enumerate(self.scheduler.timesteps)):
-            if save_n_steps:
-                if i % save_n_steps == 0:
-                    # scale and decode the image latents with vae
-                    dec_mid_latents = 1 / 0.18215 * latents
-                    mid_latents.append(dec_mid_latents)
-                    image = self.vae.decode(dec_mid_latents).sample
-
-                    image = (image / 2 + 0.5).clamp(0, 1)
-                    image = image.cpu().permute(0, 2, 3, 1).numpy()
-
-                    if output_type == "pil":
-                        image = self.numpy_to_pil(image)
-                        mid_latents.append(image)
-                    mid_images.append(image)
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
@@ -1359,7 +1327,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
                 latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

             # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

             # perform guidance
             if do_classifier_free_guidance:
@@ -1368,21 +1336,21 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):

             # compute the previous noisy sample x_t -> x_t-1
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs)
+                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
             else:
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents)
+        image = self.vae.decode(latents)

         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
-
+
         if output_type == "pil":
             image = self.numpy_to_pil(image)

-        return {"image": image, "generator_state": generator_state
+        return {"image": image, "generator_state": generator_state}

     def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
         # random vector to move in latent space
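The new `.sample` and `.prev_sample` accessors match the diffusers API that returns output dataclasses instead of bare tensors. A sketch of one denoising step under that API, assuming `unet`, `scheduler`, `latents`, `text_embeddings`, and a timestep `t` are already in scope:

    # UNet2DConditionModel returns a UNet2DConditionOutput; the tensor lives in `.sample`
    noise_pred = unet(latents, t, encoder_hidden_states=text_embeddings).sample

    # SchedulerMixin.step returns a SchedulerOutput; updated latents live in `.prev_sample`
    latents = scheduler.step(noise_pred, t, latents).prev_sample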
@@ -1390,7 +1358,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         rand_mag = torch.sum(torch.abs(rand_t)) / variation_magnitude
         scaled_rand_t = rand_t / rand_mag
         variation_embedding = text_embeddings + scaled_rand_t
-
+
         generator = torch.Generator("cuda")
         generator.set_state(generator_state)
         result = self.diffuse_from_inits(variation_embedding, generator=generator, **kwargs)