Replaced init_image with image to match the upstream git change.
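
This commit renames the `init_image` argument to `image` across `__call__`, `img2img`, and `inpaint` (signatures, docstrings, and call sites), and adds a shim in `__call__` so the old keyword keeps working, with a deprecation notice, until it is removed in 0.12.0. Below is a minimal usage sketch of the renamed argument; the checkpoint id and the `custom_pipeline` name are illustrative assumptions, not part of this commit.

    # Hypothetical usage sketch for the renamed argument (not part of this commit).
    import torch
    from PIL import Image
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",        # assumed base checkpoint
        custom_pipeline="lpw_stable_diffusion",  # assumed name for this community pipeline
        torch_dtype=torch.float16,
    ).to("cuda")

    init = Image.open("input.png").convert("RGB").resize((512, 512))

    # After this change, the starting image is passed as `image` (was `init_image`).
    result = pipe.img2img(image=init, prompt="a watercolor landscape", strength=0.8)
    result.images[0].save("out.png")
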
pipeline.py +26 -23
pipeline.py
CHANGED
@@ -579,7 +579,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -607,11 +607,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -629,11 +629,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -672,6 +672,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
                 (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -738,7 +741,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         mask = None
         noise = None
 
-        if init_image is None:
+        if image is None:
             # get the initial random noise unless the user supplied it
 
             # Unlike in other pipelines, latents need to be generated in the target device
@@ -777,11 +780,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(init_image, PIL.Image.Image):
-                init_image = preprocess_image(init_image)
+            if isinstance(image, PIL.Image.Image):
+                image = preprocess_image(image)
             # encode the init image into latents and scale the latents
-            init_image = init_image.to(device=self.device, dtype=latents_dtype)
-            init_latent_dist = self.vae.encode(init_image).latent_dist
+            image = image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(image).latent_dist
             init_latents = init_latent_dist.sample(generator=generator)
             init_latents = 0.18215 * init_latents
             init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
@@ -796,7 +799,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
 
             # check sizes
             if not mask.shape == init_latents.shape:
-                raise ValueError("The mask and init_image should be the same size!")
+                raise ValueError("The mask and image should be the same size!")
 
             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -985,7 +988,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
 
     def img2img(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -1004,7 +1007,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for image-to-image generation.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -1013,11 +1016,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -1059,7 +1062,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -1076,7 +1079,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
 
     def inpaint(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1096,11 +1099,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for inpaint.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1112,7 +1115,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1155,7 +1158,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
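
Backward-compatibility note: the three lines added at the top of `__call__` pop the old keyword via `deprecate("init_image", "0.12.0", message, take_from=kwargs)` and route it into the new parameter with `image = init_image or image`, so direct calls that still pass `init_image` keep working until 0.12.0 but emit a deprecation warning. A hedged sketch of the two call styles during the transition, assuming `pipe` and `init` from the sketch above and that `__call__` still accepts `**kwargs` (which `take_from=kwargs` implies):

    # Preferred spelling after this commit:
    out = pipe(prompt="a watercolor landscape", image=init, strength=0.75)

    # Deprecated spelling, still resolved by the shim (warns, scheduled for removal in 0.12.0):
    out = pipe(prompt="a watercolor landscape", init_image=init, strength=0.75)

    out.images[0].save("out.png")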