AlanB committed
Commit 67f4e7b · Parent(s): e97d8f2

Replaced init_image with image to match git change.
Files changed (1): pipeline.py (+26 -23)
pipeline.py CHANGED
@@ -579,7 +579,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -607,11 +607,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -629,11 +629,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
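As a quick worked example of the `strength` semantics documented above: `strength` decides how many of the `num_inference_steps` denoising steps actually run on the encoded image. A minimal sketch of the usual img2img calculation, for intuition only; the pipeline's exact code lives further down in pipeline.py and uses the scheduler's `steps_offset`:

# Illustrative sketch, not the pipeline's exact code: how `strength` usually
# maps to the number of denoising steps that actually run in img2img.
num_inference_steps = 50
strength = 0.8
offset = 0  # e.g. self.scheduler.config.get("steps_offset", 0)

init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)
t_start = max(num_inference_steps - init_timestep + offset, 0)

print(init_timestep, t_start)  # 40 10 -> roughly 40 of the 50 steps are executed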
@@ -672,6 +672,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
                 (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image

         if isinstance(prompt, str):
             batch_size = 1
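The three lines added right after the docstring are a backward-compatibility shim: a caller that still passes `init_image=` as a keyword argument gets a deprecation warning and the value is rerouted to `image`, until the old name is removed in 0.12.0. A standalone sketch of the same pattern, assuming nothing about `diffusers.utils.deprecate` beyond what the hunk shows; `pop_deprecated` is an illustrative helper, not part of the library:

import warnings

def pop_deprecated(kwargs, old_name, new_value, removed_in, message):
    # Return the value passed under the deprecated name, else the new value.
    if old_name in kwargs:
        warnings.warn(
            f"`{old_name}` is deprecated and will be removed in {removed_in}. {message}",
            FutureWarning,
        )
        return kwargs.pop(old_name)
    return new_value

# e.g. inside __call__:
# image = pop_deprecated(kwargs, "init_image", image, "0.12.0",
#                        "Please use `image` instead of `init_image`.")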
@@ -738,7 +741,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         mask = None
         noise = None

-        if init_image is None:
+        if image is None:
             # get the initial random noise unless the user supplied it

             # Unlike in other pipelines, latents need to be generated in the target device
@@ -777,11 +780,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(init_image, PIL.Image.Image):
-                init_image = preprocess_image(init_image)
+            if isinstance(image, PIL.Image.Image):
+                image = preprocess_image(image)
             # encode the init image into latents and scale the latents
-            init_image = init_image.to(device=self.device, dtype=latents_dtype)
-            init_latent_dist = self.vae.encode(init_image).latent_dist
+            image = image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(image).latent_dist
             init_latents = init_latent_dist.sample(generator=generator)
             init_latents = 0.18215 * init_latents
             init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
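For context, the `else` branch above is the standard img2img latent initialization: the (preprocessed) `image` is encoded by the VAE, the sampled latents are scaled by the Stable Diffusion factor 0.18215, and the result is tiled to the batch size before noise is added according to `strength`. A self-contained sketch of the same idea, assuming a loaded `AutoencoderKL` is passed in as `vae` and `image` is already a `(1, 3, H, W)` tensor in [-1, 1]:

import torch

@torch.no_grad()
def encode_init_latents(vae, image, batch_size, num_images_per_prompt, generator=None):
    # Encode the reference image into the VAE latent space.
    latent_dist = vae.encode(image).latent_dist
    init_latents = latent_dist.sample(generator=generator)
    # Scale into the range the UNet was trained on (SD latent scaling factor).
    init_latents = 0.18215 * init_latents
    # Repeat so every image in the batch starts from the same reference latents.
    return torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)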
@@ -796,7 +799,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

                 # check sizes
                 if not mask.shape == init_latents.shape:
-                    raise ValueError("The mask and init_image should be the same size!")
+                    raise ValueError("The mask and image should be the same size!")

             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -985,7 +988,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def img2img(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -1004,7 +1007,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for image-to-image generation.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -1013,11 +1016,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -1059,7 +1062,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
             strength=strength,
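With the rename in place, an image-to-image call passes the picture as `image`. A usage sketch, assuming the file is loaded as a custom pipeline via `custom_pipeline` (the model id, pipeline id and file names below are illustrative, and the output is assumed to follow the standard `StableDiffusionPipelineOutput` with an `.images` list):

import torch
from PIL import Image
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",        # illustrative base model
    custom_pipeline="lpw_stable_diffusion",  # the long-prompt-weighting pipeline
    torch_dtype=torch.float16,
).to("cuda")

init = Image.open("sketch.png").convert("RGB").resize((512, 512))  # illustrative input

result = pipe.img2img(
    image=init,  # was `init_image` before this commit
    prompt="a watercolor painting of a lighthouse at dawn",
    strength=0.75,
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("out.png")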
@@ -1076,7 +1079,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def inpaint(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1096,11 +1099,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for inpaint.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1112,7 +1115,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1155,7 +1158,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             mask_image=mask_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
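The same applies to inpainting: the base picture is now `image`, and `mask_image` marks what gets repainted (white pixels are replaced, black pixels are preserved, per the docstring above). A usage sketch reusing the `pipe` loaded in the previous example; file names are illustrative:

from PIL import Image

base = Image.open("room.png").convert("RGB").resize((512, 512))     # illustrative
mask = Image.open("room_mask.png").convert("L").resize((512, 512))  # white = area to repaint

result = pipe.inpaint(
    image=base,  # was `init_image` before this commit
    mask_image=mask,
    prompt="a leather armchair next to the window",
    strength=0.8,
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("room_inpainted.png")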
 