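"""Grayscale image inpainting built on the Stable Diffusion 2 inpainting pipeline.

Pairs a segmentation U-Net (which locates the damaged regions) with the SD 2
inpainting UNet/VAE, replacing the text prompt with a learned embedding.
"""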
import torch
import torch.nn as nn
import torch.nn.functional as F

from copy import deepcopy
from torchvision.transforms.functional import rgb_to_grayscale

import segmentation_models_pytorch as smp
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils.torch_utils import randn_tensor
from transformers import PretrainedConfig, PreTrainedModel


class SDGrayInpaintConfig(PretrainedConfig):
    model_type = "sd_gray_inpaint"

    def __init__(
        self,
        base_model="stabilityai/stable-diffusion-2-inpainting",
        height=512,
        width=512,
        **kwargs,
    ):
        self.base_model = base_model
        self.height = height
        self.width = width
        super().__init__(**kwargs)


class SDGrayInpaintModel(PreTrainedModel):
    config_class = SDGrayInpaintConfig

    def __init__(self, config):
        super().__init__(config)
        # Load the pretrained inpainting pipeline once and keep only the
        # components this model needs.
        pipe = StableDiffusionInpaintPipeline.from_pretrained(config.base_model)
        # Segmentation U-Net that locates the damaged (to-be-inpainted) region.
        self.mask_predictor = smp.Unet(
            encoder_name="mit_b4",
            encoder_weights="imagenet",
            in_channels=3,
            classes=1,
        )
        self.image_processor = pipe.image_processor
        self.scheduler = pipe.scheduler
        self.unet = pipe.unet
        self.vae = pipe.vae
        # Learned replacement for the text-encoder prompt embeddings
        # (77 tokens x 1024 dims, the SD 2.x cross-attention width).
        self.prompt_embeds = nn.Parameter(torch.randn(1, 77, 1024))
        self.height = config.height
        self.width = config.width

    def forward(
        self,
        images_gray_masked,
        masks=None,
        num_inference_steps=250,
        seed=42,
        input_type='pil',
        output_type='pil',
    ):
        """Inpaint `images_gray_masked` and return the restored grayscale image(s).

        If `masks` is None, the damage mask is predicted by `self.mask_predictor`.
        """
        generator = torch.Generator()
        generator.manual_seed(seed)

        if input_type == 'pil':
            images_gray_masked = self.image_processor.preprocess(
                images_gray_masked, height=self.height, width=self.width
            ).float()
        elif input_type == 'pt':
            pass  # already a preprocessed tensor in [-1, 1]
        else:
            raise ValueError(f"unsupported input_type: {input_type}")
        images_gray_masked = images_gray_masked.to(self.vae.device)

        # Predict the damage mask when the caller does not supply one.
        if masks is None:
            masks_logits = self.mask_predictor(images_gray_masked)
            masks = (torch.sigmoid(masks_logits) > 0.5).float()
        masks = masks.float().to(self.vae.device)
        # Zero out the masked region so the VAE never sees corrupted pixels.
        images_gray_masked = (1 - masks) * images_gray_masked

        B, C, H, W = images_gray_masked.shape
        prompt_embeds = self.prompt_embeds.repeat(B, 1, 1)
        # Work on a private copy of the scheduler so repeated or concurrent
        # forward calls don't clobber each other's timestep state.
        scheduler = deepcopy(self.scheduler)
        scheduler.set_timesteps(num_inference_steps=num_inference_steps, device=self.vae.device)

        # Encode the masked image and downsample the mask to latent resolution.
        masked_image_latents = self.vae.encode(images_gray_masked).latent_dist.mode() * self.vae.config.scaling_factor
        mask_latents = F.interpolate(masks, size=(self.unet.config.sample_size, self.unet.config.sample_size))
        latents = randn_tensor(masked_image_latents.shape, generator=generator).to(self.device) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            # Scale only the model input; `latents` itself must stay unscaled
            # when passed to scheduler.step().
            scaled_latents = scheduler.scale_model_input(latents, t)
            # The 9-channel inpainting UNet expects noisy latents, the mask,
            # and the masked-image latents concatenated along channels.
            latent_model_input = torch.cat([scaled_latents, mask_latents, masked_image_latents], dim=1)
            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds)[0]
            latents = scheduler.step(noise_pred, t, latents)[0]

        latents = latents / self.vae.config.scaling_factor
        images_gray_restored = self.vae.decode(latents.detach())[0]
        # Keep the known pixels; take only the inpainted region from the decoder.
        images_gray_restored = images_gray_masked * (1 - masks) + images_gray_restored.detach() * masks
        images_gray_restored = rgb_to_grayscale(images_gray_restored)
        if output_type == 'none':
            pass  # return the raw composited tensor in [-1, 1]
        elif output_type in ('pil', 'np', 'pt'):
            images_gray_restored = self.image_processor.postprocess(images_gray_restored, output_type)
        else:
            raise ValueError(f"unsupported output_type: {output_type}")

        return images_gray_restored
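

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module proper. Assumptions:
    # Pillow is installed, pretrained weights download on first use, and
    # "damaged.png" is a hypothetical grayscale photo with missing regions.
    from PIL import Image

    model = SDGrayInpaintModel(SDGrayInpaintConfig()).eval()
    image_gray_masked = Image.open("damaged.png").convert("RGB")
    with torch.no_grad():
        restored = model(image_gray_masked, num_inference_steps=50)
    # output_type='pil' returns a list of PIL images.
    restored[0].save("restored.png")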