'''
Modified from https://github.com/lllyasviel/Paints-UNDO/blob/main/gradio_app.py
'''
import functools

import spaces
import gradio as gr
import numpy as np
import cv2
import torch
from PIL import Image
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
from imgutils.metrics import lpips_difference
from imgutils.tagging import get_wd14_tags

from diffusers_helper.code_cond import unet_add_coded_conds
from diffusers_helper.cat_cond import unet_add_concat_conds
from diffusers_helper.k_diffusion import KDiffusionSampler
from diffusers_helper.attention import AttnProcessor2_0_xformers, XFORMERS_AVAIL
from lineart_models import MangaLineExtraction, LineartAnimeDetector, LineartDetector
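
# Resize the input so it fully covers the target resolution, then center-crop to that exact
# size. If no target_height is given, one is derived so the output keeps the original aspect
# ratio at roughly target_width**2 pixels.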
def resize_and_center_crop(
    image, target_width, target_height=None, interpolation=cv2.INTER_AREA
):
    original_height, original_width = image.shape[:2]
    if target_height is None:
        aspect_ratio = original_width / original_height
        target_pixel_count = target_width * target_width
        target_height = (target_pixel_count / aspect_ratio) ** 0.5
        target_width = target_height * aspect_ratio
    target_height = int(target_height)
    target_width = int(target_width)
    print(
        f"original_height={original_height}, "
        f"original_width={original_width}, "
        f"target_height={target_height}, "
        f"target_width={target_width}"
    )
    k = max(target_height / original_height, target_width / original_width)
    new_width = int(round(original_width * k))
    new_height = int(round(original_height * k))
    resized_image = cv2.resize(
        image, (new_width, new_height), interpolation=interpolation
    )
    x_start = (new_width - target_width) // 2
    y_start = (new_height - target_height) // 2
    cropped_image = resized_image[
        y_start : y_start + target_height, x_start : x_start + target_width
    ]
    return cropped_image
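
# UNet2DConditionModel variant whose config-time constructor wires in the two extra conditioning
# paths this checkpoint expects: 4 channels concatenated with the latent input (the encoded
# reference image) and one coded integer condition (the operation step).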
class ModifiedUNet(UNet2DConditionModel):
    @classmethod
    def from_config(cls, *args, **kwargs):
        m = super().from_config(*args, **kwargs)
        unet_add_concat_conds(unet=m, new_channels=4)
        unet_add_coded_conds(unet=m, added_number_count=1)
        return m
DEVICE = "cuda"

torch._dynamo.config.cache_size_limit = 256
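
# Three line-art extractors. Their outputs are shown in the UI and used in process() to rank
# the generated sketches by summed LPIPS distance.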
lineart_models = []

lineart_model = MangaLineExtraction("cuda", "./hf_download")
lineart_model.load_model()
lineart_model.model.to(device=DEVICE).eval()
lineart_models.append(lineart_model)

lineart_model = LineartAnimeDetector()
lineart_model.model.to(device=DEVICE).eval()
lineart_models.append(lineart_model)

lineart_model = LineartDetector()
lineart_model.model.to(device=DEVICE).eval()
lineart_models.append(lineart_model)
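
# Load the paints_undo_single_frame components: CLIP tokenizer/text encoder, VAE and the
# modified UNet. The text encoder and UNet run in fp16, the VAE in bf16.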
model_name = "lllyasviel/paints_undo_single_frame"

tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
    model_name, subfolder="tokenizer"
)
text_encoder: CLIPTextModel = (
    CLIPTextModel.from_pretrained(
        model_name,
        subfolder="text_encoder",
    )
    .to(dtype=torch.float16, device=DEVICE)
    .eval()
)
vae: AutoencoderKL = (
    AutoencoderKL.from_pretrained(
        model_name,
        subfolder="vae",
    )
    .to(dtype=torch.bfloat16, device=DEVICE)
    .eval()
)
unet: ModifiedUNet = (
    ModifiedUNet.from_pretrained(
        model_name,
        subfolder="unet",
    )
    .to(dtype=torch.float16, device=DEVICE)
    .eval()
)
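
# Use the xformers attention processor when available, otherwise fall back to PyTorch 2.0 SDPA.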
if XFORMERS_AVAIL:
    unet.set_attn_processor(AttnProcessor2_0_xformers())
    vae.set_attn_processor(AttnProcessor2_0_xformers())
else:
    unet.set_attn_processor(AttnProcessor2_0())
    vae.set_attn_processor(AttnProcessor2_0())

# text_encoder = torch.compile(text_encoder, backend="eager", dynamic=True)
# vae = torch.compile(vae, backend="eager", dynamic=True)
# unet = torch.compile(unet, mode="reduce-overhead", dynamic=True)
# for model in lineart_models:
#     model.model = torch.compile(model.model, backend="eager", dynamic=True)
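
# k-diffusion sampler around the UNet, using a linear beta schedule with 1000 training timesteps.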
k_sampler = KDiffusionSampler(
    unet=unet,
    timesteps=1000,
    linear_start=0.00085,
    linear_end=0.020,
    linear=True,
)
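
# Encode a prompt into the standard 77-token CLIP embedding (padded/truncated).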
def encode_cropped_prompt_77tokens(txt: str):
    cond_ids = tokenizer(
        txt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to(device=text_encoder.device)
    text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
    return text_cond
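
# Encode a prompt of up to `max_length` tokens. Prompts longer than CLIP's 77-token window are
# split into 75-token chunks, each chunk re-wrapped with BOS/EOS, encoded separately, and the
# hidden states concatenated along the sequence dimension.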
def encode_cropped_prompt(txt: str, max_length=225):
    cond_ids = tokenizer(
        txt,
        padding="max_length",
        max_length=max_length + 2,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to(device=text_encoder.device)
    if max_length + 2 > tokenizer.model_max_length:
        input_ids = cond_ids.squeeze(0)
        id_list = list(
            range(
                1,
                max_length + 2 - tokenizer.model_max_length + 2,
                tokenizer.model_max_length - 2,
            )
        )
        text_cond_list = []
        for i in id_list:
            ids_chunk = (
                input_ids[0].unsqueeze(0),
                input_ids[i : i + tokenizer.model_max_length - 2],
                input_ids[-1].unsqueeze(0),
            )
            if torch.all(ids_chunk[1] == tokenizer.pad_token_id):
                break
            text_cond = text_encoder(
                torch.concat(ids_chunk).unsqueeze(0)
            ).last_hidden_state
            if text_cond_list == []:
                text_cond_list.append(text_cond[:, :1])
            text_cond_list.append(text_cond[:, 1 : tokenizer.model_max_length - 1])
            text_cond_list.append(text_cond[:, -1:])
        text_cond = torch.concat(text_cond_list, dim=1)
    else:
        text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
    return text_cond.flatten(0, 1).unsqueeze(0)
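
# Conversions between torch tensors in [-1, 1] (channel-first) and uint8 channel-last numpy images.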
def pytorch2numpy(imgs):
    results = []
    for x in imgs:
        y = x.movedim(0, -1)
        y = y * 127.5 + 127.5
        y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        results.append(y)
    return results


def numpy2pytorch(imgs):
    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
    h = h.movedim(-1, 1)
    return h
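
# Auto-caption the uploaded image with WD14 tags: character tags first, then general tags, then
# the highest-scoring rating.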
def interrogator_process(x):
    img = Image.fromarray(x)
    rating, features, chars = get_wd14_tags(
        img, general_threshold=0.25, no_underline=True
    )
    result = ""
    for char in chars:
        result += char
        result += ", "
    for feature in features:
        result += feature
        result += ", "
    result += max(rating, key=rating.get)
    return result
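
# Generate "undo" key frames: encode the input image and prompt, sample latents at the requested
# operation steps, decode them, and return the results sorted by how closely they match the
# extracted line arts (lowest summed LPIPS first).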
@spaces.GPU  # assumed: the Space runs on ZeroGPU, so GPU inference must happen inside a decorated call
def process(
    input_fg,
    prompt,
    input_undo_steps,
    image_width,
    seed,
    steps,
    n_prompt,
    cfg,
    num_sets,
    progress=gr.Progress(),
):
    # Extract line art from the full-resolution input with every detector.
    lineart_fg = input_fg
    linearts = []
    for model in lineart_models:
        linearts.append(model(lineart_fg))

    # Resize the input and the line arts to the working resolution.
    fg = resize_and_center_crop(input_fg, image_width)
    for i, lineart in enumerate(linearts):
        lineart = resize_and_center_crop(lineart, fg.shape[1], fg.shape[0])
        linearts[i] = lineart

    # Encode the reference image into VAE latents; these are concatenated with the UNet input.
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = (
        vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    )

    # Encode the positive and negative prompts.
    conds = encode_cropped_prompt(prompt)
    unconds = encode_cropped_prompt_77tokens(n_prompt)
    print(conds.shape, unconds.shape)

    torch.cuda.empty_cache()

    # Sample key-frame latents for the requested operation steps.
    fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
    initial_latents = torch.zeros_like(concat_conds)
    concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
    rng = torch.Generator(device=DEVICE).manual_seed(int(seed))
    latents = (
        k_sampler(
            initial_latent=initial_latents,
            strength=1.0,
            num_inference_steps=steps,
            guidance_scale=cfg,
            batch_size=len(input_undo_steps) * num_sets,
            generator=rng,
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            cross_attention_kwargs={
                "concat_conds": concat_conds,
                "coded_conds": fs,
            },
            same_noise_in_batch=False,
            progress_tqdm=functools.partial(
                progress.tqdm, desc="Generating Key Frames"
            ),
        ).to(vae.dtype)
        / vae.config.scaling_factor
    )

    torch.cuda.empty_cache()

    # Decode the latents and rank the results by summed LPIPS distance to the line arts.
    pixels = torch.concat(
        [vae.decode(latent.unsqueeze(0)).sample for latent in latents]
    )
    pixels = pytorch2numpy(pixels)
    pixels_with_lpips = []
    lineart_pils = [Image.fromarray(lineart) for lineart in linearts]
    for pixel in pixels:
        pixel_pil = Image.fromarray(pixel)
        pixels_with_lpips.append(
            (
                sum(
                    [
                        lpips_difference(lineart_pil, pixel_pil)
                        for lineart_pil in lineart_pils
                    ]
                ),
                pixel,
            )
        )
    pixels = np.stack(
        [i[1] for i in sorted(pixels_with_lpips, key=lambda x: x[0])], axis=0
    )
    torch.cuda.empty_cache()
    return pixels, np.stack(linearts)
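
# Gradio UI: uploading an image auto-fills the prompt via WD14 tagging and enables the generate
# button, which runs process() and fills the sketch and line-art galleries.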
block = gr.Blocks().queue()
with block:
    gr.Markdown("# Sketch/Lineart extractor")
    with gr.Row():
        with gr.Column():
            input_fg = gr.Image(
                sources=["upload"], type="numpy", label="Image", height=384
            )
            with gr.Row():
                with gr.Column(scale=2, variant="compact"):
                    prompt = gr.Textbox(label="Output Prompt", interactive=True)
                with gr.Column(scale=1, variant="compact", min_width=160):
                    n_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="lowres, worst quality, bad anatomy, bad hands, text, extra digit, fewer digits, cropped, low quality, jpeg artifacts, signature, watermark, username",
                    )
            with gr.Row():
                input_undo_steps = gr.Dropdown(
                    label="Operation Steps",
                    value=[850, 875, 900, 925, 950, 975],
                    choices=list(range(0, 1000, 25)),
                    multiselect=True,
                )
                num_sets = gr.Slider(
                    label="Num Sets", minimum=1, maximum=10, value=4, step=1
                )
            with gr.Row():
                seed = gr.Slider(
                    label="Seed", minimum=0, maximum=50000, step=1, value=37462
                )
                image_width = gr.Slider(
                    label="Target size", minimum=512, maximum=1024, value=768, step=32
                )
                steps = gr.Slider(
                    label="Steps", minimum=1, maximum=32, value=16, step=1
                )
                cfg = gr.Slider(
                    label="CFG Scale", minimum=1.0, maximum=16, value=5, step=0.05
                )
            key_gen_button = gr.Button(value="Generate Sketch", interactive=False)
        with gr.Column():
            gr.Markdown("#### Sketch Outputs")
            result_gallery = gr.Gallery(
                height=384, object_fit="contain", label="Sketch Outputs", columns=4
            )
            gr.Markdown("#### Line Art Outputs")
            lineart_result = gr.Gallery(
                height=384,
                object_fit="contain",
                label="LineArt outputs",
            )

    input_fg.change(
        lambda x: [
            interrogator_process(x) if x is not None else "",
            gr.update(interactive=True),
        ],
        inputs=[input_fg],
        outputs=[prompt, key_gen_button],
    )
    key_gen_button.click(
        fn=process,
        inputs=[
            input_fg,
            prompt,
            input_undo_steps,
            image_width,
            seed,
            steps,
            n_prompt,
            cfg,
            num_sets,
        ],
        outputs=[result_gallery, lineart_result],
    ).then(
        lambda: gr.update(interactive=True),
        outputs=[key_gen_button],
    )

block.launch()