import gc import json import webcolors import spaces import gradio as gr import os.path as osp from copy import deepcopy from PIL import Image, ImageDraw, ImageFont import torch from diffusers import UNet2DConditionModel, AutoencoderKL from diffusers.models.attention import BasicTransformerBlock from peft import LoraConfig from peft.utils import set_peft_model_state_dict from transformers import PretrainedConfig from diffusers import DPMSolverMultistepScheduler from glyph_sdxl.utils import ( parse_config, UNET_CKPT_NAME, huggingface_cache_dir, load_byt5_and_byt5_tokenizer, BYT5_MAPPER_CKPT_NAME, INSERTED_ATTN_CKPT_NAME, BYT5_CKPT_NAME, PromptFormat, MultilingualPromptFormat, ) from glyph_sdxl.custom_diffusers import ( StableDiffusionGlyphXLPipeline, CrossAttnInsertBasicTransformerBlock, ) from glyph_sdxl.modules import T5EncoderBlockByT5Mapper from demo.constants import MAX_TEXT_BOX state = 0 stack = [] multilingual_state = 0 multilingual_stack = [] font = ImageFont.truetype("assets/Arial.ttf", 20) device = "cuda" pipeline = None pipeline_multilingual = None prompt_format = PromptFormat() multilingual_prompt_format = MultilingualPromptFormat() multilingual_code_dict = { 'cn': 'Chinese', 'en': 'English', 'fr': 'French', 'de': 'German', 'es': 'Spanish', 'it': 'Italian', 'pt': 'Portuguese', 'ru': 'Russian', 'jp': 'Japanese', 'kr': 'Korean', } multilingual_reverse_code_dict = { 'Chinese': 'cn', 'English': 'en', 'French': 'en', 'German': 'en', 'Spanish': 'en', 'Italian': 'en', 'Portuguese': 'en', 'Russian': 'en', 'Japanese': 'jp', 'Korean': 'kr', } multilingual_font_dict = {} multilingual_meta_path = 'assets/multi_fonts' for code in multilingual_code_dict: with open(osp.join(multilingual_meta_path, f"{code}.json"), 'r') as f: lang_font_list = json.load(f) multilingual_font_dict[code] = lang_font_list def flush(): gc.collect() torch.cuda.empty_cache() def import_model_class_from_model_name_or_path( pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder", ): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder=subfolder, revision=revision, ) model_class = text_encoder_config.architectures[0] if model_class == "CLIPTextModel": from transformers import CLIPTextModel return CLIPTextModel elif model_class == "CLIPTextModelWithProjection": from transformers import CLIPTextModelWithProjection return CLIPTextModelWithProjection else: raise ValueError(f"{model_class} is not supported.") def init_pipeline(): global pipeline global pipeline_multilingual config = parse_config('configs/glyph_sdxl_albedo.py') ckpt_dir = 'checkpoints/glyph-sdxl' config_multilingual = parse_config('configs/glyph_sdxl_multilingual_albedo.py') ckpt_dir_multilingual = 'checkpoints/glyph-sdxl_multilingual_10-lang' text_encoder_cls_one = import_model_class_from_model_name_or_path( config.pretrained_model_name_or_path, config.revision, ) text_encoder_cls_two = import_model_class_from_model_name_or_path( config.pretrained_model_name_or_path, config.revision, subfolder="text_encoder_2", ) text_encoder_one = text_encoder_cls_one.from_pretrained( config.pretrained_model_name_or_path, subfolder="text_encoder", revision=config.revision, cache_dir=huggingface_cache_dir, ) text_encoder_two = text_encoder_cls_two.from_pretrained( config.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=config.revision, cache_dir=huggingface_cache_dir, ) unet = UNet2DConditionModel.from_pretrained( config.pretrained_model_name_or_path, subfolder="unet", revision=config.revision, cache_dir=huggingface_cache_dir, ) unet_multilingual = UNet2DConditionModel.from_pretrained( config_multilingual.pretrained_model_name_or_path, subfolder="unet", revision=config.revision, cache_dir=huggingface_cache_dir, ) vae_path = ( config.pretrained_model_name_or_path if config.pretrained_vae_model_name_or_path is None else config.pretrained_vae_model_name_or_path ) vae = AutoencoderKL.from_pretrained( vae_path, subfolder="vae" if config.pretrained_vae_model_name_or_path is None else None, revision=config.revision, cache_dir=huggingface_cache_dir, ) byt5_model, byt5_tokenizer = load_byt5_and_byt5_tokenizer( **config.byt5_config, huggingface_cache_dir=huggingface_cache_dir, ) byt5_model_multilingual, byt5_tokenizer_multilingual = load_byt5_and_byt5_tokenizer( **config_multilingual.byt5_config, huggingface_cache_dir=huggingface_cache_dir, ) inference_dtype = torch.float32 if config.inference_dtype == "fp16": inference_dtype = torch.float16 elif config.inference_dtype == "bf16": inference_dtype = torch.bfloat16 inserted_new_modules_para_set = set() for name, module in unet.named_modules(): if isinstance(module, BasicTransformerBlock) and name in config.attn_block_to_modify: parent_module = unet for n in name.split(".")[:-1]: parent_module = getattr(parent_module, n) new_block = CrossAttnInsertBasicTransformerBlock.from_transformer_block( module, byt5_model.config.d_model if config.byt5_mapper_config.sdxl_channels is None else config.byt5_mapper_config.sdxl_channels, ) new_block.requires_grad_(False) for inserted_module_name, inserted_module in zip( new_block.get_inserted_modules_names(), new_block.get_inserted_modules() ): inserted_module.requires_grad_(True) for para_name, para in inserted_module.named_parameters(): para_key = name + '.' + inserted_module_name + '.' + para_name assert para_key not in inserted_new_modules_para_set inserted_new_modules_para_set.add(para_key) for origin_module in new_block.get_origin_modules(): origin_module.to(dtype=inference_dtype) parent_module.register_module(name.split(".")[-1], new_block) print(f"inserted cross attn block to {name}") inserted_new_modules_para_set_multilingual = set() for name, module in unet_multilingual.named_modules(): if isinstance(module, BasicTransformerBlock) and name in config_multilingual.attn_block_to_modify: parent_module = unet_multilingual for n in name.split(".")[:-1]: parent_module = getattr(parent_module, n) new_block = CrossAttnInsertBasicTransformerBlock.from_transformer_block( module, byt5_model.config.d_model if config_multilingual.byt5_mapper_config.sdxl_channels is None else config_multilingual.byt5_mapper_config.sdxl_channels, ) new_block.requires_grad_(False) for inserted_module_name, inserted_module in zip( new_block.get_inserted_modules_names(), new_block.get_inserted_modules() ): inserted_module.requires_grad_(True) for para_name, para in inserted_module.named_parameters(): para_key = name + '.' + inserted_module_name + '.' + para_name assert para_key not in inserted_new_modules_para_set_multilingual inserted_new_modules_para_set_multilingual.add(para_key) for origin_module in new_block.get_origin_modules(): origin_module.to(dtype=inference_dtype) parent_module.register_module(name.split(".")[-1], new_block) print(f"inserted cross attn block to {name}") byt5_mapper_dict = [T5EncoderBlockByT5Mapper] byt5_mapper_dict = {mapper.__name__: mapper for mapper in byt5_mapper_dict} byt5_mapper = byt5_mapper_dict[config.byt5_mapper_type]( byt5_model.config, **config.byt5_mapper_config, ) byt5_mapper_multilingual = byt5_mapper_dict[config_multilingual.byt5_mapper_type]( byt5_model.config, **config_multilingual.byt5_mapper_config, ) unet_lora_target_modules = [ "attn1.to_k", "attn1.to_q", "attn1.to_v", "attn1.to_out.0", "attn2.to_k", "attn2.to_q", "attn2.to_v", "attn2.to_out.0", ] unet_lora_config = LoraConfig( r=config.unet_lora_rank, lora_alpha=config.unet_lora_rank, init_lora_weights="gaussian", target_modules=unet_lora_target_modules, ) unet.add_adapter(unet_lora_config) unet_lora_config_multilingual = LoraConfig( r=config_multilingual.unet_lora_rank, lora_alpha=config_multilingual.unet_lora_rank, init_lora_weights="gaussian", target_modules=unet_lora_target_modules, ) unet_multilingual.add_adapter(unet_lora_config_multilingual) unet_lora_layers_para = torch.load(osp.join(ckpt_dir, UNET_CKPT_NAME), map_location='cpu') incompatible_keys = set_peft_model_state_dict(unet, unet_lora_layers_para, adapter_name="default") if getattr(incompatible_keys, 'unexpected_keys', []) == []: print(f"loaded unet_lora_layers_para") else: print(f"unet_lora_layers has unexpected_keys: {getattr(incompatible_keys, 'unexpected_keys', None)}") unet_lora_layers_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, UNET_CKPT_NAME), map_location='cpu') incompatible_keys = set_peft_model_state_dict(unet_multilingual, unet_lora_layers_para_multilingual, adapter_name="default") if getattr(incompatible_keys, 'unexpected_keys', []) == []: print(f"loaded unet_lora_layers_para_multilingual") else: print(f"unet_lora_layers_multilingual has unexpected_keys: {getattr(incompatible_keys, 'unexpected_keys', None)}") inserted_attn_module_paras = torch.load(osp.join(ckpt_dir, INSERTED_ATTN_CKPT_NAME), map_location='cpu') missing_keys, unexpected_keys = unet.load_state_dict(inserted_attn_module_paras, strict=False) assert len(unexpected_keys) == 0, unexpected_keys inserted_attn_module_paras_multilingual = torch.load(osp.join(ckpt_dir_multilingual, INSERTED_ATTN_CKPT_NAME), map_location='cpu') missing_keys, unexpected_keys = unet_multilingual.load_state_dict(inserted_attn_module_paras_multilingual, strict=False) assert len(unexpected_keys) == 0, unexpected_keys byt5_mapper_para = torch.load(osp.join(ckpt_dir, BYT5_MAPPER_CKPT_NAME), map_location='cpu') byt5_mapper.load_state_dict(byt5_mapper_para) byt5_mapper_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, BYT5_MAPPER_CKPT_NAME), map_location='cpu') byt5_mapper_multilingual.load_state_dict(byt5_mapper_para_multilingual) byt5_model_para = torch.load(osp.join(ckpt_dir, BYT5_CKPT_NAME), map_location='cpu') byt5_model.load_state_dict(byt5_model_para) byt5_model_para_multilingual = torch.load(osp.join(ckpt_dir_multilingual, BYT5_CKPT_NAME), map_location='cpu') byt5_model_multilingual.load_state_dict(byt5_model_para_multilingual) pipeline = StableDiffusionGlyphXLPipeline.from_pretrained( config.pretrained_model_name_or_path, vae=vae, text_encoder=text_encoder_one, text_encoder_2=text_encoder_two, byt5_text_encoder=byt5_model, byt5_tokenizer=byt5_tokenizer, byt5_mapper=byt5_mapper, unet=unet, byt5_max_length=config.byt5_max_length, revision=config.revision, torch_dtype=inference_dtype, safety_checker=None, cache_dir=huggingface_cache_dir, ) pipeline.scheduler = DPMSolverMultistepScheduler.from_pretrained( config.pretrained_model_name_or_path, subfolder="scheduler", use_karras_sigmas=True, ) pipeline_multilingual = StableDiffusionGlyphXLPipeline.from_pretrained( config_multilingual.pretrained_model_name_or_path, vae=vae, text_encoder=text_encoder_one, text_encoder_2=text_encoder_two, byt5_text_encoder=byt5_model_multilingual, byt5_tokenizer=byt5_tokenizer_multilingual, byt5_mapper=byt5_mapper_multilingual, unet=unet_multilingual, byt5_max_length=config_multilingual.byt5_max_length, revision=config_multilingual.revision, torch_dtype=inference_dtype, safety_checker=None, cache_dir=huggingface_cache_dir, ) pipeline_multilingual.scheduler = DPMSolverMultistepScheduler.from_pretrained( config_multilingual.pretrained_model_name_or_path, subfolder="scheduler", use_karras_sigmas=True, ) # move to gpu if config.pretrained_vae_model_name_or_path is None: vae = vae.to(device, dtype=torch.float32) else: vae = vae.to(device, dtype=inference_dtype) text_encoder_one = text_encoder_one.to(device, dtype=inference_dtype) text_encoder_two = text_encoder_two.to(device, dtype=inference_dtype) byt5_mapper = byt5_mapper.to(device) byt5_model = byt5_model.to(device) unet = unet.to(device, dtype=inference_dtype) pipeline = pipeline.to(device) byt5_mapper_multilingual = byt5_mapper_multilingual.to(device) byt5_model_multilingual = byt5_model_multilingual.to(device) unet_multilingual = unet_multilingual.to(device, dtype=inference_dtype) pipeline_multilingual = pipeline_multilingual.to(device) def get_pixels( box_sketch_template, evt: gr.SelectData ): global state global stack text_position = evt.index if state == 0: stack.append(text_position) state = 1 else: x, y = stack.pop() stack.append([x, y, text_position[0], text_position[1]]) state = 0 print(stack) box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i + 1), font=font, fill=text_color) draw.ellipse((leftUpPoint,rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i + 1), font=font, fill=text_color) draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0)) return box_sketch_template def get_pixels_multilingual( box_sketch_template, evt: gr.SelectData ): global multilingual_state global multilingual_stack text_position = evt.index if multilingual_state == 0: multilingual_stack.append(text_position) multilingual_state = 1 else: x, y = multilingual_stack.pop() multilingual_stack.append([x, y, text_position[0], text_position[1]]) multilingual_state = 0 print(multilingual_stack) box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(multilingual_stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i + 1), font=font, fill=text_color) draw.ellipse((leftUpPoint,rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i + 1), font=font, fill=text_color) draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0)) return box_sketch_template def exe_redo( box_sketch_template ): global state global stack state = 1 - state if len(stack[-1]) == 2: stack = stack[:-1] else: x, y, _, _ = stack[-1] stack = stack[:-1] + [[x, y]] box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i+1), font=font, fill=text_color) draw.ellipse((leftUpPoint, rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i+1), font=font, fill=text_color) draw.rectangle((x0,y0,x1,y1), outline=(255, 0, 0)) return box_sketch_template def exe_redo_multilingual( box_sketch_template ): global multilingual_state global multilingual_stack multilingual_state = 1 - multilingual_state if len(multilingual_stack[-1]) == 2: multilingual_stack = multilingual_stack[:-1] else: x, y, _, _ = multilingual_stack[-1] multilingual_stack = multilingual_stack[:-1] + [[x, y]] box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(multilingual_stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i+1), font=font, fill=text_color) draw.ellipse((leftUpPoint, rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i+1), font=font, fill=text_color) draw.rectangle((x0,y0,x1,y1), outline=(255, 0, 0)) return box_sketch_template def exe_undo( box_sketch_template ): global state global stack state = 0 stack = [] box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) return box_sketch_template def exe_undo_multilingual( box_sketch_template ): global multilingual_state global multilingual_stack multilingual_state = 0 multilingual_stack = [] box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) return box_sketch_template def process_box(): visibilities = [] for _ in range(MAX_TEXT_BOX + 1): visibilities.append(gr.update(visible=False)) for n in range(len(stack) + 1): visibilities[n] = gr.update(visible=True) # return [gr.update(visible=True), binary_matrixes, *visibilities, *colors] return [gr.update(visible=True), *visibilities] def process_box_multilingual(): visibilities = [] for _ in range(MAX_TEXT_BOX + 1): visibilities.append(gr.update(visible=False)) for n in range(len(multilingual_stack) + 1): visibilities[n] = gr.update(visible=True) # return [gr.update(visible=True), binary_matrixes, *visibilities, *colors] return [gr.update(visible=True), *visibilities] @torch.inference_mode() @spaces.GPU(enable_queue=True, duration=120) def generate_image(bg_prompt, bg_class, bg_tags, seed, cfg, *conditions): stack_cp = deepcopy(stack) print(f"conditions: {conditions}") # 1. parse input prompts = [] colors = [] font_type = [] bboxes = [] num_boxes = len(stack_cp) if len(stack_cp[-1]) == 4 else len(stack_cp) - 1 for i in range(num_boxes): prompts.append(conditions[i]) colors.append(conditions[i + MAX_TEXT_BOX]) font_type.append(conditions[i + MAX_TEXT_BOX * 2]) # 2. input check styles = [] if bg_prompt == "" or bg_prompt is None: raise gr.Error("Empty background prompt!") for i, (prompt, color, style) in enumerate(zip(prompts, colors, font_type)): if prompt == "" or prompt is None: raise gr.Error(f"Invalid prompt for text box {i + 1} !") if color is None: raise gr.Error(f"Invalid color for text box {i + 1} !") if style is None: raise gr.Error(f"Invalid style for text box {i + 1} !") bboxes.append( [ stack_cp[i][0] / 1024, stack_cp[i][1] / 1024, (stack_cp[i][2] - stack_cp[i][0]) / 1024, (stack_cp[i][3] - stack_cp[i][1]) / 1024, ] ) styles.append( { 'color': webcolors.name_to_hex(color), 'font-family': style, } ) # 3. format input if bg_class != "" and bg_class is not None: bg_prompt = bg_class + ". " + bg_prompt if bg_tags != "" and bg_tags is not None: bg_prompt += " Tags: " + bg_tags text_prompt = prompt_format.format_prompt(prompts, styles) print(f"bg_prompt: {bg_prompt}") print(f"text_prompt: {text_prompt}") # 4. inference if seed == -1: generator = torch.Generator(device=device) else: generator = torch.Generator(device=device).manual_seed(int(seed)) with torch.cuda.amp.autocast(): image = pipeline( prompt=bg_prompt, text_prompt=text_prompt, texts=prompts, bboxes=bboxes, num_inference_steps=50, guidance_scale=cfg, generator=generator, text_attn_mask=None, ).images[0] flush() return image @torch.inference_mode() @spaces.GPU(enable_queue=True, duration=120) def generate_image_multilingual(bg_prompt, bg_class, bg_tags, seed, cfg, *conditions): stack_cp = deepcopy(multilingual_stack) print(f"conditions: {conditions}") # 1. parse input prompts = [] colors = [] font_type = [] langs = [] bboxes = [] num_boxes = len(stack_cp) if len(stack_cp[-1]) == 4 else len(stack_cp) - 1 for i in range(num_boxes): prompts.append(conditions[i]) colors.append(conditions[i + MAX_TEXT_BOX]) lang = conditions[i + MAX_TEXT_BOX * 2].split(":")[0].strip() font = conditions[i + MAX_TEXT_BOX * 2].split(":")[1].strip() print(conditions[i + MAX_TEXT_BOX * 2], " ", lang, " ", font) langs.append(multilingual_reverse_code_dict[lang]) font_type.append(f'{multilingual_reverse_code_dict[lang]}-{font}') # 2. input check styles = [] if bg_prompt == "" or bg_prompt is None: raise gr.Error("Empty background prompt!") for i, (prompt, color, style) in enumerate(zip(prompts, colors, font_type)): if prompt == "" or prompt is None: raise gr.Error(f"Invalid prompt for text box {i + 1} !") if color is None: raise gr.Error(f"Invalid color for text box {i + 1} !") if style is None: raise gr.Error(f"Invalid style for text box {i + 1} !") bboxes.append( [ stack_cp[i][0] / 1024, stack_cp[i][1] / 1024, (stack_cp[i][2] - stack_cp[i][0]) / 1024, (stack_cp[i][3] - stack_cp[i][1]) / 1024, ] ) styles.append( { 'color': webcolors.name_to_hex(color), 'font-family': style, } ) # 3. format input if bg_class != "" and bg_class is not None: bg_prompt = bg_class + ". " + bg_prompt if bg_tags != "" and bg_tags is not None: bg_prompt += " Tags: " + bg_tags text_prompt = multilingual_prompt_format.format_prompt(prompts, styles) print(f"bg_prompt: {bg_prompt}") print(f"text_prompt: {text_prompt}") # 4. inference if seed == -1: generator = torch.Generator(device=device) else: generator = torch.Generator(device=device).manual_seed(int(seed)) with torch.cuda.amp.autocast(): image = pipeline_multilingual( prompt=bg_prompt, text_prompt=text_prompt, texts=prompts, bboxes=bboxes, num_inference_steps=50, guidance_scale=cfg, generator=generator, text_attn_mask=None, ).images[0] flush() return image def process_example(prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed, cfg): global stack, state colors = color_str.split(",") styles = style_str.split(";") boxes = box_str.split(";") prompts = text_str.split("**********") colors = [color.strip() for color in colors] styles = [style.strip() for style in styles] colors += [None] * (MAX_TEXT_BOX - len(colors)) styles += [None] * (MAX_TEXT_BOX - len(styles)) prompts += [""] * (MAX_TEXT_BOX - len(prompts)) state = 0 stack = [] print(boxes) for box in boxes: print(box) box = box.strip()[1:-1] print(box) box = box.split(",") print(box) x = eval(box[0].strip()) * 1024 y = eval(box[1].strip()) * 1024 w = eval(box[2].strip()) * 1024 h = eval(box[3].strip()) * 1024 stack.append([int(x), int(y), int(x + w + 0.5), int(y + h + 0.5)]) visibilities = [] for _ in range(MAX_TEXT_BOX + 1): visibilities.append(gr.update(visible=False)) for n in range(len(stack) + 1): visibilities[n] = gr.update(visible=True) box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i + 1), font=font, fill=text_color) draw.ellipse((leftUpPoint,rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i + 1), font=font, fill=text_color) draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0)) return [ gr.update(visible=True), box_sketch_template, seed, *visibilities, *colors, *styles, *prompts, ] def process_example_multilingual(prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed, cfg): global multilingual_stack, multilingual_state colors = color_str.split(",") styles = style_str.split(";") print(styles) boxes = box_str.split(";") prompts = text_str.split("**********") colors = [color.strip() for color in colors] styles = [style.strip() for style in styles] colors += [None] * (MAX_TEXT_BOX - len(colors)) styles += [None] * (MAX_TEXT_BOX - len(styles)) prompts += [""] * (MAX_TEXT_BOX - len(prompts)) multilingual_state = 0 multilingual_stack = [] print(boxes) for box in boxes: print(box) box = box.strip()[1:-1] print(box) box = box.split(",") print(box) x = eval(box[0].strip()) * 1024 y = eval(box[1].strip()) * 1024 w = eval(box[2].strip()) * 1024 h = eval(box[3].strip()) * 1024 multilingual_stack.append([int(x), int(y), int(x + w + 0.5), int(y + h + 0.5)]) visibilities = [] for _ in range(MAX_TEXT_BOX + 1): visibilities.append(gr.update(visible=False)) for n in range(len(multilingual_stack) + 1): visibilities[n] = gr.update(visible=True) box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255)) draw = ImageDraw.Draw(box_sketch_template) for i, text_position in enumerate(multilingual_stack): if len(text_position) == 2: x, y = text_position r = 4 leftUpPoint = (x-r, y-r) rightDownPoint = (x+r, y+r) text_color = (255, 0, 0) draw.text((x+2, y), str(i + 1), font=font, fill=text_color) draw.ellipse((leftUpPoint,rightDownPoint), fill='red') elif len(text_position) == 4: x0, y0, x1, y1 = text_position x0, x1 = min(x0, x1), max(x0, x1) y0, y1 = min(y0, y1), max(y0, y1) r = 4 leftUpPoint = (x0-r, y0-r) rightDownPoint = (x0+r, y0+r) text_color = (255, 0, 0) draw.text((x0+2, y0), str(i + 1), font=font, fill=text_color) draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0)) return [ gr.update(visible=True), box_sketch_template, seed, *visibilities, *colors, *styles, *prompts, ] def build_input_block(color_idx_list, font_idx_list, examples): with gr.Row(): with gr.Column(elem_id="main-image"): box_sketch_template = gr.Image( value=Image.new('RGB', (1024, 1024), (255, 255, 255)), sources=[], interactive=False, ) box_sketch_template.select(get_pixels, [box_sketch_template], [box_sketch_template]) with gr.Row(): redo = gr.Button(value='Redo - Cancel last point') undo = gr.Button(value='Undo - Clear the canvas') redo.click(exe_redo, [box_sketch_template], [box_sketch_template]) undo.click(exe_undo, [box_sketch_template], [box_sketch_template]) button_layout = gr.Button("(1) I've finished my layout!", elem_id="main_button", interactive=True) prompts = [] colors = [] styles = [] color_row = [None] * (MAX_TEXT_BOX + 1) with gr.Column(visible=False) as post_box: for n in range(MAX_TEXT_BOX + 1): if n == 0 : with gr.Row(visible=True) as color_row[n]: bg_prompt = gr.Textbox(label="Design prompt of background", value="") bg_class = gr.Textbox(label="Design type of background (optional)", value="") bg_tags = gr.Textbox(label="Design type of the background (optional)", value="") else: with gr.Row(visible=False) as color_row[n]: prompts.append(gr.Textbox(label="Prompt for box "+str(n))) colors.append(gr.Dropdown( label="Color for box "+str(n), choices=color_idx_list, )) styles.append(gr.Dropdown( label="Font type for box "+str(n), choices=font_idx_list, )) seed_ = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=-1, step=1) cfg_ = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=5) button_generate = gr.Button("(2) I've finished my texts, colors and styles, generate!", elem_id="main_button", interactive=True, variant='primary') button_layout.click(process_box, inputs=[], outputs=[post_box, *color_row]) with gr.Column(): output_image = gr.Image(label="Output Image", interactive=False) button_generate.click(generate_image, inputs=[bg_prompt, bg_class, bg_tags, seed_, cfg_, *(prompts + colors + styles)], outputs=[output_image], queue=True) with gr.Row(): # examples color_str = gr.Textbox(label="Color list", value="", visible=False) style_str = gr.Textbox(label="Font type list", value="", visible=False) box_str = gr.Textbox(label="Bbox list", value="", visible=False) text_str = gr.Textbox(label="Text list", value="", visible=False) prev_img = gr.Image(label="Preview", visible = False) gr.Examples( examples=examples, inputs=[ prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed_, cfg_ ], outputs=[post_box, box_sketch_template, seed_, *color_row, *colors, *styles, *prompts], fn=process_example, cache_examples=False, run_on_click=True, label='Examples', ) def build_input_block_multilingual(color_idx_list, font_idx_list, examples): with gr.Row(): with gr.Column(elem_id="main-image"): box_sketch_template = gr.Image( value=Image.new('RGB', (1024, 1024), (255, 255, 255)), sources=[], interactive=False, ) box_sketch_template.select(get_pixels_multilingual, [box_sketch_template], [box_sketch_template]) with gr.Row(): redo = gr.Button(value='Redo - Cancel last point') undo = gr.Button(value='Undo - Clear the canvas') redo.click(exe_redo_multilingual, [box_sketch_template], [box_sketch_template]) undo.click(exe_undo_multilingual, [box_sketch_template], [box_sketch_template]) button_layout = gr.Button("(1) I've finished my layout!", elem_id="main_button", interactive=True) prompts = [] colors = [] styles = [] color_row = [None] * (MAX_TEXT_BOX + 1) with gr.Column(visible=False) as post_box: for n in range(MAX_TEXT_BOX + 1): if n == 0 : with gr.Row(visible=True) as color_row[n]: bg_prompt = gr.Textbox(label="Design prompt of background", value="") bg_class = gr.Textbox(label="Design type of background (optional)", value="") bg_tags = gr.Textbox(label="Design type of the background (optional)", value="") else: with gr.Row(visible=False) as color_row[n]: prompts.append(gr.Textbox(label="Prompt for box "+str(n))) colors.append(gr.Dropdown( label="Color for box "+str(n), choices=color_idx_list, )) styles.append(gr.Dropdown( label="Font type for box "+str(n), choices=font_idx_list, )) seed_ = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=-1, step=1) cfg_ = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=5) button_generate = gr.Button("(2) I've finished my texts, colors and styles, generate!", elem_id="main_button", interactive=True, variant='primary') button_layout.click(process_box_multilingual, inputs=[], outputs=[post_box, *color_row]) with gr.Column(): output_image = gr.Image(label="Output Image", interactive=False) button_generate.click(generate_image_multilingual, inputs=[bg_prompt, bg_class, bg_tags, seed_, cfg_, *(prompts + colors + styles)], outputs=[output_image], queue=True) with gr.Row(): # examples color_str = gr.Textbox(label="Color list", value="", visible=False) style_str = gr.Textbox(label="Font type list", value="", visible=False) box_str = gr.Textbox(label="Bbox list", value="", visible=False) text_str = gr.Textbox(label="Text list", value="", visible=False) prev_img = gr.Image(label="Preview", visible = False) gr.Examples( examples=examples, inputs=[ prev_img, bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed_, cfg_ ], outputs=[post_box, box_sketch_template, seed_, *color_row, *colors, *styles, *prompts], fn=process_example_multilingual, cache_examples=False, run_on_click=True, label='Examples', ) def main(): init_pipeline() # load configs with open('assets/color_idx.json', 'r') as f: color_idx_dict = json.load(f) color_idx_list = list(color_idx_dict) with open('assets/font_idx_512.json', 'r') as f: eng_font_idx_dict = json.load(f) eng_font_idx_list = list(eng_font_idx_dict) multi_font_idx_list = [] for lang in multilingual_font_dict: with open(f'assets/multi_fonts/{lang}.json', 'r') as f: lang_font_list = json.load(f) for font in lang_font_list: font_name = font[0][3:] multi_font_idx_list.append(f"{multilingual_code_dict[lang]}: {font_name}") html = f"""

Glyph-ByT5: A Customized Text Encoder for Accurate Visual Text Rendering

Glyph-ByT5 Project Page |Glyph-ByT5-v2 Project Page | Glyph-ByT5 arXiv Paper |Glyph-ByT5-v2 arXiv Paper | Github

We present a basic version of Glyph-SDXL, and a multilingual version Glyph-SDXL-v2 supporting up to 10 languages: English, Chinese, French, German, Spanish, Portuguese, Italian, Russian, Japanese and Korean.

Note: due to limited capacity, we support 5000 chars in Chinese, 1148 chars in Japanese and 617 in Korean. Certain uncommon characters might not be supported for these three languages.

Models presented in this demo are all based on albedo-xl!

Try some examples at the bottom of the page to get started!

Quick Guide:

1. Select bounding boxes on the canvas on the left by clicking twice.

2. Click "Redo" if you want to cancel last point, "Undo" for clearing the canvas.

3. Click "I've finished my layout!" to start choosing specific prompts, colors and font-types.

4. Enter a design prompt for the background image. Optionally, you can choose to specify the design categories and tags (separated by a comma).

5. For each text box, enter the text prompts in the text box on the left, and select colors and font-types from the drop boxes on the right.

6. Click on "I've finished my texts, colors and styles, generate!" to start generating!.