Text-to-Image · Diffusers · Safetensors

tolgacangoz committed · Commit a4686b9 · verified · 1 Parent(s): 225e8aa

Upload anytext.py

Files changed (1):
  auxiliary_latent_module/anytext.py (+131 −34)

auxiliary_latent_module/anytext.py CHANGED
@@ -25,6 +25,7 @@ import math
 import os
 import re
 import sys
+import unicodedata
 from functools import partial
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -33,9 +34,9 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from bert_tokenizer import BasicTokenizer
 from easydict import EasyDict as edict
 from frozen_clip_embedder_t3 import FrozenCLIPEmbedderT3
+from huggingface_hub import hf_hub_download
 from ocr_recog.RecModel import RecModel
 from PIL import Image, ImageDraw, ImageFont
 from safetensors.torch import load_file
@@ -66,12 +67,75 @@ from diffusers.utils import (
     scale_lora_layers,
     unscale_lora_layers,
 )
+from diffusers.utils.constants import HF_MODULES_CACHE
 from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
-from diffusers.configuration_utils import register_to_config, ConfigMixin
-from diffusers.models.modeling_utils import ModelMixin


-checker = BasicTokenizer()
+class Checker:
+    def __init__(self):
+        pass
+
+    def _is_chinese_char(self, cp):
+        """Checks whether `cp` is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as are Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or self._is_control(char):
+                continue
+            if self._is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_control(self, char):
+        """Checks whether `char` is a control character."""
+        # These are technically control characters, but we count them as whitespace
+        # characters.
+        if char == "\t" or char == "\n" or char == "\r":
+            return False
+        cat = unicodedata.category(char)
+        if cat in ("Cc", "Cf"):
+            return True
+        return False
+
+    def _is_whitespace(self, char):
+        """Checks whether `char` is a whitespace character."""
+        # \t, \n, and \r are technically control characters, but we treat them
+        # as whitespace since they are generally considered as such.
+        if char == " " or char == "\t" or char == "\n" or char == "\r":
+            return True
+        cat = unicodedata.category(char)
+        if cat == "Zs":
+            return True
+        return False
+
+
+checker = Checker()


 PLACE_HOLDER = "*"
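The inlined `Checker` drops the `bert_tokenizer` dependency while keeping only the helpers AnyText actually uses. A standalone sketch of the CJK-range check for sanity-testing the logic (the names below are illustrative, not from the commit):

```py
import unicodedata

# Standalone sketch mirroring Checker._is_chinese_char / _is_whitespace.
CJK_RANGES = [
    (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
    (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
    (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
]

def is_chinese_char(cp: int) -> bool:
    return any(lo <= cp <= hi for lo, hi in CJK_RANGES)

print(is_chinese_char(ord("文")))      # True: CJK Unified Ideograph
print(is_chinese_char(ord("A")))       # False: Latin
print(unicodedata.category("\u00A0"))  # 'Zs', so _is_whitespace treats NBSP as whitespace
```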
@@ -81,18 +145,22 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> from pipeline_anytext import AnyTextPipeline
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
         >>> from anytext_controlnet import AnyTextControlNetModel
         >>> from diffusers import DDIMScheduler
        >>> from diffusers.utils import load_image
-        >>> import torch
+
+        >>> # A font file shared by an HF staff member:
+        >>> !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf

         >>> # load control net and stable diffusion v1-5
-        >>> text_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
-        ...                                                          variant="fp16",)
-        >>> pipe = AnyTextPipeline.from_pretrained("tolgacangoz/anytext", controlnet=text_controlnet,
-        ...                                        torch_dtype=torch.float16, variant="fp16",
-        ...                                        ).to("cuda")
+        >>> anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
+        ...                                                             variant="fp16",)
+        >>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial-unicode-ms.ttf",
+        ...                                          controlnet=anytext_controlnet, torch_dtype=torch.float16,
+        ...                                          trust_remote_code=True,
+        ...                                          ).to("cuda")

         >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
         >>> # uncomment the following line if PyTorch>=2.0 is not installed, for memory optimization
@@ -103,11 +171,9 @@ EXAMPLE_DOC_STRING = """
         >>> #pipe.enable_model_cpu_offload()

         >>> # generate image
-        >>> generator = torch.Generator("cpu").manual_seed(66273235)
         >>> prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
-        >>> draw_pos = load_image("www.huggingface.co/a/AnyText/tree/main/examples/gen9.png")
-        >>> image = pipe(prompt, num_inference_steps=20, generator=generator, mode="generate",
-        ...              draw_pos=draw_pos,
+        >>> draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
+        >>> image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
         ...              ).images[0]
         >>> image
         ```
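The updated example no longer seeds a generator; for reproducible outputs one can still be passed through, as the removed line did:

```py
import torch

# Optional reproducibility sketch; the seed is the one the old example hard-coded.
generator = torch.Generator("cpu").manual_seed(66273235)
# image = pipe(prompt, num_inference_steps=20, generator=generator,
#              mode="generate", draw_pos=draw_pos).images[0]
```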
@@ -152,7 +218,12 @@ class EmbeddingManager(nn.Module):
         self.token_dim = token_dim

         self.proj = nn.Linear(40 * 64, token_dim)
-        # self.proj.load_state_dict(load_file("proj.safetensors", device=str(embedder.device)))
+        proj_dir = hf_hub_download(
+            repo_id="tolgacangoz/anytext",
+            filename="text_embedding_module/proj.safetensors",
+            cache_dir=HF_MODULES_CACHE,
+        )
+        self.proj.load_state_dict(load_file(proj_dir, device=str(embedder.device)))
         if use_fp16:
             self.proj = self.proj.to(dtype=torch.float16)
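The previously commented-out local load becomes a real Hub download. The same fetch-then-load pattern as a self-contained sketch; `768` stands in for `token_dim` and is an assumption:

```py
import torch.nn as nn
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Download once into the local HF cache (repeat calls reuse the cached file),
# then load the weights; repo_id/filename are copied from the hunk above.
proj = nn.Linear(40 * 64, 768)  # 768 = assumed token_dim
path = hf_hub_download(
    repo_id="tolgacangoz/anytext",
    filename="text_embedding_module/proj.safetensors",
)
proj.load_state_dict(load_file(path, device="cpu"))
```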
@@ -269,9 +340,14 @@ def crop_image(src_img, mask):


 def create_predictor(model_dir=None, model_lang="ch", device="cpu", use_fp16=False):
-    model_file_path = model_dir
-    if model_file_path is not None and not os.path.exists(model_file_path):
-        raise ValueError("not find model file path {}".format(model_file_path))
+    if model_dir is None or not os.path.exists(model_dir):
+        model_dir = hf_hub_download(
+            repo_id="tolgacangoz/anytext",
+            filename="text_embedding_module/OCR/ppv3_rec.pth",
+            cache_dir=HF_MODULES_CACHE,
+        )
+    if not os.path.exists(model_dir):
+        raise ValueError("Could not find model file at {}".format(model_dir))

     if model_lang == "ch":
         n_class = 6625
@@ -287,8 +363,8 @@ def create_predictor(model_dir=None, model_lang="ch", device="cpu", use_fp16=Fal
     )

     rec_model = RecModel(rec_config)
-    if model_file_path is not None:
-        rec_model.load_state_dict(torch.load(model_file_path, map_location=device))
+    state_dict = torch.load(model_dir, map_location=device)
+    rec_model.load_state_dict(state_dict)
     return rec_model

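With the fallback above, `create_predictor` no longer needs a local checkpoint; a `None` or missing `model_dir` resolves to the Hub file before loading. A hedged usage sketch:

```py
# Usage sketch: with model_dir=None the OCR weights are fetched from
# tolgacangoz/anytext (text_embedding_module/OCR/ppv3_rec.pth) and loaded.
predictor = create_predictor(model_dir=None, model_lang="ch", device="cpu")
predictor = predictor.eval()  # inference mode, as TextEmbeddingModule does
```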
@@ -401,7 +477,7 @@ class TextRecognizer(object):
             preds["ctc"] = torch.from_numpy(outputs[0])
             preds["ctc_neck"] = [torch.zeros(1)] * img_num
         else:
-            preds = self.predictor(norm_img_batch)
+            preds = self.predictor(norm_img_batch.to(next(self.predictor.parameters()).device))
         for rno in range(preds["ctc"].shape[0]):
             preds_all[indices[beg_img_no + rno]] = preds["ctc"][rno]
             preds_neck_all[indices[beg_img_no + rno]] = preds["ctc_neck"][rno]
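The fix routes the batch to whatever device the recognizer's weights already occupy, via the common `next(model.parameters()).device` idiom. A minimal, self-contained illustration:

```py
import torch
import torch.nn as nn

# Move the input to the device of the model's first parameter before calling it.
model = nn.Linear(4, 2)
batch = torch.randn(3, 4)
out = model(batch.to(next(model.parameters()).device))
print(out.shape)  # torch.Size([3, 2])
```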
@@ -450,21 +526,28 @@ class TextRecognizer(object):
         return loss


-class TextEmbeddingModule(ModelMixin, ConfigMixin):
-    @register_to_config
+class TextEmbeddingModule(nn.Module):
     def __init__(self, font_path, use_fp16=False, device="cpu"):
         super().__init__()
         # TODO: Learn if the recommended font file is free to use
         self.font = ImageFont.truetype(font_path, 60)
+        self.use_fp16 = use_fp16
+        self.device = device
         self.frozen_CLIP_embedder_t3 = FrozenCLIPEmbedderT3(device=device, use_fp16=use_fp16)
         self.embedding_manager = EmbeddingManager(self.frozen_CLIP_embedder_t3, use_fp16=use_fp16)
-        rec_model_dir = "./OCR/ppv3_rec.pth"
+        rec_model_dir = "./text_embedding_module/OCR/ppv3_rec.pth"
         self.text_predictor = create_predictor(rec_model_dir, device=device, use_fp16=use_fp16).eval()
         args = {}
         args["rec_image_shape"] = "3, 48, 320"
         args["rec_batch_num"] = 6
-        args["rec_char_dict_path"] = "OCR/ppocr_keys_v1.txt"
-        args["use_fp16"] = self.use_fp16
+        args["rec_char_dict_path"] = hf_hub_download(
+            repo_id="tolgacangoz/anytext",
+            filename="text_embedding_module/OCR/ppocr_keys_v1.txt",
+            cache_dir=HF_MODULES_CACHE,
+        )
+        args["use_fp16"] = use_fp16
         self.embedding_manager.recog = TextRecognizer(args, self.text_predictor)

     @torch.no_grad()
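`TextEmbeddingModule` is now a plain `nn.Module`: without `ModelMixin` there is no automatic `device` property and no `register_to_config`, so `use_fp16` and `device` are stored explicitly. A hypothetical construction under the new signature:

```py
# Hypothetical construction; any TTF file works as font_path.
text_embedding_module = TextEmbeddingModule(
    font_path="arial-unicode-ms.ttf", use_fp16=False, device="cpu"
)
```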
@@ -487,7 +570,10 @@ class TextEmbeddingModule(ModelMixin, ConfigMixin):
         # preprocess pos_imgs(if numpy, make sure it's white pos in black bg)
         if draw_pos is None:
             pos_imgs = np.zeros((w, h, 1))
-        if isinstance(draw_pos, str):
+        if isinstance(draw_pos, PIL.Image.Image):
+            pos_imgs = np.array(draw_pos)[..., ::-1]
+            pos_imgs = 255 - pos_imgs
+        elif isinstance(draw_pos, str):
             draw_pos = cv2.imread(draw_pos)[..., ::-1]
             if draw_pos is None:
                 raise ValueError(f"Can't read draw_pos image from {draw_pos}!")
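The new branch accepts a `PIL.Image.Image` directly: `[..., ::-1]` converts RGB to BGR to match the `cv2.imread` path, and `255 - pos_imgs` inverts, which implies PIL inputs carry dark position regions on a light background. A hypothetical mask under that reading:

```py
import numpy as np
from PIL import Image, ImageDraw

# Hypothetical PIL draw_pos: dark boxes on a light background; the branch
# above turns this into the white-pos-on-black array used internally.
mask = Image.new("RGB", (512, 512), "white")
ImageDraw.Draw(mask).rectangle([96, 128, 416, 224], fill="black")
pos_imgs = 255 - np.array(mask)[..., ::-1]  # what the new branch computes
print(pos_imgs.min(), pos_imgs.max())  # 0 255
```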
@@ -580,7 +666,7 @@ class TextEmbeddingModule(ModelMixin, ConfigMixin):

         self.embedding_manager.encode_text(text_info)
         negative_prompt_embeds = self.frozen_CLIP_embedder_t3.encode(
-            [negative_prompt], embedding_manager=self.embedding_manager
+            [negative_prompt or ""], embedding_manager=self.embedding_manager
         )

         return prompt_embeds, negative_prompt_embeds, text_info, np_hint
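The `or ""` guard normalizes a `None` (or empty) negative prompt to an empty string before encoding; a quick check:

```py
# None and "" both normalize to [""]; real prompts pass through unchanged.
for negative_prompt in (None, "", "low quality, blurry"):
    print([negative_prompt or ""])
```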
@@ -799,7 +885,8 @@ class AuxiliaryLatentModule(nn.Module):
         # get masked_x
         masked_img = ((edit_image.astype(np.float32) / 127.5) - 1.0) * (1 - np_hint)
         masked_img = np.transpose(masked_img, (2, 0, 1))
-        masked_img = torch.from_numpy(masked_img.copy()).float().to(self.device)
+        device = next(self.vae.parameters()).device
+        masked_img = torch.from_numpy(masked_img.copy()).float().to(device)
         if self.use_fp16:
             masked_img = masked_img.half()
         masked_x = (retrieve_latents(self.vae.encode(masked_img[None, ...])) * self.vae.config.scaling_factor).detach()
@@ -842,9 +929,9 @@ class AuxiliaryLatentModule(nn.Module):
             new_string += char + " " * nSpace
         return new_string[:-nSpace]

-    def to(self, device):
-        self.device = device
-        self.vae = self.vae.to(device)
+    def to(self, *args, **kwargs):
+        self.vae = self.vae.to(*args, **kwargs)
+        self.device = self.vae.device
         return self

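Accepting `*args, **kwargs` lets `to` forward dtype-only moves such as `.to(torch.float16)` as well as device moves, then mirror the VAE's resulting device. A minimal illustration of the forwarding pattern (standalone names, not from the commit):

```py
import torch
import torch.nn as nn

class Wrapper:
    def __init__(self):
        self.inner = nn.Linear(2, 2)

    def to(self, *args, **kwargs):
        # Forward device and/or dtype to the wrapped module, then mirror
        # its device, as AuxiliaryLatentModule.to now does with the VAE.
        self.inner = self.inner.to(*args, **kwargs)
        self.device = next(self.inner.parameters()).device
        return self

w = Wrapper().to(torch.float16)  # a dtype-only move now works
print(w.inner.weight.dtype, w.device)  # torch.float16 cpu
```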
@@ -969,6 +1056,9 @@ class AnyTextPipeline(
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPImageProcessor,
+        trust_remote_code: bool = False,
+        text_embedding_module: TextEmbeddingModule = None,
+        auxiliary_latent_module: AuxiliaryLatentModule = None,
         image_encoder: CLIPVisionModelWithProjection = None,
         requires_safety_checker: bool = True,
     ):
@@ -1877,6 +1967,7 @@ class AnyTextPipeline(
         text_encoder_lora_scale = (
             self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
         )
+        draw_pos = draw_pos.to(device=device) if isinstance(draw_pos, torch.Tensor) else draw_pos
         prompt_embeds, negative_prompt_embeds, text_info, np_hint = self.text_embedding_module(
             prompt,
             texts,
@@ -2035,7 +2126,7 @@ class AnyTextPipeline(
                     control_model_input,
                     t,
                     encoder_hidden_states=controlnet_prompt_embeds,
-                    guided_hint=guided_hint,
+                    controlnet_cond=guided_hint,
                     conditioning_scale=cond_scale,
                     guess_mode=guess_mode,
                     return_dict=False,
@@ -2116,3 +2207,9 @@ class AnyTextPipeline(
             return (image, has_nsfw_concept)

         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.text_embedding_module.to(*args, **kwargs)
+        self.auxiliary_latent_module.to(*args, **kwargs)
+        return self
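`text_embedding_module` and `auxiliary_latent_module` are plain attributes rather than registered pipeline components, so the stock `DiffusionPipeline.to` would leave them behind; the override forwards the move explicitly. Usage stays a single call (shown commented out since it downloads weights):

```py
# import torch
# from diffusers import DiffusionPipeline
# pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext",
#                                          trust_remote_code=True,
#                                          torch_dtype=torch.float16)
# pipe = pipe.to("cuda")  # now also moves both auxiliary modules
```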