ura23 committed
Commit dc7b29f · verified · 1 Parent(s): ce8d28d

Upload 3 files

tagger/tagger___init__.py ADDED
File without changes
tagger/tagger_common.py ADDED
@@ -0,0 +1,180 @@
+ import math
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Optional
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.utils import HfHubHTTPError
+ from PIL import Image
+ from torch import Tensor, nn
+
+
+ @dataclass
+ class Heatmap:
+     label: str
+     score: float
+     image: Image.Image
+
+
+ @dataclass
+ class LabelData:
+     names: list[str]
+     rating: list[np.int64]
+     general: list[np.int64]
+     character: list[np.int64]
+
+
+ @dataclass
+ class ImageLabels:
+     caption: str
+     booru: str
+     rating: dict[str, float]
+     general: dict[str, float]
+     character: dict[str, float]
+
+
+ @lru_cache(maxsize=5)
+ def load_labels_hf(
+     repo_id: str,
+     revision: Optional[str] = None,
+     token: Optional[str] = None,
+ ) -> LabelData:
+     try:
+         csv_path = hf_hub_download(
+             repo_id=repo_id, filename="selected_tags.csv", revision=revision, token=token
+         )
+         csv_path = Path(csv_path).resolve()
+     except HfHubHTTPError as e:
+         raise FileNotFoundError(f"selected_tags.csv failed to download from {repo_id}") from e
+
+     df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
+     tag_data = LabelData(
+         names=df["name"].tolist(),
+         rating=list(np.where(df["category"] == 9)[0]),
+         general=list(np.where(df["category"] == 0)[0]),
+         character=list(np.where(df["category"] == 4)[0]),
+     )
+
+     return tag_data
+
+
+ def mcut_threshold(probs: np.ndarray) -> float:
+     """
+     Maximum Cut Thresholding (MCut)
+     Largeron, C., Moulin, C., & Gery, M. (2012). MCut: A Thresholding Strategy
+     for Multi-label Classification. In 11th International Symposium, IDA 2012
+     (pp. 172-183).
+     """
+     # sort descending, then place the cutoff at the midpoint of the largest
+     # gap between adjacent scores
+     probs = probs[probs.argsort()[::-1]]
+     diffs = probs[:-1] - probs[1:]
+     idx = diffs.argmax()
+     thresh = (probs[idx] + probs[idx + 1]) / 2
+     return float(thresh)
+
+
+ def pil_ensure_rgb(image: Image.Image) -> Image.Image:
+     # convert to RGB/RGBA if not already (deals with palette images etc.)
+     if image.mode not in ["RGB", "RGBA"]:
+         image = image.convert("RGBA") if "transparency" in image.info else image.convert("RGB")
+     # convert RGBA to RGB with white background
+     if image.mode == "RGBA":
+         canvas = Image.new("RGBA", image.size, (255, 255, 255))
+         canvas.alpha_composite(image)
+         image = canvas.convert("RGB")
+     return image
+
+
+ def pil_pad_square(
+     image: Image.Image,
+     fill: tuple[int, int, int] = (255, 255, 255),
+ ) -> Image.Image:
+     w, h = image.size
+     # get the largest dimension so we can pad to a square
+     px = max(image.size)
+     # pad to square with white background
+     canvas = Image.new("RGB", (px, px), fill)
+     canvas.paste(image, ((px - w) // 2, (px - h) // 2))
+     return canvas
+
+
+ def preprocess_image(
+     image: Image.Image,
+     size_px: int | tuple[int, int],
+     upscale: bool = True,
+ ) -> Image.Image:
+     """
+     Preprocess an image to be square and centered on a white background.
+     """
+     if isinstance(size_px, int):
+         size_px = (size_px, size_px)
+
+     # ensure RGB and pad to square
+     image = pil_ensure_rgb(image)
+     image = pil_pad_square(image)
+
+     # resize to target size
+     if image.size[0] < size_px[0] or image.size[1] < size_px[1]:
+         if upscale is False:
+             raise ValueError("Image is smaller than target size, and upscaling is disabled")
+         image = image.resize(size_px, Image.LANCZOS)
+     if image.size[0] > size_px[0] or image.size[1] > size_px[1]:
+         image.thumbnail(size_px, Image.BICUBIC)
+
+     return image
+
+
+ def pil_make_grid(
+     images: list[Image.Image],
+     max_cols: int = 8,
+     padding: int = 4,
+     bg_color: tuple[int, int, int] = (40, 42, 54),  # dracula background color
+     partial_rows: bool = True,
+ ) -> Image.Image:
+     n_cols = min(math.floor(math.sqrt(len(images))), max_cols)
+     n_rows = math.ceil(len(images) / n_cols)
+
+     # if the final row is not full and partial_rows is False, remove a row
+     if n_cols * n_rows > len(images) and not partial_rows:
+         n_rows -= 1
+
+     # assumes all images are same size
+     image_width, image_height = images[0].size
+
+     canvas_width = ((image_width + padding) * n_cols) + padding
+     canvas_height = ((image_height + padding) * n_rows) + padding
+
+     canvas = Image.new("RGB", (canvas_width, canvas_height), bg_color)
+     for i, img in enumerate(images):
+         x = (i % n_cols) * (image_width + padding) + padding
+         y = (i // n_cols) * (image_height + padding) + padding
+         canvas.paste(img, (x, y))
+
+     return canvas
+
+
+ # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/blob/a9eacb1eff904552d3012babfa28b57e1d3e295c/tagger/ui.py#L368
+ kaomojis = [
+     "0_0",
+     "(o)_(o)",
+     "+_+",
+     "+_-",
+     "._.",
+     "<o>_<o>",
+     "<|>_<|>",
+     "=_=",
+     ">_<",
+     "3_3",
+     "6_9",
+     ">_o",
+     "@_@",
+     "^_^",
+     "o_o",
+     "u_u",
+     "x_x",
+     "|_|",
+     "||_||",
+ ]
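
For context, a minimal sketch of how these helpers fit together (not part of the commit; the repo id, file name, and 448 px input size below are placeholder assumptions for illustration):

    import numpy as np
    from PIL import Image

    from tagger.tagger_common import load_labels_hf, mcut_threshold, preprocess_image

    # download the tag list for a (hypothetical) WD tagger checkpoint
    labels = load_labels_hf("SmilingWolf/wd-vit-tagger-v3")

    # square-pad on white and resize to the model's expected input size
    image = preprocess_image(Image.open("input.png"), size_px=448)

    # MCut places the cutoff in the largest gap between sorted scores:
    # for [0.9, 0.85, 0.3, 0.1] the largest gap is 0.85 -> 0.3,
    # so the threshold is (0.85 + 0.3) / 2
    print(mcut_threshold(np.array([0.9, 0.85, 0.3, 0.1])))  # ~0.575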
tagger/tagger_model.py ADDED
@@ -0,0 +1,206 @@
+ import math
+ from pathlib import Path
+
+ import colorcet as cc
+ import cv2
+ import numpy as np
+ import timm
+ import torch
+ from PIL import Image
+ from matplotlib.colors import LinearSegmentedColormap
+ from timm.data import create_transform, resolve_data_config
+ from timm.models import VisionTransformer
+ from torch import Tensor, nn
+ from torch.nn import functional as F
+ from torchvision import transforms as T
+
+ # module name matches the file uploaded in this commit (tagger_common.py)
+ from .tagger_common import Heatmap, ImageLabels, LabelData, pil_make_grid
+
+ # working dir, either file parent dir or cwd if interactive
+ work_dir = (Path(__file__).parent if "__file__" in locals() else Path.cwd()).resolve()
+ temp_dir = work_dir.joinpath("temp")
+ temp_dir.mkdir(exist_ok=True, parents=True)
+
+ # model cache
+ model_cache: dict[str, VisionTransformer] = {}
+ transform_cache: dict[str, T.Compose] = {}
+
+ # device to use
+ torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class RGBtoBGR(nn.Module):
+     def forward(self, x: Tensor) -> Tensor:
+         if x.ndim == 4:
+             return x[:, [2, 1, 0], :, :]
+         return x[[2, 1, 0], :, :]
+
+
+ def model_device(model: nn.Module) -> torch.device:
+     return next(model.parameters()).device
+
+
+ def load_model(repo_id: str) -> VisionTransformer:
+     global model_cache
+
+     if model_cache.get(repo_id, None) is None:
+         # save model to cache
+         model_cache[repo_id] = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval().to(torch_device)
+
+     return model_cache[repo_id]
+
+
+ def load_model_and_transform(repo_id: str) -> tuple[VisionTransformer, T.Compose]:
+     global transform_cache
+     global model_cache
+
+     if model_cache.get(repo_id, None) is None:
+         # save model to cache
+         model_cache[repo_id] = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval()
+     model = model_cache[repo_id]
+
+     if transform_cache.get(repo_id, None) is None:
+         transforms = create_transform(**resolve_data_config(model.pretrained_cfg, model=model))
+         # hack in the RGBtoBGR transform, save to cache
+         transform_cache[repo_id] = T.Compose(transforms.transforms + [RGBtoBGR()])
+     transform = transform_cache[repo_id]
+
+     return model, transform
+
+
+ def get_tags(
+     probs: Tensor,
+     labels: LabelData,
+     gen_threshold: float,
+     char_threshold: float,
+ ):
+     # Convert indices+probs to labels
+     probs = list(zip(labels.names, probs.numpy()))
+
+     # First 4 labels are actually ratings
+     rating_labels = dict([probs[i] for i in labels.rating])
+
+     # General labels, pick any where prediction confidence > threshold
+     gen_labels = [probs[i] for i in labels.general]
+     gen_labels = dict([x for x in gen_labels if x[1] > gen_threshold])
+     gen_labels = dict(sorted(gen_labels.items(), key=lambda item: item[1], reverse=True))
+
+     # Character labels, pick any where prediction confidence > threshold
+     char_labels = [probs[i] for i in labels.character]
+     char_labels = dict([x for x in char_labels if x[1] > char_threshold])
+     char_labels = dict(sorted(char_labels.items(), key=lambda item: item[1], reverse=True))
+
+     # Combine general and character labels, sort by confidence
+     combined_names = [x for x in gen_labels]
+     combined_names.extend([x for x in char_labels])
+
+     # Convert to a string suitable for use as a training caption
+     # (double backslash so the parens survive as literal "\(" and "\)")
+     caption = ", ".join(combined_names).replace("(", "\\(").replace(")", "\\)")
+     booru = caption.replace("_", " ")
+
+     return caption, booru, rating_labels, char_labels, gen_labels
+
+
+ @torch.no_grad()
+ def render_heatmap(
+     image: Tensor,
+     gradients: Tensor,
+     image_feats: Tensor,
+     image_probs: Tensor,
+     image_labels: list[str],
+     cmap: LinearSegmentedColormap = cc.m_linear_bmy_10_95_c71,
+     pos_embed_dim: int = 784,
+     image_size: tuple[int, int] = (448, 448),
+     font_args: dict = {
+         "fontFace": cv2.FONT_HERSHEY_SIMPLEX,
+         "fontScale": 1,
+         "color": (255, 255, 255),
+         "thickness": 2,
+         "lineType": cv2.LINE_AA,
+     },
+     partial_rows: bool = True,
+ ) -> tuple[list[Heatmap], Image.Image]:
+     # hmap_dim = int(math.sqrt(pos_embed_dim))
+
+     # Grad-CAM style: weight patch features by their gradients, one map per label
+     image_hmaps = gradients.mean(2, keepdim=True).mul(image_feats.unsqueeze(0)).squeeze()
+     hmap_dim = int(math.sqrt(image_hmaps.mean(-1).numel() / len(image_labels)))
+     image_hmaps = image_hmaps.mean(-1).reshape(len(image_labels), -1)
+     # keep only the trailing patch tokens (drops class/register tokens)
+     image_hmaps = image_hmaps[..., -hmap_dim ** 2:]
+     image_hmaps = image_hmaps.reshape(len(image_labels), hmap_dim, hmap_dim)
+     # clamp negative contributions to zero
+     image_hmaps = image_hmaps.max(torch.zeros_like(image_hmaps))
+
+     image_hmaps /= image_hmaps.reshape(image_hmaps.shape[0], -1).max(-1)[0].unsqueeze(-1).unsqueeze(-1)
+     # normalize to 0-1
+     image_hmaps = torch.stack([(x - x.min()) / (x.max() - x.min()) for x in image_hmaps]).unsqueeze(1)
+     # interpolate to input image size
+     image_hmaps = F.interpolate(image_hmaps, size=image_size, mode="bilinear").squeeze(1)
+
+     hmap_imgs: list[Heatmap] = []
+     for tag, hmap, score in zip(image_labels, image_hmaps, image_probs.cpu()):
+         # undo the [-1, 1] input normalization to recover uint8 pixels
+         image_pixels = image.add(1).mul(127.5).squeeze().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
+         hmap_pixels = cmap(hmap.cpu().numpy(), bytes=True)[:, :, :3]
+
+         hmap_cv2 = cv2.cvtColor(hmap_pixels, cv2.COLOR_RGB2BGR)
+         hmap_image = cv2.addWeighted(image_pixels, 0.5, hmap_cv2, 0.5, 0)
+         if tag is not None:
+             cv2.putText(hmap_image, tag, (10, 30), **font_args)
+             cv2.putText(hmap_image, f"{score:.3f}", org=(10, 60), **font_args)
+
+         hmap_pil = Image.fromarray(cv2.cvtColor(hmap_image, cv2.COLOR_BGR2RGB))
+         hmap_imgs.append(Heatmap(tag, score.item(), hmap_pil))
+
+     hmap_imgs = sorted(hmap_imgs, key=lambda x: x.score, reverse=True)
+     hmap_grid = pil_make_grid([x.image for x in hmap_imgs], partial_rows=partial_rows)
+
+     return hmap_imgs, hmap_grid
+
+
+ def process_heatmap(
+     model: VisionTransformer,
+     image: Tensor,
+     labels: LabelData,
+     threshold: float = 0.5,
+     partial_rows: bool = True,
+ ) -> tuple[list[Heatmap], Image.Image, ImageLabels]:
+     torch_device = model_device(model)
+
+     with torch.set_grad_enabled(True):
+         features = model.forward_features(image.to(torch_device))
+         probs = model.forward_head(features)
+         probs = F.sigmoid(probs).squeeze(0)
+
+         probs_mask = probs > threshold
+         heatmap_probs = probs[probs_mask]
+
+         label_indices = torch.nonzero(probs_mask, as_tuple=False).squeeze(1)
+         image_labels = [labels.names[label_indices[i]] for i in range(len(label_indices))]
+
+         # one gradient per selected label, batched through an identity matrix
+         eye = torch.eye(heatmap_probs.shape[0], device=torch_device)
+         grads = torch.autograd.grad(
+             outputs=heatmap_probs,
+             inputs=features,
+             grad_outputs=eye,
+             is_grads_batched=True,
+             retain_graph=True,
+         )
+         grads = grads[0].detach().requires_grad_(False)[:, 0, :, :].unsqueeze(1)
+
+     with torch.set_grad_enabled(False):
+         hmap_imgs, hmap_grid = render_heatmap(
+             image=image,
+             gradients=grads,
+             image_feats=features,
+             image_probs=heatmap_probs,
+             image_labels=image_labels,
+             partial_rows=partial_rows,
+         )
+
+     caption, booru, ratings, character, general = get_tags(
+         probs=probs.detach().cpu(),  # detach: .numpy() in get_tags rejects grad-tracking tensors
+         labels=labels,
+         gen_threshold=threshold,
+         char_threshold=threshold,
+     )
+     labels = ImageLabels(caption, booru, ratings, general, character)
+
+     return hmap_imgs, hmap_grid, labels
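
Taken together, the two modules form a load → preprocess → classify → explain pipeline. A minimal end-to-end sketch (not part of the commit; the repo id, file names, and 0.35 threshold are placeholder assumptions, and whether to square-pad before the timm transform is an inference from the helpers above):

    from PIL import Image

    from tagger.tagger_common import load_labels_hf, preprocess_image
    from tagger.tagger_model import load_model_and_transform, process_heatmap

    repo_id = "SmilingWolf/wd-vit-tagger-v3"  # placeholder, assumed timm-compatible
    model, transform = load_model_and_transform(repo_id)
    labels = load_labels_hf(repo_id)

    # square-pad first, then apply the model's own transform (which appends RGBtoBGR)
    image = preprocess_image(Image.open("input.png"), size_px=448)
    inputs = transform(image).unsqueeze(0)  # add batch dimension

    heatmaps, grid, tags = process_heatmap(model, inputs, labels, threshold=0.35)
    grid.save("heatmap_grid.png")
    print(tags.caption)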