add seg
- app.py +4 -3
- demo/demos.py +25 -0
- demo/model.py +102 -1
- requirements.txt +2 -1
- seger.py +283 -0
app.py
CHANGED
@@ -8,7 +8,7 @@ os.system('mim install mmcv-full==1.7.0')
 
 from demo.model import Model_all
 import gradio as gr
-from demo.demos import create_demo_keypose, create_demo_sketch, create_demo_draw
+from demo.demos import create_demo_keypose, create_demo_sketch, create_demo_draw, create_demo_seg
 import torch
 import subprocess
 import shlex
@@ -22,6 +22,7 @@ urls = {
 urls_mmpose = [
     'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth',
     'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth',
+    'https://github.com/kazuto1011/deeplab-pytorch/releases/download/v1.0/deeplabv2_resnet101_msc-cocostuff164k-100000.pth'
 ]
 if os.path.exists('models') == False:
     os.mkdir('models')
@@ -69,7 +70,7 @@ with gr.Blocks(css='style.css') as demo:
             create_demo_sketch(model.process_sketch)
         with gr.TabItem('Draw'):
             create_demo_draw(model.process_draw)
+        with gr.TabItem('Segmentation'):
+            create_demo_seg(model.process_seg)
 
-# demo.queue(api_open=False).launch(server_name='0.0.0.0')
-# demo.queue(show_api=False, enable_queue=False).launch(server_name='0.0.0.0')
 demo.queue().launch(debug=True, server_name='0.0.0.0')
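The app changes do two things: register the new Segmentation tab, and append the DeepLabV2 COCO-Stuff checkpoint to urls_mmpose so it lands in models/ at startup alongside the detection and pose weights. The download loop itself sits outside these hunks; a minimal sketch of the pattern, assuming the wget-via-subprocess style suggested by the subprocess/shlex imports (hypothetical, not the verbatim loop):

# Hypothetical sketch of the startup download; the real loop in app.py
# is outside this hunk. urls_mmpose is the list defined above.
import os, subprocess, shlex

for url in urls_mmpose:
    target = os.path.join('models', url.split('/')[-1])
    if not os.path.exists(target):
        subprocess.run(shlex.split(f'wget {url} -O {target}'))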
demo/demos.py
CHANGED
@@ -70,6 +70,31 @@ def create_demo_sketch(process):
         run_button.click(fn=process, inputs=ips, outputs=[result])
     return demo
 
+def create_demo_seg(process):
+    with gr.Blocks() as demo:
+        with gr.Row():
+            gr.Markdown('## T2I-Adapter (Segmentation)')
+        with gr.Row():
+            with gr.Column():
+                input_img = gr.Image(source='upload', type="numpy")
+                prompt = gr.Textbox(label="Prompt")
+                neg_prompt = gr.Textbox(label="Negative Prompt",
+                                        value='ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face')
+                pos_prompt = gr.Textbox(label="Positive Prompt",
+                                        value='crafted, elegant, meticulous, magnificent, maximum details, extremely hyper aesthetic, intricately detailed')
+                with gr.Row():
+                    type_in = gr.inputs.Radio(['Segmentation', 'Image'], type="value", default='Image', label='You can input an image or a segmentation. If you choose to input a segmentation, it must correspond to the coco-stuff')
+                    run_button = gr.Button(label="Run")
+                con_strength = gr.Slider(label="Controling Strength (The guidance strength of the segmentation to the result)", minimum=0, maximum=1, value=0.4, step=0.1)
+                scale = gr.Slider(label="Guidance Scale (Classifier free guidance)", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
+                fix_sample = gr.inputs.Radio(['True', 'False'], type="value", default='False', label='Fix Sampling\n (Fix the random seed)')
+                base_model = gr.inputs.Radio(['sd-v1-4.ckpt', 'anything-v4.0-pruned.ckpt'], type="value", default='sd-v1-4.ckpt', label='The base model you want to use')
+            with gr.Column():
+                result = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
+        ips = [input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model]
+        run_button.click(fn=process, inputs=ips, outputs=[result])
+    return demo
+
 def create_demo_draw(process):
     with gr.Blocks() as demo:
         with gr.Row():
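create_demo_seg follows the same factory pattern as the other tabs: it builds a self-contained gr.Blocks, and the order of the nine components in ips must match the signature of the process callback (here Model_all.process_seg). A minimal smoke test with a stub callback, assuming the gradio 3.x API pinned by this Space (stub_process is hypothetical):

# Run the new tab standalone with a stub in place of Model_all.process_seg.
import gradio as gr
from demo.demos import create_demo_seg

def stub_process(input_img, type_in, prompt, neg_prompt, pos_prompt,
                 fix_sample, scale, con_strength, base_model):
    # Echo the input twice to fill the two-column gallery.
    return [input_img, input_img]

create_demo_seg(stub_process).launch()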
demo/model.py
CHANGED
@@ -13,7 +13,30 @@ from mmpose.apis import (inference_top_down_pose_model, init_pose_model, process
 import os
 import cv2
 import numpy as np
-
+from seger import seger, Colorize
+import torch.nn.functional as F
+
+def preprocessing(image, device):
+    # Resize
+    scale = 640 / max(image.shape[:2])
+    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
+    raw_image = image.astype(np.uint8)
+
+    # Subtract mean values
+    image = image.astype(np.float32)
+    image -= np.array(
+        [
+            float(104.008),
+            float(116.669),
+            float(122.675),
+        ]
+    )
+
+    # Convert to torch.Tensor and add "batch" axis
+    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
+    image = image.to(device)
+
+    return image, raw_image
 
 def imshow_keypoints(img,
                      pose_result,
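The copied preprocessing helper resizes the long side to 640 and subtracts per-channel means; the constants (104.008, 116.669, 122.675) appear to be the Caffe-style BGR means the deeplab-pytorch COCO-Stuff weights were trained with, so input is expected in OpenCV's BGR order, not RGB. A quick shape check (sketch):

# BGR uint8 image in, NCHW float tensor out; 640/max(H, W) scaling
# leaves a 480x640 input unchanged.
import numpy as np
bgr = np.zeros((480, 640, 3), dtype=np.uint8)
tensor, raw = preprocessing(bgr, 'cpu')
print(tensor.shape, raw.dtype)  # torch.Size([1, 3, 480, 640]) uint8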
@@ -118,6 +141,13 @@ class Model_all:
         self.model_edge.load_state_dict({k.replace('module.', ''): v for k, v in ckp.items()})
         self.model_edge.to(device)
 
+        # segmentation part
+        self.model_seger = seger().to(device)
+        self.model_seger.eval()
+        self.coler = Colorize(n=182)
+        self.model_seg = Adapter(cin=int(3*64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False).to(device)
+        self.model_seg.load_state_dict(torch.load("models/t2iadapter_seg_sd14v1.pth", map_location=device))
+
         # keypose part
         self.model_pose = Adapter(cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True,
                                   use_conv=False).to(device)
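The new model_seg adapter is configured identically to the existing sketch and keypose adapters. cin=int(3*64) matches a 3-channel 512x512 condition map after the Adapter's internal 8x pixel-unshuffle (3 * 8^2 = 192 input channels), and channels=[320, 640, 1280, 1280] lines up with the four UNet feature scales; the pixel-unshuffle detail is inferred from the Adapter definition elsewhere in the repo, not from this diff. Note the attribute name self.coler: a misspelling of "colorer", but it matches its later use in process_seg.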
@@ -218,6 +248,77 @@ class Model_all:
 
         return [im_edge, x_samples_ddim]
 
+    @torch.no_grad()
+    def process_seg(self, input_img, type_in, prompt, neg_prompt, pos_prompt, fix_sample, scale,
+                    con_strength, base_model):
+        if self.current_base != base_model:
+            ckpt = os.path.join("models", base_model)
+            pl_sd = torch.load(ckpt, map_location="cuda")
+            if "state_dict" in pl_sd:
+                sd = pl_sd["state_dict"]
+            else:
+                sd = pl_sd
+            self.base_model.load_state_dict(sd, strict=False)
+            self.current_base = base_model
+            if 'anything' in base_model.lower():
+                self.load_vae()
+
+        con_strength = int((1 - con_strength) * 50)
+        if fix_sample == 'True':
+            seed_everything(42)
+        im = cv2.resize(input_img, (512, 512))
+
+        if type_in == 'Segmentation':
+            im_seg = im.copy()
+            im = img2tensor(im).unsqueeze(0) / 255.
+            labelmap = im.float()
+        elif type_in == 'Image':
+            im, _ = preprocessing(im, self.device)
+            _, _, H, W = im.shape
+
+            # Image -> Probability map
+            logits = self.model_seger(im)
+            logits = F.interpolate(logits, size=(H, W), mode="bilinear", align_corners=False)
+            probs = F.softmax(logits, dim=1)[0]
+            probs = probs.cpu().data.numpy()
+            labelmap = np.argmax(probs, axis=0)
+
+            labelmap = self.coler(labelmap)
+            labelmap = np.transpose(labelmap, (1, 2, 0))
+            labelmap = cv2.resize(labelmap, (512, 512))
+            labelmap = img2tensor(labelmap, bgr2rgb=False, float32=True) / 255.
+            im_seg = tensor2img(labelmap)
+            labelmap = labelmap.unsqueeze(0)
+
+        # extract condition features
+        c = self.base_model.get_learned_conditioning([prompt + ', ' + pos_prompt])
+        nc = self.base_model.get_learned_conditioning([neg_prompt])
+        features_adapter = self.model_seg(labelmap.to(self.device))
+        shape = [4, 64, 64]
+
+        # sampling
+        samples_ddim, _ = self.sampler.sample(S=50,
+                                              conditioning=c,
+                                              batch_size=1,
+                                              shape=shape,
+                                              verbose=False,
+                                              unconditional_guidance_scale=scale,
+                                              unconditional_conditioning=nc,
+                                              eta=0.0,
+                                              x_T=None,
+                                              features_adapter1=features_adapter,
+                                              mode='sketch',
+                                              con_strength=con_strength)
+
+        x_samples_ddim = self.base_model.decode_first_stage(samples_ddim)
+        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+        x_samples_ddim = x_samples_ddim.to('cpu')
+        x_samples_ddim = x_samples_ddim.permute(0, 2, 3, 1).numpy()[0]
+        x_samples_ddim = 255. * x_samples_ddim
+        x_samples_ddim = x_samples_ddim.astype(np.uint8)
+
+        return [im_seg, x_samples_ddim]
+
     @torch.no_grad()
     def process_draw(self, input_img, prompt, neg_prompt, pos_prompt, fix_sample, scale, con_strength, base_model):
         if self.current_base != base_model:
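process_seg mirrors process_sketch closely: it even passes mode='sketch' to the sampler, so the segmentation condition enters only through features_adapter1. The 0-1 con_strength slider is remapped to a step count over the 50 DDIM steps before sampling; exactly how sampler.sample consumes that integer is defined outside this diff. The mapping itself:

# Slider value -> step count passed to the sampler.
for s in (0.0, 0.4, 1.0):
    print(s, int((1 - s) * 50))  # 0.0 -> 50, 0.4 -> 30, 1.0 -> 0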
requirements.txt
CHANGED
@@ -15,4 +15,5 @@ kornia==0.6.8
 openmim
 mmpose
 mmdet
-psutil
+psutil
+blobfile
seger.py
ADDED
@@ -0,0 +1,283 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
from basicsr.utils import img2tensor, tensor2img

_BATCH_NORM = nn.BatchNorm2d
_BOTTLENECK_EXPANSION = 4

import blobfile as bf

def _list_image_files_recursively(data_dir):
    results = []
    for entry in sorted(bf.listdir(data_dir)):
        full_path = bf.join(data_dir, entry)
        ext = entry.split(".")[-1]
        if "." in entry and ext.lower() in ["jpg", "jpeg", "png", "gif"]:
            results.append(full_path)
        elif bf.isdir(full_path):
            results.extend(_list_image_files_recursively(full_path))
    return results
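import blobfile as bf and _list_image_files_recursively are dataset-listing utilities carried over with this file; nothing in the demo path calls them, but the import is what makes the new blobfile entry in requirements.txt necessary. img2tensor/tensor2img are likewise imported here without being used in this module.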
def uint82bin(n, count=8):
    """returns the binary of integer n, count refers to amount of bits"""
    return ''.join([str((n >> y) & 1) for y in range(count - 1, -1, -1)])


def labelcolormap(N):
    if N == 35:  # cityscape
        cmap = np.array([(0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (111, 74, 0), (81, 0, 81),
                         (128, 64, 128), (244, 35, 232), (250, 170, 160), (230, 150, 140), (70, 70, 70), (102, 102, 156), (190, 153, 153),
                         (180, 165, 180), (150, 100, 100), (150, 120, 90), (153, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220, 0),
                         (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
                         (0, 60, 100), (0, 0, 90), (0, 0, 110), (0, 80, 100), (0, 0, 230), (119, 11, 32), (0, 0, 142)],
                        dtype=np.uint8)
    else:
        cmap = np.zeros((N, 3), dtype=np.uint8)
        for i in range(N):
            r, g, b = 0, 0, 0
            id = i + 1  # let's give 0 a color
            for j in range(7):
                str_id = uint82bin(id)
                r = r ^ (np.uint8(str_id[-1]) << (7 - j))
                g = g ^ (np.uint8(str_id[-2]) << (7 - j))
                b = b ^ (np.uint8(str_id[-3]) << (7 - j))
                id = id >> 3
            cmap[i, 0] = r
            cmap[i, 1] = g
            cmap[i, 2] = b

    return cmap


class Colorize(object):
    def __init__(self, n=182):
        self.cmap = labelcolormap(n)

    def __call__(self, gray_image):
        size = gray_image.shape
        color_image = np.zeros((3, size[0], size[1]))

        for label in range(0, len(self.cmap)):
            mask = (label == gray_image)
            color_image[0][mask] = self.cmap[label][0]
            color_image[1][mask] = self.cmap[label][1]
            color_image[2][mask] = self.cmap[label][2]

        return color_image
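labelcolormap generates a deterministic palette by scattering the bits of each label id across the high bits of R, G and B, so consecutive ids get visually distinct colors. A worked check for label 0 (a sketch):

# Label 0 has id = 1 -> '00000001'. In round j=0 the last three bits
# ('1', '0', '0') land in bit 7 of R, G, B; then id >> 3 is 0 and all
# later rounds add nothing, so the color is (128, 0, 0).
cmap = labelcolormap(182)
print(cmap[0])  # [128   0   0]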
class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
        self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=1 - 0.999))

        if relu:
            self.add_module("relu", nn.ReLU())

class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else nn.Identity()
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)

class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )

class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))

class _ASPP(nn.Module):
    """
    Atrous spatial pyramid pooling (ASPP)
    """

    def __init__(self, in_ch, out_ch, rates):
        super(_ASPP, self).__init__()
        for i, rate in enumerate(rates):
            self.add_module(
                "c{}".format(i),
                nn.Conv2d(in_ch, out_ch, 3, 1, padding=rate, dilation=rate, bias=True),
            )

        for m in self.children():
            nn.init.normal_(m.weight, mean=0, std=0.01)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return sum([stage(x) for stage in self.children()])

class MSC(nn.Module):
    """
    Multi-scale inputs
    """

    def __init__(self, base, scales=None):
        super(MSC, self).__init__()
        self.base = base
        if scales:
            self.scales = scales
        else:
            self.scales = [0.5, 0.75]

    def forward(self, x):
        # Original
        logits = self.base(x)
        _, _, H, W = logits.shape
        interp = lambda l: F.interpolate(
            l, size=(H, W), mode="bilinear", align_corners=False
        )

        # Scaled
        logits_pyramid = []
        for p in self.scales:
            h = F.interpolate(x, scale_factor=p, mode="bilinear", align_corners=False)
            logits_pyramid.append(self.base(h))

        # Pixel-wise max
        logits_all = [logits] + [interp(l) for l in logits_pyramid]
        logits_max = torch.max(torch.stack(logits_all), dim=0)[0]

        return logits_max

class DeepLabV2(nn.Sequential):
    """
    DeepLab v2: Dilated ResNet + ASPP
    Output stride is fixed at 8
    """

    def __init__(self, n_classes=182, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]):
        super(DeepLabV2, self).__init__()
        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
        self.add_module("aspp", _ASPP(ch[5], n_classes, atrous_rates))

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, _ConvBnReLU.BATCH_NORM):
                m.eval()

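With the stem downsampling by 4 and layer3 by a further 2 (the later layers trade stride for dilation), the advertised output stride of 8 means a 512x512 input yields 64x64 logits, the same spatial grid as the 4x64x64 latent that process_seg samples. A quick shape check with random weights (sketch):

# Output stride 8: 512 / 8 = 64.
model = DeepLabV2(n_classes=182, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24])
model.eval()
with torch.no_grad():
    print(model(torch.randn(1, 3, 512, 512)).shape)  # torch.Size([1, 182, 64, 64])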
def preprocessing(image, device):
    # Resize
    scale = 640 / max(image.shape[:2])
    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
    raw_image = image.astype(np.uint8)

    # Subtract mean values
    image = image.astype(np.float32)
    image -= np.array(
        [
            float(104.008),
            float(116.669),
            float(122.675),
        ]
    )

    # Convert to torch.Tensor and add "batch" axis
    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
    image = image.to(device)

    return image, raw_image

# Model setup
def seger():
    model = MSC(
        base=DeepLabV2(
            n_classes=182, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]
        ),
        scales=[0.5, 0.75],
    )
    state_dict = torch.load('models/deeplabv2_resnet101_msc-cocostuff164k-100000.pth')
    model.load_state_dict(state_dict)  # to skip ASPP

    return model

if __name__ == '__main__':
    device = 'cuda'
    model = seger()
    model.to(device)
    model.eval()
    with torch.no_grad():
        im = cv2.imread('/group/30042/chongmou/ft_local/Diffusion/baselines/SPADE/datasets/coco_stuff/val_img/000000000785.jpg', cv2.IMREAD_COLOR)
        im, raw_im = preprocessing(im, 'cuda')
        _, _, H, W = im.shape

        # Image -> Probability map
        logits = model(im)
        logits = F.interpolate(logits, size=(H, W), mode="bilinear", align_corners=False)
        probs = F.softmax(logits, dim=1)[0]
        probs = probs.cpu().data.numpy()
        labelmap = np.argmax(probs, axis=0)
        print(labelmap.shape, np.max(labelmap), np.min(labelmap))
        cv2.imwrite('mask.png', labelmap)
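One caveat in the self-test: mask.png stores raw class indices (0-181), which renders as an almost-black image. To inspect the prediction visually, the same Colorize class the demo uses can be applied first (a sketch):

# Save a color-coded mask instead of raw class indices.
color = Colorize(n=182)(labelmap)                # (3, H, W) float array
color = np.transpose(color, (1, 2, 0)).astype(np.uint8)
cv2.imwrite('mask_color.png', color)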