Spaces:

naver-iv
/

ZIM_demo_with_SAM

Sleeping

File size: 13,461 Bytes

"""
Copyright (c) 2024-present Naver Cloud Corp.
This source code is based on code from the Segment Anything Model (SAM)
(https://github.com/facebookresearch/segment-anything).

This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os, sys
sys.path.append(os.getcwd())

# Gradio demo, comparison SAM vs ZIM
import os
import torch
import gradio as gr
from gradio_image_prompter import ImagePrompter
import numpy as np
import cv2
from zim import zim_model_registry, ZimPredictor, ZimAutomaticMaskGenerator
from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
from zim.utils import show_mat_anns

from huggingface_hub import hf_hub_download

def get_shortest_axis(image):
    h, w, _ = image.shape
    return h if h < w else w

def reset_image(image, prompts):
    if image is None:
        image = np.zeros((1024, 1024, 3), dtype=np.uint8)
    else:
        image = image['image']
    zim_predictor.set_image(image)
    sam_predictor.set_image(image)
    prompts = dict()
    black = np.zeros(image.shape[:2], dtype=np.uint8)

    return (image, image, image, image, black, black, black, black, prompts)

def reset_example_image(image, prompts):
    if image is None:
        image = np.zeros((1024, 1024, 3), dtype=np.uint8)

    zim_predictor.set_image(image)
    sam_predictor.set_image(image)
    prompts = dict()
    black = np.zeros(image.shape[:2], dtype=np.uint8)

    image_dict = {}
    image_dict['image'] = image
    image_dict['prompts'] = prompts

    return (image, image_dict, image, image, image, black, black, black, black, prompts)

def run_amg(image):
    gr.Info('Checkout ZIM Auto Mask tab.', duration=3)
    zim_masks = zim_mask_generator.generate(image)
    zim_masks_vis = show_mat_anns(image, zim_masks)
    
    sam_masks = sam_mask_generator.generate(image)
    sam_masks_vis = show_mat_anns(image, sam_masks)
    
    return zim_masks_vis, sam_masks_vis
    
    
def run_model(image, prompts):
    if not prompts:
        raise gr.Error(f'Please input any point or BBox')
    gr.Info('Checkout ZIM Mask tab.', duration=3)
    point_coords = None
    point_labels = None
    boxes = None

    if "point" in prompts:
        point_coords, point_labels = [], []

        for type, pts in prompts["point"]:
            point_coords.append(pts)
            point_labels.append(type)
        point_coords = np.array(point_coords)
        point_labels = np.array(point_labels)

    if "bbox" in prompts:
        boxes = prompts['bbox']
        boxes = np.array(boxes)

    if "scribble" in prompts:
        point_coords, point_labels = [], []

        for pts in prompts["scribble"]:
            point_coords.append(np.flip(pts))
            point_labels.append(1)
        if len(point_coords) == 0:
            raise gr.Error("Please input any scribbles.")
        point_coords = np.array(point_coords)
        point_labels = np.array(point_labels)

    # run ZIM
    zim_mask, _, _ = zim_predictor.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        box=boxes,
        multimask_output=False,
    )
    zim_mask = np.squeeze(zim_mask, axis=0)
    zim_mask = np.uint8(zim_mask * 255)

    # run SAM
    sam_mask, _, _ = sam_predictor.predict(
        point_coords=point_coords,
        point_labels=point_labels,
        box=boxes,
        multimask_output=False,
        )
    sam_mask = np.squeeze(sam_mask, axis=0)
    sam_mask = np.uint8(sam_mask * 255)
    
    return zim_mask, sam_mask

def reset_scribble(image, scribble, prompts):
    # scribble = dict()
    for k in prompts.keys():
        prompts[k] = []

    for k, v in scribble.items():
        scribble[k] = None

    black = np.zeros(image.shape[:3], dtype=np.uint8)

    return scribble, black, black

def update_scribble(image, scribble, prompts):
    if "point" in prompts:
        del prompts["point"]

    if "bbox" in prompts:
        del prompts["bbox"]
    
    prompts = dict() # reset prompt
    scribble_mask = scribble["layers"][0][..., -1] > 0

    scribble_coords = np.argwhere(scribble_mask)
    n_points = min(len(scribble_coords), 24)
    indices = np.linspace(0, len(scribble_coords)-1, n_points, dtype=int)
    scribble_sampled = scribble_coords[indices]

    prompts["scribble"] = scribble_sampled
    
    zim_mask, sam_mask = run_model(image, prompts)

    return zim_mask, sam_mask, prompts


def draw_point(img, pt, size, color):
    # draw circle with white boundary region
    cv2.circle(img, (int(pt[0]), int(pt[1])), int(size * 1.3), (255, 255, 255), -1)
    cv2.circle(img, (int(pt[0]), int(pt[1])), int(size * 0.9), color, -1)


def draw_images(image, mask, prompts):
    if len(prompts) == 0 or mask.shape[1] == 1:
        return image, image, image

    minor = get_shortest_axis(image)
    size = int(minor / 80)

    image = np.float32(image)

    def blending(image, mask):
        mask = np.float32(mask) / 255
        blended_image = np.zeros_like(image, dtype=np.float32)
        blended_image[:, :, :] = [108, 0, 192]
        blended_image = (image * 0.5) + (blended_image * 0.5)
    
        img_with_mask = mask[:, :, None] * blended_image + (1 - mask[:, :, None]) * image
        img_with_mask = np.uint8(img_with_mask)

        return img_with_mask

    img_with_mask = blending(image, mask)
    img_with_point = img_with_mask.copy()

    if "point" in prompts:
        for type, pts in prompts["point"]:
            if type == "Positive":
                color = (0, 0, 255)
                draw_point(img_with_point, pts, size, color)
            elif type == "Negative":
                color = (255, 0, 0)
                draw_point(img_with_point, pts, size, color)

    size = int(minor / 200)

    return (
        img,
        img_with_mask,
    )

def get_point_or_box_prompts(img, prompts):
    image, img_prompts = img['image'], img['points']
    point_prompts = []
    box_prompts = []
    for prompt in img_prompts:
        for p in range(len(prompt)):
            prompt[p] = int(prompt[p])
        if prompt[2] == 2 and prompt[5] == 3:  # box prompt
            if len(box_prompts) != 0:
                raise gr.Error("Please input only one BBox.", duration=3)
            box_prompts.append([prompt[0], prompt[1], prompt[3], prompt[4]])
        elif prompt[2] == 1 and prompt[5] == 4:  # Positive point prompt
            point_prompts.append((1, (prompt[0], prompt[1])))
        elif prompt[2] == 0 and prompt[5] == 4:  # Negative point prompt
            point_prompts.append((0, (prompt[0], prompt[1])))

    if "scribble" in prompts:
        del prompts["scribble"]

    if len(point_prompts) > 0:
        prompts['point'] = point_prompts
    elif 'point' in prompts:
        del prompts['point']

    if len(box_prompts) > 0:
        prompts['bbox'] = box_prompts
    elif 'bbox' in prompts:
        del prompts['bbox']

    zim_mask, sam_mask = run_model(image, prompts)

    return image, zim_mask, sam_mask, prompts

def get_examples():
    assets_dir = os.path.join(os.path.dirname(__file__), 'examples')
    images = os.listdir(assets_dir)
    return [os.path.join(assets_dir, img) for img in images]

def download_onnx_weights(repo_id="naver-iv/zim-anything-vitb", file_dir="zim_vit_b_2043"):
    hf_hub_download(repo_id=repo_id, filename=f"{file_dir}/encoder.onnx")
    filepath = hf_hub_download(repo_id=repo_id, filename=f"{file_dir}/decoder.onnx")

    return os.path.dirname(filepath)

    
if __name__ == "__main__":
    backbone = "vit_b"
    
    # load ZIM
    zim = zim_model_registry[backbone](checkpoint=download_onnx_weights())
    if torch.cuda.is_available():
        zim.cuda()
    zim_predictor = ZimPredictor(zim)
    zim_mask_generator = ZimAutomaticMaskGenerator(
        zim, 
        pred_iou_thresh=0.7, 
        points_per_batch=8,
        stability_score_thresh=0.9, 
    )

    # load SAM
    ckpt_sam = "ckpts/sam_vit_b_01ec64.pth"
    sam = sam_model_registry[backbone](checkpoint=ckpt_sam)
    if torch.cuda.is_available():
        sam.cuda()
    sam_predictor = SamPredictor(sam)
    sam_mask_generator = SamAutomaticMaskGenerator(
        sam,
        points_per_batch=8,
    )
    
    with gr.Blocks() as demo:
        gr.Markdown("# <center> [Demo] ZIM: Zero-Shot Image Matting for Anything")

        prompts = gr.State(dict())
        img = gr.Image(visible=False)
        example_image = gr.Image(visible=False)
        
        with gr.Row():
            with gr.Column():
                # Point and Bbox prompt
                with gr.Tab(label="Point or Box"):
                    img_with_point_or_box = ImagePrompter(
                        label="query image", 
                        sources="upload"
                    )
                    interactions = "Left Click (Pos) | Middle/Right Click (Neg) | Press Move (Box)"
                    gr.Markdown("<h3 style='text-align: center'> {} </h3>".format(interactions))
                    run_bttn = gr.Button("Run")
                    amg_bttn = gr.Button("Automatic Mask Generation")

                # Scribble prompt
                with gr.Tab(label="Scribble"):
                    img_with_scribble = gr.ImageEditor(
                        label="Scribble", 
                        brush=gr.Brush(colors=["#00FF00"], default_size=15),
                        sources="upload", 
                        transforms=None, 
                        layers=False
                    )
                    interactions = "Press Move (Scribble)"
                    gr.Markdown("<h3 style='text-align: center'> Step 1. Select Draw button </h3>")
                    gr.Markdown("<h3 style='text-align: center'> Step 2. {} </h3>".format(interactions))
                    scribble_bttn = gr.Button("Run")
                    scribble_reset_bttn = gr.Button("Reset Scribbles")
                    amg_scribble_bttn = gr.Button("Automatic Mask Generation")
                
                # Example image
                gr.Examples(get_examples(), inputs=[example_image])

            # with gr.Row():
            with gr.Column():
                with gr.Tab(label="ZIM Image"):
                    img_with_zim_mask = gr.Image(
                        label="ZIM Image", 
                        interactive=False
                    )

                with gr.Tab(label="ZIM Mask"):
                    zim_mask = gr.Image(
                        label="ZIM Mask", 
                        image_mode="L", 
                        interactive=False
                    )
                with gr.Tab(label="ZIM Auto Mask"):
                    zim_amg = gr.Image(
                        label="ZIM Auto Mask", 
                        interactive=False
                    )
                    
            with gr.Column():
                with gr.Tab(label="SAM Image"):
                    img_with_sam_mask = gr.Image(
                        label="SAM image", 
                        interactive=False
                    )

                with gr.Tab(label="SAM Mask"):
                    sam_mask = gr.Image(
                        label="SAM Mask", 
                        image_mode="L", 
                        interactive=False
                    )
                    
                with gr.Tab(label="SAM Auto Mask"):
                    sam_amg = gr.Image(
                        label="SAM Auto Mask", 
                        interactive=False
                    )
                    
        example_image.change(
            reset_example_image,
            [example_image, prompts],
            [
                img,
                img_with_point_or_box,
                img_with_scribble,
                img_with_zim_mask,
                img_with_sam_mask,
                zim_amg,
                sam_amg,
                zim_mask,
                sam_mask,
                prompts,
            ]
        )

        img_with_point_or_box.upload(
            reset_image,
            [img_with_point_or_box, prompts],
            [
                img,
                img_with_scribble,
                img_with_zim_mask,
                img_with_sam_mask,
                zim_amg,
                sam_amg,
                zim_mask,
                sam_mask,
                prompts,
            ],
        )

        amg_bttn.click(
            run_amg,
            [img],
            [zim_amg, sam_amg]
        )
        amg_scribble_bttn.click(
            run_amg,
            [img],
            [zim_amg, sam_amg]
        )
        
        run_bttn.click(
            get_point_or_box_prompts,
            [img_with_point_or_box, prompts],
            [img, zim_mask, sam_mask, prompts]
        )

        zim_mask.change(
            draw_images,
            [img, zim_mask, prompts],
            [
                img, img_with_zim_mask, 
            ],
        )
        sam_mask.change(
            draw_images,
            [img, sam_mask, prompts],
            [
                img, img_with_sam_mask, 
            ],
        )
        
        scribble_reset_bttn.click(
            reset_scribble,
            [img, img_with_scribble, prompts],
            [img_with_scribble, zim_mask, sam_mask],
        )
        scribble_bttn.click(
            update_scribble,
            [img, img_with_scribble, prompts],
            [zim_mask, sam_mask, prompts],
        )

    demo.queue()
    demo.launch()