dragonSwing committed
Commit 699405e · 1 Parent(s): eeef127

Revert app.py

Files changed (1):
  app.py (+180, -279)
app.py CHANGED
@@ -1,47 +1,96 @@
- import argparse
  import json
  import os
  import sys
  import tempfile
 
  import numpy as np
  import supervision as sv
- from groundingdino.util.inference import Model as DinoModel
- from imutils import paths
  from PIL import Image
- from segment_anything import sam_model_registry
  from segment_anything import SamAutomaticMaskGenerator
  from segment_anything import SamPredictor
  from supervision.detection.utils import xywh_to_xyxy
- from tqdm import tqdm
 
  sys.path.append("tag2text")
 
  from tag2text.models import tag2text
  from config import *
- from utils import detect, download_file_hf, segment, generate_tags, show_anns_sv
-
-
- def process(
-     tag2text_model,
-     grounding_dino_model,
-     sam_predictor,
-     sam_automask_generator,
-     image_path,
-     task,
-     prompt,
-     box_threshold,
-     text_threshold,
-     iou_threshold,
-     device,
-     output_dir=None,
-     save_mask=False,
- ):
-     detections = None
-     metadata = {"image": {}, "annotations": [], "assets": {}}
-
-     if save_mask:
-         metadata["assets"]["intermediate_mask"] = []
 
      try:
          # Load image
@@ -51,18 +100,17 @@ def process(
 
          # Extract image metadata
          filename = os.path.basename(image_path)
-         basename = os.path.splitext(filename)[0]
          h, w = image.shape[:2]
          metadata["image"]["file_name"] = filename
          metadata["image"]["width"] = w
          metadata["image"]["height"] = h
 
          # Generate tags
-         if task in ["auto", "detection"] and prompt == "":
              tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
              prompt = " . ".join(tags)
-             # print(f"Caption: {caption}")
-             # print(f"Tags: {tags}")
 
          # ToDo: Extract metadata
          metadata["image"]["caption"] = caption
@@ -70,6 +118,7 @@ def process(
 
          if prompt:
              metadata["prompt"] = prompt
 
          # Detect boxes
          if prompt != "":
@@ -82,21 +131,18 @@ def process(
                  iou_threshold=iou_threshold,
                  post_process=True,
              )
-
-             # Save detection image
-             if output_dir:
-                 # Draw boxes
-                 box_annotator = sv.BoxAnnotator()
-                 labels = [
-                     f"{phrases[i]} {detections.confidence[i]:0.2f}"
-                     for i in range(len(phrases))
-                 ]
-                 box_image = box_annotator.annotate(
-                     scene=image, detections=detections, labels=labels
-                 )
-                 box_image_path = os.path.join(output_dir, basename + "_detect.png")
-                 metadata["assets"]["detection"] = box_image_path
-                 Image.fromarray(box_image).save(box_image_path)
 
          # Segmentation
          if task in ["auto", "segment"]:
@@ -121,27 +167,18 @@ def process(
                  detections = sv.Detections(
                      xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
                  )
-
-             # Save annotated image
-             if output_dir:
-                 mask_annotator = sv.MaskAnnotator()
-                 mask_image, res = show_anns_sv(detections)
-                 annotated_image = mask_annotator.annotate(image, detections=detections)
-
-                 mask_image_path = os.path.join(output_dir, basename + "_mask.png")
-                 metadata["assets"]["mask"] = mask_image_path
-                 Image.fromarray(mask_image).save(mask_image_path)
-
-                 # Save annotation encoding from https://github.com/LUSSeg/ImageNet-S
-                 mask_enc_path = os.path.join(output_dir, basename + "_mask_enc.npy")
-                 np.save(mask_enc_path, res)
-                 metadata["assets"]["mask_enc"] = mask_enc_path
-
-                 annotated_image_path = os.path.join(
-                     output_dir, basename + "_annotate.png"
-                 )
-                 metadata["assets"]["annotate"] = annotated_image_path
-                 Image.fromarray(annotated_image).save(annotated_image_path)
 
          # ToDo: Extract metadata
          if detections:
@@ -164,222 +201,86 @@ def process(
              metadata["annotations"].append(annotation)
              i += 1
 
-             if output_dir and save_mask:
-                 mask_image_path = os.path.join(
-                     output_dir, f"{basename}_mask_{id}.png"
-                 )
-                 metadata["assets"]["intermediate_mask"].append(mask_image_path)
-                 Image.fromarray(mask * 255).save(mask_image_path)
-
-         if output_dir:
-             meta_file_path = os.path.join(output_dir, basename + "_meta.json")
-             with open(meta_file_path, "w") as fp:
-                 json.dump(metadata, fp)
-         else:
-             meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
-             meta_file_path = meta_file.name
-
-         return meta_file_path
-     except Exception as error:
-         raise ValueError(f"global exception: {error}")
-
-
- def main(args: argparse.Namespace) -> None:
-     device = args.device
-     prompt = args.prompt
-     task = args.task
-
-     tag2text_model = None
-     grounding_dino_model = None
-     sam_predictor = None
-     sam_automask_generator = None
-
-     box_threshold = args.box_threshold
-     text_threshold = args.text_threshold
-     iou_threshold = args.iou_threshold
-     save_mask = args.save_mask
-
-     # load model
-     if task in ["auto", "detection"] and prompt == "":
-         print("Loading Tag2Text model...")
-         tag2text_type = args.tag2text_type
-         tag2text_checkpoint = os.path.join(
-             abs_weight_dir, tag2text_dict[tag2text_type]["checkpoint_file"]
-         )
-         if not os.path.exists(tag2text_checkpoint):
-             print(f"Downloading weights for Tag2Text {tag2text_type} model")
-             os.system(
-                 f"wget {tag2text_dict[tag2text_type]['checkpoint_url']} -O {tag2text_checkpoint}"
-             )
-         tag2text_model = tag2text.tag2text_caption(
-             pretrained=tag2text_checkpoint,
-             image_size=384,
-             vit="swin_b",
-             delete_tag_index=delete_tag_index,
-         )
-         # threshold for tagging
-         # we reduce the threshold to obtain more tags
-         tag2text_model.threshold = 0.64
-         tag2text_model.to(device)
-         tag2text_model.eval()
-
-     if task in ["auto", "detection"] or prompt != "":
-         print("Loading Grounding Dino model...")
-         dino_type = args.dino_type
-         dino_checkpoint = os.path.join(
-             abs_weight_dir, dino_dict[dino_type]["checkpoint_file"]
-         )
-         dino_config_file = os.path.join(
-             abs_weight_dir, dino_dict[dino_type]["config_file"]
-         )
-         if not os.path.exists(dino_checkpoint):
-             print(f"Downloading weights for Grounding Dino {dino_type} model")
-             dino_repo_id = dino_dict[dino_type]["repo_id"]
-             download_file_hf(
-                 repo_id=dino_repo_id,
-                 filename=dino_dict[dino_type]["config_file"],
-                 cache_dir=weight_dir,
-             )
-             download_file_hf(
-                 repo_id=dino_repo_id,
-                 filename=dino_dict[dino_type]["checkpoint_file"],
-                 cache_dir=weight_dir,
-             )
-         grounding_dino_model = DinoModel(
-             model_config_path=dino_config_file,
-             model_checkpoint_path=dino_checkpoint,
-             device=device,
-         )
 
-     if task in ["auto", "segment"]:
-         print("Loading SAM...")
-         sam_type = args.sam_type
-         sam_checkpoint = os.path.join(
-             abs_weight_dir, sam_dict[sam_type]["checkpoint_file"]
-         )
-         if not os.path.exists(sam_checkpoint):
-             print(f"Downloading weights for SAM {sam_type}")
-             os.system(
-                 f"wget {sam_dict[sam_type]['checkpoint_url']} -O {sam_checkpoint}"
-             )
-         sam = sam_model_registry[sam_type](checkpoint=sam_checkpoint)
-         sam.to(device=device)
-         sam_predictor = SamPredictor(sam)
-         sam_automask_generator = SamAutomaticMaskGenerator(sam)
-
-     if not os.path.exists(args.input):
-         raise ValueError("The input directory doesn't exist!")
-     elif not os.path.isdir(args.input):
-         image_paths = [args.input]
-     else:
-         image_paths = paths.list_images(args.input)
-
-     os.makedirs(args.output, exist_ok=True)
-
-     with tqdm(image_paths) as pbar:
-         for image_path in pbar:
-             pbar.set_postfix_str(f"Processing {image_path}")
-             process(
-                 tag2text_model=tag2text_model,
-                 grounding_dino_model=grounding_dino_model,
-                 sam_predictor=sam_predictor,
-                 sam_automask_generator=sam_automask_generator,
-                 image_path=image_path,
-                 task=task,
-                 prompt=prompt,
-                 box_threshold=box_threshold,
-                 text_threshold=text_threshold,
-                 iou_threshold=iou_threshold,
-                 device=device,
-                 output_dir=args.output,
-                 save_mask=save_mask,
-             )
 
 
- if __name__ == "__main__":
-     if not os.path.exists(abs_weight_dir):
-         os.makedirs(abs_weight_dir, exist_ok=True)
 
-     parser = argparse.ArgumentParser(
-         description=(
-             "Runs automatic detection and mask generation on an input image or directory of images"
-         )
-     )
 
-     parser.add_argument(
-         "--input",
-         "-i",
-         type=str,
-         required=True,
-         help="Path to either a single input image or folder of images.",
-     )
-
-     parser.add_argument(
-         "--output",
-         "-o",
-         type=str,
-         required=True,
-         help=(
-             "Path to the directory where masks will be output."
-         ),
-     )
-
-     parser.add_argument(
-         "--sam-type",
-         type=str,
-         default=default_sam,
-         choices=sam_dict.keys(),
-         help="The type of SAM model to use for segmentation.",
-     )
-
-     parser.add_argument(
-         "--tag2text-type",
-         type=str,
-         default=default_tag2text,
-         choices=tag2text_dict.keys(),
-         help="The type of Tag2Text model to use for tag and caption generation.",
-     )
-
-     parser.add_argument(
-         "--dino-type",
-         type=str,
-         default=default_dino,
-         choices=dino_dict.keys(),
-         help="The type of Grounding Dino model to use for promptable object detection.",
-     )
-
-     parser.add_argument(
-         "--task",
-         help="Task to run",
-         default="auto",
-         choices=["auto", "detect", "segment"],
-         type=str,
-     )
-     parser.add_argument(
-         "--prompt",
-         help="Detection prompt",
-         default="",
-         type=str,
-     )
-
-     parser.add_argument(
-         "--box-threshold", type=float, default=0.25, help="box threshold"
-     )
-     parser.add_argument(
-         "--text-threshold", type=float, default=0.2, help="text threshold"
-     )
-     parser.add_argument(
-         "--iou-threshold", type=float, default=0.5, help="iou threshold"
-     )
-
-     parser.add_argument(
-         "--save-mask",
-         action="store_true",
-         default=False,
-         help="If True, save all intermediate masks.",
-     )
-     parser.add_argument(
-         "--device", type=str, default="cuda", help="The device to run generation on."
-     )
-     args = parser.parse_args()
-     main(args)
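
For reference, a minimal sketch of driving the removed CLI version above directly from Python; the Namespace fields mirror the argparse flags and defaults defined above, while the input and output paths are only illustrative:

    from argparse import Namespace

    # Hypothetical invocation of the removed entry point; each field mirrors
    # a parser flag above, using its documented default where one exists.
    args = Namespace(
        input="examples/dog.png",   # a single image or a folder of images
        output="outputs",           # directory where masks and metadata are written
        task="auto",
        prompt="",
        sam_type=default_sam,
        tag2text_type=default_tag2text,
        dino_type=default_dino,
        box_threshold=0.25,
        text_threshold=0.2,
        iou_threshold=0.5,
        save_mask=False,
        device="cuda",
    )
    main(args)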
 
 
  import json
  import os
+ import subprocess
  import sys
  import tempfile
 
+ import gradio as gr
  import numpy as np
  import supervision as sv
+ import torch
  from PIL import Image
+ from segment_anything import build_sam
  from segment_anything import SamAutomaticMaskGenerator
  from segment_anything import SamPredictor
+ from supervision.detection.utils import mask_to_polygons
  from supervision.detection.utils import xywh_to_xyxy
+
+ if os.environ.get('IS_MY_DEBUG') is None:
+     result = subprocess.run(['pip', 'install', '-e', 'GroundingDINO'], check=True)
+     print(f'pip install GroundingDINO = {result}')
 
  sys.path.append("tag2text")
+ sys.path.append("GroundingDINO")
 
+ from groundingdino.util.inference import Model as DinoModel
  from tag2text.models import tag2text
  from config import *
+ from utils import download_file_hf, detect, segment, show_anns, generate_tags
+
+ if not os.path.exists(abs_weight_dir):
+     os.makedirs(abs_weight_dir, exist_ok=True)
+
+ sam_checkpoint = os.path.join(abs_weight_dir, sam_dict[default_sam]["checkpoint_file"])
+ if not os.path.exists(sam_checkpoint):
+     os.system(f"wget {sam_dict[default_sam]['checkpoint_url']} -O {sam_checkpoint}")
+
+ tag2text_checkpoint = os.path.join(
+     abs_weight_dir, tag2text_dict[default_tag2text]["checkpoint_file"]
+ )
+ if not os.path.exists(tag2text_checkpoint):
+     os.system(
+         f"wget {tag2text_dict[default_tag2text]['checkpoint_url']} -O {tag2text_checkpoint}"
+     )
 
+ dino_checkpoint = os.path.join(
+     abs_weight_dir, dino_dict[default_dino]["checkpoint_file"]
+ )
+ dino_config_file = os.path.join(abs_weight_dir, dino_dict[default_dino]["config_file"])
+ if not os.path.exists(dino_checkpoint):
+     dino_repo_id = dino_dict[default_dino]["repo_id"]
+     download_file_hf(
+         repo_id=dino_repo_id,
+         filename=dino_dict[default_dino]["config_file"],
+         cache_dir=weight_dir,
+     )
+     download_file_hf(
+         repo_id=dino_repo_id,
+         filename=dino_dict[default_dino]["checkpoint_file"],
+         cache_dir=weight_dir,
+     )
+
+ # load model
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tag2text_model = tag2text.tag2text_caption(
+     pretrained=tag2text_checkpoint,
+     image_size=384,
+     vit="swin_b",
+     delete_tag_index=delete_tag_index,
+ )
+ # threshold for tagging
+ # we reduce the threshold to obtain more tags
+ tag2text_model.threshold = 0.64
+ tag2text_model.to(device)
+ tag2text_model.eval()
+
+
+ sam = build_sam(checkpoint=sam_checkpoint)
+ sam.to(device=device)
+ sam_predictor = SamPredictor(sam)
+ sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+ grounding_dino_model = DinoModel(
+     model_config_path=dino_config_file,
+     model_checkpoint_path=dino_checkpoint,
+     device=device,
+ )
+
+
+ def process(image_path, task, prompt, box_threshold, text_threshold, iou_threshold):
+     global tag2text_model, sam_predictor, sam_automask_generator, grounding_dino_model, device
+     output_gallery = []
+     detections = None
+     metadata = {"image": {}, "annotations": []}
 
      try:
          # Load image
 
 
          # Extract image metadata
          filename = os.path.basename(image_path)
          h, w = image.shape[:2]
          metadata["image"]["file_name"] = filename
          metadata["image"]["width"] = w
          metadata["image"]["height"] = h
 
          # Generate tags
+         if task in ["auto", "detect"] and prompt == "":
              tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
              prompt = " . ".join(tags)
+             print(f"Caption: {caption}")
+             print(f"Tags: {tags}")
 
          # ToDo: Extract metadata
          metadata["image"]["caption"] = caption
 
          if prompt:
              metadata["prompt"] = prompt
+             print(f"Prompt: {prompt}")
 
          # Detect boxes
          if prompt != "":
                  iou_threshold=iou_threshold,
                  post_process=True,
              )
+             print(phrases)
+
+             # Draw boxes
+             box_annotator = sv.BoxAnnotator()
+             labels = [
+                 f"{phrases[i]} {detections.confidence[i]:0.2f}"
+                 for i in range(len(phrases))
+             ]
+             image = box_annotator.annotate(
+                 scene=image, detections=detections, labels=labels
+             )
+             output_gallery.append(image)
 
          # Segmentation
          if task in ["auto", "segment"]:
                  detections = sv.Detections(
                      xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
                  )
+             # opacity = 0.4
+             # mask_image, _ = show_anns_sam(masks)
+             # annotated_image = np.uint8(mask_image * opacity + image * (1 - opacity))
+
+             mask_annotator = sv.MaskAnnotator()
+             mask_image = np.zeros_like(image, dtype=np.uint8)
+             mask_image = mask_annotator.annotate(
+                 mask_image, detections=detections, opacity=1
+             )
+             annotated_image = mask_annotator.annotate(image, detections=detections)
+             output_gallery.append(mask_image)
+             output_gallery.append(annotated_image)
 
          # ToDo: Extract metadata
          if detections:
              metadata["annotations"].append(annotation)
              i += 1
 
+         meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+         meta_file_path = meta_file.name
+         with open(meta_file_path, "w") as fp:
+             json.dump(metadata, fp)
 
+         return output_gallery, meta_file_path
+     except Exception as error:
+         raise gr.Error(f"global exception: {error}")
 
 
+ title = "Annotate Anything"
 
+ with gr.Blocks(css="style.css", title=title) as demo:
+     with gr.Row(elem_classes=["container"]):
+         with gr.Column(scale=1):
+             input_image = gr.Image(type="filepath", label="Input")
+             task = gr.Dropdown(
+                 ["detect", "segment", "auto"], value="auto", label="task_type"
+             )
+             text_prompt = gr.Textbox(
+                 label="Detection Prompt",
+                 info="To detect multiple objects, separate each name with '.', like this: cat . dog . chair",
+             )
+             with gr.Accordion("Advanced parameters", open=False):
+                 box_threshold = gr.Slider(
+                     minimum=0,
+                     maximum=1,
+                     value=0.3,
+                     step=0.05,
+                     label="Box threshold",
+                     info="Confidence threshold for the detected boxes",
+                 )
+                 text_threshold = gr.Slider(
+                     minimum=0,
+                     maximum=1,
+                     value=0.25,
+                     step=0.05,
+                     label="Text threshold",
+                     info="Confidence threshold for matching boxes to prompt phrases",
+                 )
+                 iou_threshold = gr.Slider(
+                     minimum=0,
+                     maximum=1,
+                     value=0.5,
+                     step=0.05,
+                     label="IOU threshold",
+                     info="IoU threshold for non-maximum suppression of overlapping boxes",
+                 )
+             run_button = gr.Button(label="Run")
+
+         with gr.Column(scale=2):
+             gallery = gr.Gallery(
+                 label="Generated images", show_label=False, elem_id="gallery"
+             ).style(preview=True, grid=2, object_fit="scale-down")
+             meta_file = gr.File(label="Metadata file")
+
+     with gr.Row(elem_classes=["container"]):
+         gr.Examples(
+             [
+                 ["examples/dog.png", "auto", ""],
+                 ["examples/eiffel.png", "auto", ""],
+                 ["examples/eiffel.png", "segment", ""],
+                 ["examples/girl.png", "auto", "girl . face"],
+                 ["examples/horse.png", "detect", "horse"],
+                 ["examples/horses.jpg", "auto", "horse"],
+                 ["examples/traffic.jpg", "auto", ""],
+             ],
+             [input_image, task, text_prompt],
          )
+         run_button.click(
+             fn=process,
+             inputs=[
+                 input_image,
+                 task,
+                 text_prompt,
+                 box_threshold,
+                 text_threshold,
+                 iou_threshold,
+             ],
+             outputs=[gallery, meta_file],
          )
 
+ demo.queue(concurrency_count=2).launch()
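
For quick verification, a minimal sketch of calling the restored process() outside the Gradio UI; the image path is taken from the demo's own examples list and the thresholds match the slider defaults, so only the direct-call pattern itself is assumed:

    # Hypothetical smoke test; process() returns the preview gallery and the
    # path of the temporary JSON metadata file, as defined above.
    gallery, meta_path = process(
        image_path="examples/dog.png",
        task="auto",
        prompt="",
        box_threshold=0.3,
        text_threshold=0.25,
        iou_threshold=0.5,
    )
    print(f"{len(gallery)} preview images; metadata at {meta_path}")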