import time
from typing import Tuple

import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

MARKDOWN = """
# EfficientSAM vs. SAM
"""
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE)
SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
MASK_ANNOTATOR = sv.MaskAnnotator(
    color=sv.Color.red(),
    color_lookup=sv.ColorLookup.INDEX)


def annotate_image(image: np.ndarray, detections: sv.Detections) -> np.ndarray:
    # supervision annotators draw on the scene in place; copy the reversed
    # view so the caller's RGB image is not mutated through it
    bgr_image = image[:, :, ::-1].copy()
    annotated_bgr_image = MASK_ANNOTATOR.annotate(
        scene=bgr_image, detections=detections)
    return annotated_bgr_image[:, :, ::-1]


def efficient_sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    # placeholder: EfficientSAM inference is not wired up yet; simulate
    # latency and return the input image unchanged
    time.sleep(0.2)
    return image


def sam_inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> np.ndarray:
    # SamProcessor expects boxes nested as (batch, boxes_per_image, 4)
    input_boxes = [[[x_min, y_min, x_max, y_max]]]
    inputs = SAM_PROCESSOR(
        Image.fromarray(image),
        input_boxes=input_boxes,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = SAM_MODEL(**inputs)

    mask = SAM_PROCESSOR.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0][0].numpy()  # first image, first box, first of the candidate masks
    mask = mask[np.newaxis, ...]
    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
    return annotate_image(image=image, detections=detections)


def inference(
    image: np.ndarray,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    return (
        efficient_sam_inference(image, x_min, y_min, x_max, y_max),
        sam_inference(image, x_min, y_min, x_max, y_max)
    )


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab(label="Box prompt"):
        with gr.Row():
            with gr.Column():
                input_image = gr.Image()
                with gr.Accordion(label="Box", open=False):
                    with gr.Row():
                        x_min_number = gr.Number(label="x_min")
                        y_min_number = gr.Number(label="y_min")
                        x_max_number = gr.Number(label="x_max")
                        y_max_number = gr.Number(label="y_max")
            efficient_sam_output_image = gr.Image()
            sam_output_image = gr.Image()
        with gr.Row():
            submit_button = gr.Button("Submit")
        gr.Examples(
            fn=inference,
            examples=[
                [
                    'https://media.roboflow.com/notebooks/examples/dog.jpeg',
                    69,
                    247,
                    624,
                    930
                ]
            ],
            inputs=[input_image, x_min_number, y_min_number,
                    x_max_number, y_max_number],
            outputs=[efficient_sam_output_image, sam_output_image],
        )

    submit_button.click(
        efficient_sam_inference,
        inputs=[input_image, x_min_number, y_min_number,
                x_max_number, y_max_number],
        outputs=efficient_sam_output_image
    )
    submit_button.click(
        sam_inference,
        inputs=[input_image, x_min_number, y_min_number,
                x_max_number, y_max_number],
        outputs=sam_output_image
    )

demo.launch(debug=False, show_error=True)
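
# ------------------------------------------------------------------------------
# Sketch: wiring up real EfficientSAM inference.
#
# `efficient_sam_inference` above is a stub that only simulates latency. Below
# is a minimal sketch of what the real thing could look like, assuming the
# reference implementation from https://github.com/yformer/EfficientSAM is
# installed and its checkpoint is available. `build_efficient_sam_vitt`, the
# forward signature, and the box-as-corner-points encoding (label 2 = top-left,
# label 3 = bottom-right) follow that repo's examples and are assumptions here,
# not part of this app's tested code:
#
# from efficient_sam.build_efficient_sam import build_efficient_sam_vitt
# from torchvision.transforms.functional import to_tensor
#
# EFFICIENT_SAM_MODEL = build_efficient_sam_vitt().to(DEVICE).eval()
#
# def efficient_sam_inference(
#     image: np.ndarray,
#     x_min: int,
#     y_min: int,
#     x_max: int,
#     y_max: int
# ) -> np.ndarray:
#     image_tensor = to_tensor(image).to(DEVICE)
#     # a box prompt is encoded as its two corners, labeled 2 and 3
#     points = torch.tensor(
#         [[[[x_min, y_min], [x_max, y_max]]]],
#         dtype=torch.float32, device=DEVICE)
#     labels = torch.tensor([[[2, 3]]], device=DEVICE)
#     with torch.no_grad():
#         logits, iou = EFFICIENT_SAM_MODEL(
#             image_tensor[None, ...], points, labels)
#     # keep the candidate mask with the highest predicted IoU
#     best = torch.argmax(iou[0, 0]).item()
#     mask = (logits[0, 0, best] >= 0).cpu().numpy()[np.newaxis, ...]
#     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
#     return annotate_image(image=image, detections=detections)
#
# Dropping this in place of the stub keeps the existing `submit_button.click`
# wiring unchanged, since the signature matches the stub's.
# ------------------------------------------------------------------------------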