# -*- coding: utf-8 -*-

"""## hugging face funcs"""

import io

import matplotlib.pyplot as plt
import requests
import inflect
from PIL import Image


def load_image_from_url(url):
    return Image.open(requests.get(url, stream=True).raw)


def render_results_in_image(in_pil_img, in_results):
    plt.figure(figsize=(16, 10))
    plt.imshow(in_pil_img)
    ax = plt.gca()

    for prediction in in_results:
        x, y = prediction['box']['xmin'], prediction['box']['ymin']
        w = prediction['box']['xmax'] - prediction['box']['xmin']
        h = prediction['box']['ymax'] - prediction['box']['ymin']

        ax.add_patch(plt.Rectangle((x, y), w, h, fill=False, color="green", linewidth=2))
        ax.text(
            x,
            y,
            f"{prediction['label']}: {round(prediction['score']*100, 1)}%",
            color='red'
        )

    plt.axis("off")

    # Save the modified image to a BytesIO object
    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png', bbox_inches='tight', pad_inches=0)
    img_buf.seek(0)
    modified_image = Image.open(img_buf)

    # Close the plot to prevent it from being displayed
    plt.close()

    return modified_image


def summarize_predictions_natural_language(predictions):
    summary = {}
    p = inflect.engine()

    for prediction in predictions:
        label = prediction['label']
        if label in summary:
            summary[label] += 1
        else:
            summary[label] = 1

    result_string = "In this image, there are "
    for i, (label, count) in enumerate(summary.items()):
        count_string = p.number_to_words(count)
        result_string += f"{count_string} {label}"
        if count > 1:
            result_string += "s"

        result_string += " "

        if i == len(summary) - 2:
            result_string += "and "

    # Remove the trailing comma and space
    result_string = result_string.rstrip(', ') + "."

    return result_string
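# render_results_in_image and summarize_predictions_natural_language assume the
# output format of the transformers object-detection pipeline: a list of dicts
# with 'score', 'label' and a 'box' of pixel coordinates. A minimal sketch of
# that shape (the values below are made up for illustration, not real model output):
_example_detections = [
    {"score": 0.98, "label": "cat",
     "box": {"xmin": 40, "ymin": 70, "xmax": 310, "ymax": 330}},
    {"score": 0.91, "label": "remote",
     "box": {"xmin": 330, "ymin": 75, "xmax": 370, "ymax": 185}},
]
# e.g. summarize_predictions_natural_language(_example_detections)
# returns "In this image, there are one cat and one remote."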
##### To ignore warnings #####
import warnings
import logging
from transformers import logging as hf_logging


def ignore_warnings():
    # Ignore specific Python warnings
    warnings.filterwarnings("ignore", message="Some weights of the model checkpoint")
    warnings.filterwarnings("ignore", message="Could not find image processor class")
    warnings.filterwarnings("ignore", message="The `max_size` parameter is deprecated")

    # Adjust logging for libraries using the logging module
    logging.basicConfig(level=logging.ERROR)
    hf_logging.set_verbosity_error()


########
import numpy as np
import torch
import matplotlib.pyplot as plt


def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))


def show_boxes_on_image(raw_image, boxes):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points_on_image(raw_image, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    plt.axis('on')
    plt.show()


def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*',
               s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*',
               s=marker_size, edgecolor='white', linewidth=1.25)


def fig2img(fig):
    """Convert a Matplotlib figure to a PIL Image and return it"""
    import io
    buf = io.BytesIO()
    fig.savefig(buf)
    buf.seek(0)
    img = Image.open(buf)
    return img


def show_mask_on_image(raw_image, mask, return_image=False):
    if not isinstance(mask, torch.Tensor):
        mask = torch.Tensor(mask)

    if len(mask.shape) == 4:
        mask = mask.squeeze()

    fig, axes = plt.subplots(1, 1, figsize=(15, 15))

    mask = mask.cpu().detach()
    axes.imshow(np.array(raw_image))
    show_mask(mask, axes)
    axes.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)


def show_pipe_masks_on_image(raw_image, outputs, return_image=False):
    plt.imshow(np.array(raw_image))
    ax = plt.gca()
    for mask in outputs["masks"]:
        show_mask(mask, ax=ax, random_color=True)
    plt.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)
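# A quick, optional sanity check for the display helpers above, kept behind a
# flag so nothing extra runs on import. The blank image, the square mask and
# the point/box coordinates are synthetic placeholders, not model output.
RUN_HELPER_DEMO = False  # assumption: set to True to try the demo locally
if RUN_HELPER_DEMO:
    _demo_image = Image.new("RGB", (256, 256), color="white")
    _demo_mask = np.zeros((256, 256), dtype=np.float32)
    _demo_mask[64:192, 64:192] = 1.0  # a centred square "segment"
    # show_mask_on_image expects a single (H, W) mask per call
    show_mask_on_image(_demo_image, _demo_mask)
    # points are [x, y] pixel coordinates, boxes are [xmin, ymin, xmax, ymax]
    show_points_and_boxes_on_image(_demo_image, [[64, 64, 192, 192]], [[100, 100]])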
"""## imports"""

from transformers import pipeline
from transformers import SamModel, SamProcessor
from transformers import BlipForImageTextRetrieval
from transformers import AutoProcessor
from transformers.utils import logging

logging.set_verbosity_error()
# ignore_warnings()

import io
import matplotlib.pyplot as plt
import requests
import inflect
from PIL import Image
import os
import gradio as gr
import time

"""# Object detection

## hugging face model ("facebook/detr-resnet-50"), 167MB
"""

od_pipe = pipeline("object-detection", "facebook/detr-resnet-50")
chosen_model = pipeline("object-detection", "hustvl/yolos-small")

"""## gradio funcs"""

def get_object_detection_prediction(model_name, raw_image):
    model = od_pipe
    if "chosen-model" in model_name:
        model = chosen_model

    start = time.time()
    pipeline_output = model(raw_image)
    end = time.time()

    elapsed_result = f'{model_name} object detection elapsed {end-start} seconds'
    print(elapsed_result)

    processed_image = render_results_in_image(raw_image, pipeline_output)
    return [processed_image, elapsed_result]

"""# Image segmentation

## hugging face models: Zigeng/SlimSAM-uniform-77 (segmentation) 39MB, Intel/dpt-hybrid-midas (depth) 490MB
"""

hugging_face_segmentation_pipe = pipeline("mask-generation", "Zigeng/SlimSAM-uniform-77")
hugging_face_segmentation_model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")
hugging_face_segmentation_processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-77")
hugging_face_depth_estimator = pipeline(task="depth-estimation", model="Intel/dpt-hybrid-midas")

"""## chosen models: facebook/sam-vit-base (segmentation) 375MB, LiheYoung/depth-anything-small-hf (depth) 100MB"""

chosen_name = "facebook/sam-vit-base"
chosen_segmentation_pipe = pipeline("mask-generation", chosen_name)
chosen_segmentation_model = SamModel.from_pretrained(chosen_name)
chosen_segmentation_processor = SamProcessor.from_pretrained(chosen_name)
chosen_depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")

"""## gradio funcs"""

# Single point prompt (pixel coordinates) used for the pretrained SAM models
input_points = [[[1600, 700]]]

def segment_image_pretrained(model_name, raw_image):
    processor = hugging_face_segmentation_processor
    model = hugging_face_segmentation_model
    if "chosen" in model_name:
        processor = chosen_segmentation_processor
        model = chosen_segmentation_model

    start = time.time()
    inputs = processor(raw_image, input_points=input_points, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_masks = processor.image_processor.post_process_masks(
        outputs.pred_masks,
        inputs["original_sizes"],
        inputs["reshaped_input_sizes"]
    )
    predicted_mask = predicted_masks[0]
    end = time.time()

    elapsed_result = f'{model_name} pretrained image segmentation elapsed {end-start} seconds'
    print(elapsed_result)

    results = []
    for i in range(3):
        results.append(show_mask_on_image(raw_image, predicted_mask[:, i], return_image=True))
    results.append(elapsed_result)
    return results

def segment_image(model_name, raw_image):
    model = hugging_face_segmentation_pipe
    if "chosen" in model_name:
        print("chosen model used")
        model = chosen_segmentation_pipe

    start = time.time()
    output = model(raw_image, points_per_batch=32)
    end = time.time()

    elapsed_result = f'{model_name} raw image segmentation elapsed {end-start} seconds'
    print(elapsed_result)

    return [show_pipe_masks_on_image(raw_image, output, return_image=True), elapsed_result]

def depth_image(model_name, input_image):
    depth_estimator = hugging_face_depth_estimator
    print(model_name)
    if "chosen" in model_name:
        print("chosen model used")
        depth_estimator = chosen_depth_estimator

    start = time.time()
    out = depth_estimator(input_image)
    prediction = torch.nn.functional.interpolate(
        out["predicted_depth"].unsqueeze(0).unsqueeze(0),
        size=input_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    end = time.time()

    elapsed_result = f'{model_name} Depth Estimation elapsed {end-start} seconds'
    print(elapsed_result)

    output = prediction.squeeze().numpy()
    formatted = (output * 255 / np.max(output)).astype("uint8")
    depth = Image.fromarray(formatted)
    return [depth, elapsed_result]
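# The gradio callbacks above pick a model by substring match on the dropdown
# label: names containing "chosen" route to the alternative model, everything
# else falls back to the hugging-face default. A guarded sketch of calling them
# directly; the COCO sample URL is only an example input, not part of the app.
RUN_SEGMENTATION_DEMO = False  # assumption: set to True to try the demo locally
if RUN_SEGMENTATION_DEMO:
    _demo_img = load_image_from_url("http://images.cocodataset.org/val2017/000000039769.jpg")
    _masks_img, _seg_time = segment_image("hugging-face(Zigeng/SlimSAM-uniform-77)", _demo_img)
    _depth_img, _depth_time = depth_image("chosen-model(LiheYoung/depth-anything-small-hf)", _demo_img)
    print(_seg_time, _depth_time)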
"""# Image retrieval

## hugging face model: Salesforce/blip-itm-base-coco 900MB
"""

hugging_face_retrieval_model = BlipForImageTextRetrieval.from_pretrained(
    "Salesforce/blip-itm-base-coco")
hugging_face_retrieval_processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-itm-base-coco")

"""## chosen model: Salesforce/blip-itm-base-flickr 900MB"""

chosen_retrieval_model = BlipForImageTextRetrieval.from_pretrained(
    "Salesforce/blip-itm-base-flickr")
chosen_retrieval_processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-itm-base-flickr")

"""## gradio func"""

def retrieve_image(model_name, raw_image, predict_text):
    processor = hugging_face_retrieval_processor
    model = hugging_face_retrieval_model
    if "chosen" in model_name:
        processor = chosen_retrieval_processor
        model = chosen_retrieval_model

    # Time both the preprocessing and the model forward pass
    start = time.time()
    inputs = processor(images=raw_image, text=predict_text, return_tensors="pt")
    itm_scores = model(**inputs)[0]
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    end = time.time()

    elapsed_result = f"{model_name} image retrieval elapsed {end-start} seconds"
    print(elapsed_result)

    return [f"""\
The image and text are matched \
with a probability of {itm_score[0][1]:.4f}""", elapsed_result]
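# BlipForImageTextRetrieval returns one logit pair per image-text pair from its
# image-text-matching head; softmax over the last dimension gives
# [no-match, match] probabilities, which is why retrieve_image reads
# itm_score[0][1]. A guarded standalone sketch (caption and URL are arbitrary examples):
RUN_RETRIEVAL_DEMO = False  # assumption: set to True to try the demo locally
if RUN_RETRIEVAL_DEMO:
    _demo_img = load_image_from_url("http://images.cocodataset.org/val2017/000000039769.jpg")
    _inputs = hugging_face_retrieval_processor(
        images=_demo_img, text="two cats lying on a couch", return_tensors="pt")
    with torch.no_grad():
        _logits = hugging_face_retrieval_model(**_inputs)[0]  # shape (1, 2)
    _probs = torch.nn.functional.softmax(_logits, dim=1)
    print(f"match probability: {_probs[0][1]:.4f}")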
"""# gradio"""

with gr.Blocks() as object_detection_tab:
    gr.Markdown("# Detect objects on image")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            model_selector = gr.Dropdown(
                ["hugging-face(facebook/detr-resnet-50)", "chosen-model(hustvl/yolos-small)"],
                label="Select Model")
        with gr.Column():
            # Output components
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
            output_image = gr.Image(label="Output Image", type="pil")

    # Process button
    process_btn = gr.Button("Detect objects")

    # Connect the input components to the processing function
    process_btn.click(
        fn=get_object_detection_prediction,
        inputs=[model_selector, input_image],
        outputs=[output_image, elapsed_result]
    )

with gr.Blocks() as image_segmentation_detection_tab:
    gr.Markdown("# Image segmentation")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            model_selector = gr.Dropdown(
                ["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                label="Select Model")
        with gr.Column():
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
            # Output image
            output_image = gr.Image(label="Segmented image", type="pil")

    with gr.Row():
        with gr.Column():
            segment_btn = gr.Button("Segment image(not pretrained)")

    with gr.Row():
        elapsed_result_pretrained_segment = gr.Textbox(label="Seconds elapsed", lines=1)
        with gr.Column():
            segment_pretrained_output_image_1 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_2 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_3 = gr.Image(label="Segmented image by pretrained model", type="pil")

    with gr.Row():
        with gr.Column():
            segment_pretrained_model_selector = gr.Dropdown(
                ["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                label="Select Model")
            segment_pretrained_btn = gr.Button("Segment image(pretrained)")

    with gr.Row():
        with gr.Column():
            depth_output_image = gr.Image(label="Depth image", type="pil")
            elapsed_result_depth = gr.Textbox(label="Seconds elapsed", lines=1)

    with gr.Row():
        with gr.Column():
            depth_model_selector = gr.Dropdown(
                ["hugging-face(Intel/dpt-hybrid-midas)", "chosen-model(LiheYoung/depth-anything-small-hf)"],
                label="Select Model")
            depth_btn = gr.Button("Get image depth")

    segment_btn.click(
        fn=segment_image,
        inputs=[model_selector, input_image],
        outputs=[output_image, elapsed_result]
    )

    segment_pretrained_btn.click(
        fn=segment_image_pretrained,
        inputs=[segment_pretrained_model_selector, input_image],
        outputs=[segment_pretrained_output_image_1,
                 segment_pretrained_output_image_2,
                 segment_pretrained_output_image_3,
                 elapsed_result_pretrained_segment]
    )

    depth_btn.click(
        fn=depth_image,
        inputs=[depth_model_selector, input_image],
        outputs=[depth_output_image, elapsed_result_depth]
    )

with gr.Blocks() as image_retrieval_tab:
    gr.Markdown("# Check if text describes image")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            text_prediction = gr.TextArea(label="Describe image")
            model_selector = gr.Dropdown(
                ["hugging-face(Salesforce/blip-itm-base-coco)", "chosen-model(Salesforce/blip-itm-base-flickr)"],
                label="Select Model")
        with gr.Column():
            # Output components
            output_result = gr.Textbox(label="Probability result", lines=3)
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)

    # Process button
    process_btn = gr.Button("Check if text matches image")

    # Connect the input components to the processing function
    process_btn.click(
        fn=retrieve_image,
        inputs=[model_selector, input_image, text_prediction],
        outputs=[output_result, elapsed_result]
    )

with gr.Blocks() as app:
    gr.TabbedInterface(
        [object_detection_tab, image_segmentation_detection_tab, image_retrieval_tab],
        ["Object detection", "Image segmentation", "Retrieve image"],
    )

app.launch(share=True, debug=True)
app.close()