Spaces:

ya02
/

object_detection

Sleeping

App Files Files Community

object_detection / app.py

ya02

Create app.py

342bbfd verified 11 months ago

raw

history blame contribute delete

2.15 kB

	import torch
	from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
	from PIL import Image, ImageDraw
	import gradio as gr

	# Name of the pre-trained checkpoint; the same value is handed to both
	# from_pretrained() calls below so processor and model stay in sync.
	checkpoint = "google/owlvit-base-patch32"

	# Load the processor and the model once at import time; detect_objects()
	# reads these module-level objects on every call.
	processor = AutoProcessor.from_pretrained(checkpoint)
	model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint)

	def _parse_queries(text_queries):
	    """Turn user-supplied queries into a list of non-empty, stripped strings.

	    Accepts a single comma-separated string (what the Gradio textbox sends)
	    or an already-split iterable of queries. ``None`` yields an empty list.
	    """
	    if text_queries is None:
	        return []
	    if isinstance(text_queries, str):
	        text_queries = text_queries.split(",")
	    stripped = (str(query).strip() for query in text_queries)
	    return [query for query in stripped if query]


	def detect_objects(image, text_queries):
	    """Run zero-shot object detection and draw the detections on the image.

	    Args:
	        image: A PIL image, or a path string that is opened with
	            ``Image.open``. ``None`` (nothing uploaded) is returned as-is.
	        text_queries: Comma-separated query string, or an iterable of
	            query strings. Blank entries are ignored.

	    Returns:
	        A copy of the image with a red box and a ``"<query>: <score>"``
	        caption per detection. If there is no image, ``None`` is returned;
	        if no usable query was given, the image is returned unannotated.
	    """
	    # Gradio passes None when the user submits without uploading an image.
	    if image is None:
	        return None

	    # Convert image to PIL Image format if not already
	    if isinstance(image, str):
	        image = Image.open(image)

	    # The textbox delivers ONE string; split it so each query is scored
	    # separately and label indices map back to whole queries rather than
	    # to single characters of the raw string.
	    queries = _parse_queries(text_queries)
	    if not queries:
	        return image

	    # Prepare inputs for zero-shot object detection
	    inputs = processor(images=image, text=queries, return_tensors="pt")

	    # Perform inference with the model (no gradients needed)
	    with torch.no_grad():
	        outputs = model(**inputs)
	        # PIL reports size as (width, height); reverse it to the
	        # (height, width) order used for target_sizes.
	        target_sizes = torch.tensor([image.size[::-1]])
	        results = processor.post_process_object_detection(
	            outputs, threshold=0.1, target_sizes=target_sizes
	        )[0]

	    # Draw on a copy so the caller's image object is left untouched.
	    annotated = image.copy()
	    draw = ImageDraw.Draw(annotated)

	    # Extract detection results (scores, labels, and bounding boxes)
	    scores = results["scores"].tolist()
	    labels = results["labels"].tolist()
	    boxes = results["boxes"].tolist()

	    # Iterate over detected objects and draw bounding boxes and labels
	    for box, score, label in zip(boxes, scores, labels):
	        xmin, ymin, xmax, ymax = box
	        draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
	        draw.text((xmin, ymin), f"{queries[label]}: {round(score, 2)}", fill="black")

	    return annotated

	# Gradio Interface: declare the two input widgets, wire them to
	# detect_objects, and start the app.
	image_input = gr.Image(type="pil", label="Upload an Image")
	query_input = gr.Textbox(
	    lines=2,
	    placeholder="Enter text queries separated by commas...",
	    label="Text Queries",
	)

	demo = gr.Interface(
	    fn=detect_objects,
	    inputs=[image_input, query_input],
	    outputs=gr.Image(label="Detected Objects"),
	    title="AI Workshop Zero-Shot Object Detection",
	    description="Upload an image and provide text queries to perform zero-shot object detection using a pre-trained model. The model identifies objects based on the queries you provide.",
	)
	demo.launch()