Spaces:

sagar007
/

SegmentVision

Sleeping

App Files Files Community

SegmentVision / app.py

sagar007

Update app.py

26c0f04 verified 11 months ago

raw

history blame

4.83 kB

	import gradio as gr
	import torch
	import cv2
	import numpy as np
	from transformers import SamModel, SamProcessor, BlipProcessor, BlipForConditionalGeneration
	from PIL import Image
	from scipy.ndimage import label, center_of_mass

	# Set up device
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load SAM model and processor
	sam_model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
	sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

	# Load BLIP model and processor for image-to-text
	blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
	blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

	def process_mask(mask, target_size):
	if mask.ndim > 2:
	mask = mask.squeeze()
	if mask.ndim > 2:
	mask = mask[0]
	mask = (mask > 0.5).astype(np.uint8) * 255
	mask_image = Image.fromarray(mask)
	mask_image = mask_image.resize(target_size, Image.NEAREST)
	return np.array(mask_image) > 0

	def is_cat_like(mask, image_area):
	labeled, num_features = label(mask)
	if num_features == 0:
	return False

	largest_component = (labeled == (np.bincount(labeled.flatten())[1:].argmax() + 1))
	area = largest_component.sum()

	# Check if the area is reasonable for a cat (between 5% and 30% of image)
	if not (0.05 * image_area < area < 0.3 * image_area):
	return False

	# Check if the shape is roughly elliptical
	cy, cx = center_of_mass(largest_component)
	major_axis = max(largest_component.shape)
	minor_axis = min(largest_component.shape)
	aspect_ratio = major_axis / minor_axis

	return 1.5 < aspect_ratio < 3 # Most cats have an aspect ratio in this range

	def segment_image(input_image, object_name):
	try:
	if input_image is None:
	return None, "Please upload an image before submitting."

	input_image = Image.fromarray(input_image).convert("RGB")
	original_size = input_image.size
	if not original_size or 0 in original_size:
	return None, "Invalid image size. Please upload a different image."

	# Generate detailed image caption
	blip_inputs = blip_processor(input_image, return_tensors="pt").to(device)
	caption = blip_model.generate(**blip_inputs, max_length=50)
	caption_text = blip_processor.decode(caption[0], skip_special_tokens=True)

	# Process the image with SAM
	sam_inputs = sam_processor(input_image, return_tensors="pt").to(device)

	# Generate masks
	with torch.no_grad():
	sam_outputs = sam_model(**sam_inputs)

	# Post-process masks
	masks = sam_processor.image_processor.post_process_masks(
	sam_outputs.pred_masks.cpu(),
	sam_inputs["original_sizes"].cpu(),
	sam_inputs["reshaped_input_sizes"].cpu()
	)

	# Find the mask that best matches the specified object
	best_mask = None
	best_score = -1
	image_area = original_size[0] * original_size[1]

	cat_related_words = ['cat', 'kitten', 'feline', 'tabby', 'kitty']
	caption_contains_cat = any(word in caption_text.lower() for word in cat_related_words)

	for mask in masks[0]:
	mask_binary = mask.numpy() > 0.5
	if is_cat_like(mask_binary, image_area) and caption_contains_cat:
	mask_area = mask_binary.sum()
	if mask_area > best_score:
	best_mask = mask_binary
	best_score = mask_area

	if best_mask is None:
	return input_image, f"Could not find a suitable '{object_name}' in the image."

	combined_mask = process_mask(best_mask, original_size)

	# Overlay the mask on the original image
	result_image = np.array(input_image)
	mask_rgb = np.zeros_like(result_image)
	mask_rgb[combined_mask] = [255, 0, 0] # Red color for the mask
	result_image = cv2.addWeighted(result_image, 1, mask_rgb, 0.5, 0)

	return result_image, f"Segmented '{object_name}' in the image."

	except Exception as e:
	return None, f"An error occurred: {str(e)}"

	# Create Gradio interface
	iface = gr.Interface(
	fn=segment_image,
	inputs=[
	gr.Image(type="numpy", label="Upload an image"),
	gr.Textbox(label="Specify object to segment (e.g., dog, cat, grass)")
	],
	outputs=[
	gr.Image(type="numpy", label="Segmented Image"),
	gr.Textbox(label="Status")
	],
	title="Segment Anything Model (SAM) with Object Specification",
	description="Upload an image and specify an object to segment."
	)

	# Launch the interface
	iface.launch()