sagar007 committed on
Commit 3cd1243 · verified · 1 Parent(s): 564688d

Update app.py

Files changed (1)
  1. app.py +38 -38
app.py CHANGED
@@ -2,73 +2,73 @@ import gradio as gr
 import torch
 import cv2
 import numpy as np
-from transformers import SamModel, SamProcessor
+from transformers import SamModel, SamProcessor, BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
 
 # Set up device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load model and processor
-model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
-processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+# Load SAM model and processor
+sam_model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
+sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+
+# Load BLIP model and processor for image-to-text
+blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
 
 def process_mask(mask, target_size):
-    # Ensure mask is 2D
     if mask.ndim > 2:
         mask = mask.squeeze()
-
-    # If mask is still not 2D, take the first 2D slice
     if mask.ndim > 2:
         mask = mask[0]
-
-    # Convert to binary
     mask = (mask > 0.5).astype(np.uint8) * 255
-
-    # Resize mask to match original image size using PIL
     mask_image = Image.fromarray(mask)
     mask_image = mask_image.resize(target_size, Image.NEAREST)
-
     return np.array(mask_image) > 0
 
-def segment_image(input_image, segment_anything):
+def segment_image(input_image, object_name):
     try:
         if input_image is None:
             return None, "Please upload an image before submitting."
 
-        # Convert input_image to PIL Image and ensure it's RGB
         input_image = Image.fromarray(input_image).convert("RGB")
-
-        # Store original size
         original_size = input_image.size
         if not original_size or 0 in original_size:
             return None, "Invalid image size. Please upload a different image."
 
-        # Process the image
-        if segment_anything:
-            inputs = processor(input_image, return_tensors="pt").to(device)
-        else:
-            width, height = original_size
-            center_point = [[width // 2, height // 2]]
-            inputs = processor(input_image, input_points=[center_point], return_tensors="pt").to(device)
+        # Generate image caption
+        blip_inputs = blip_processor(input_image, return_tensors="pt").to(device)
+        caption = blip_model.generate(**blip_inputs)
+        caption_text = blip_processor.decode(caption[0], skip_special_tokens=True)
+
+        # Process the image with SAM
+        sam_inputs = sam_processor(input_image, return_tensors="pt").to(device)
 
         # Generate masks
         with torch.no_grad():
-            outputs = model(**inputs)
+            sam_outputs = sam_model(**sam_inputs)
 
         # Post-process masks
-        masks = processor.image_processor.post_process_masks(
-            outputs.pred_masks.cpu(),
-            inputs["original_sizes"].cpu(),
-            inputs["reshaped_input_sizes"].cpu()
+        masks = sam_processor.image_processor.post_process_masks(
+            sam_outputs.pred_masks.cpu(),
+            sam_inputs["original_sizes"].cpu(),
+            sam_inputs["reshaped_input_sizes"].cpu()
         )
 
-        # Process the mask
-        if segment_anything:
-            combined_mask = np.any(masks[0].numpy() > 0.5, axis=0)
-        else:
-            combined_mask = masks[0][0].numpy()
+        # Find the mask that best matches the specified object
+        best_mask = None
+        best_score = -1
+        for mask in masks[0]:
+            mask_binary = mask.numpy() > 0.5
+            mask_area = mask_binary.sum()
+            if object_name.lower() in caption_text.lower() and mask_area > best_score:
+                best_mask = mask_binary
+                best_score = mask_area
+
+        if best_mask is None:
+            return input_image, f"Could not find '{object_name}' in the image."
 
-        combined_mask = process_mask(combined_mask, original_size)
+        combined_mask = process_mask(best_mask, original_size)
 
         # Overlay the mask on the original image
         result_image = np.array(input_image)
@@ -76,7 +76,7 @@ def segment_image(input_image, segment_anything):
         mask_rgb[combined_mask] = [255, 0, 0]  # Red color for the mask
         result_image = cv2.addWeighted(result_image, 1, mask_rgb, 0.5, 0)
 
-        return result_image, "Segmentation completed successfully."
+        return result_image, f"Segmented '{object_name}' in the image."
 
     except Exception as e:
         return None, f"An error occurred: {str(e)}"
@@ -86,14 +86,14 @@ iface = gr.Interface(
     fn=segment_image,
     inputs=[
         gr.Image(type="numpy", label="Upload an image"),
-        gr.Checkbox(label="Segment Everything")
+        gr.Textbox(label="Specify object to segment (e.g., dog, cat, grass)")
    ],
     outputs=[
         gr.Image(type="numpy", label="Segmented Image"),
         gr.Textbox(label="Status")
     ],
-    title="Segment Anything Model (SAM) Image Segmentation",
-    description="Upload an image and choose whether to segment everything or use a center point."
+    title="Segment Anything Model (SAM) with Object Specification",
+    description="Upload an image and specify an object to segment."
 )
 
 # Launch the interface
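
For reference, a minimal standalone sketch of the BLIP captioning step this commit introduces; it is not part of the diff above. The checkpoint name matches the one loaded in the commit, while the image path and the max_new_tokens cap are placeholder assumptions.

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same checkpoint as in the diff above
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder path
inputs = blip_processor(image, return_tensors="pt").to(device)
with torch.no_grad():
    out = blip_model.generate(**inputs, max_new_tokens=30)  # length cap is an assumption
print(blip_processor.decode(out[0], skip_special_tokens=True))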
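
And a self-contained sketch of the mask-selection heuristic the new segment_image uses: if the requested object name appears in the BLIP caption, the largest SAM mask by pixel area is kept; otherwise nothing is returned. The helper name pick_mask_for_object and the toy arrays are hypothetical, for illustration only.

import numpy as np

def pick_mask_for_object(masks, caption_text, object_name, threshold=0.5):
    # Mirrors the loop added in this commit: the caption only gates whether
    # any mask is returned; among the masks, the largest area wins.
    if object_name.lower() not in caption_text.lower():
        return None
    best_mask, best_area = None, -1
    for mask in masks:
        binary = np.asarray(mask) > threshold
        area = int(binary.sum())
        if area > best_area:
            best_mask, best_area = binary, area
    return best_mask

# Toy check: the caption mentions "dog" and the second mask has the larger area (4 pixels).
toy_masks = [np.zeros((4, 4)), np.pad(np.ones((2, 2)), 1)]
picked = pick_mask_for_object(toy_masks, "a dog on the grass", "dog")
print(picked is not None, int(picked.sum()))  # True 4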