import gradio as gr
import torch
from PIL import Image
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from ultralytics import FastSAM
import supervision as sv
from huggingface_hub import hf_hub_download
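# Runtime dependencies implied by the imports above (pip package names only; exact
# versions are an assumption and should be pinned in requirements.txt as needed):
#   pip install gradio torch pillow opencv-python numpy transformers ultralytics supervision huggingface_hub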

# Load CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download and load FastSAM model
model_path = hf_hub_download("Jiawei-Yang/FastSAM-x", filename="FastSAM-x.pt")
fast_sam = FastSAM(model_path)

def process_image_clip(image, text_input):
    if image is None:
        return "Please upload an image first."
    if not text_input:
        return "Please enter a text description to check."

    # Compare the user's prompt against a generic negative prompt so the softmax
    # produces a meaningful score (with a single prompt it would always be 100%).
    prompts = [f"a photo of {text_input}", "a photo of something else"]

    # Process image and text for CLIP
    inputs = processor(
        images=image,
        text=prompts,
        return_tensors="pt",
        padding=True
    )

    # Get model predictions without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

    confidence = float(probs[0][0])
    return f"Confidence that the image contains '{text_input}': {confidence:.2%}"

def process_image_fastsam(image):
    if image is None:
        return None

    # Convert the PIL image (RGB) to a BGR numpy array, the channel order
    # Ultralytics expects for numpy inputs
    image_np = np.array(image.convert("RGB"))
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Run FastSAM inference on CPU
    results = fast_sam(image_bgr, device='cpu', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)

    # Wrap the result in a supervision Detections object
    detections = sv.Detections.from_ultralytics(results[0])

    # Create annotators for boxes and masks
    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()

    # Draw masks first, then bounding boxes on top
    annotated_image = mask_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)

    # Convert back to RGB for display in Gradio
    return Image.fromarray(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # CLIP and FastSAM Demo
    This demo combines two powerful AI models:
    - **CLIP**: For zero-shot image classification
    - **FastSAM**: For automatic image segmentation
    
    Try uploading an image and use either of the tabs below!
    """)
    
    with gr.Tab("CLIP Zero-Shot Classification"):
        with gr.Row():
            image_input = gr.Image(type="pil", label="Input Image")
            text_input = gr.Textbox(
                label="What do you want to check in the image?",
                placeholder="e.g., 'a dog', 'sunset', 'people playing'"
            )
        output_text = gr.Textbox(label="Result")
        classify_btn = gr.Button("Classify")
        classify_btn.click(fn=process_image_clip, inputs=[image_input, text_input], outputs=output_text)
    
    with gr.Tab("FastSAM Segmentation"):
        with gr.Row():
            image_input_sam = gr.Image(type="pil", label="Input Image")
            image_output = gr.Image(type="pil", label="Segmentation Result")
        segment_btn = gr.Button("Segment")
        segment_btn.click(fn=process_image_fastsam, inputs=[image_input_sam], outputs=image_output)
    
    gr.Markdown("""
    ### How to use:
    1. **CLIP Classification**: Upload an image and enter text to check if that concept exists in the image
    2. **FastSAM Segmentation**: Upload an image to get automatic segmentation with bounding boxes and masks
    """)

demo.launch()
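
# Note (optional local-run sketch, not part of the original Space): on Hugging Face
# Spaces the plain launch above is sufficient; when running locally you could pass
# standard Gradio launch options instead, e.g.
#   demo.launch(server_name="0.0.0.0", server_port=7860, share=True)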