from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import supervision as sv
import cv2
import numpy as np
from PIL import Image
import gradio as gr
import spaces

# Shared supervision annotators, created once and reused across requests
BOX_ANNOTATOR = sv.BoxAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "google/paligemma2-3b-pt-448"
# Load the weights in bfloat16 so the model's dtype matches the bfloat16 inputs below
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval().to(DEVICE)
processor = PaliGemmaProcessor.from_pretrained(model_id)



@spaces.GPU
def process_image(input_image, input_text, class_names):
    # Strip whitespace so "person, dog" still matches the detected labels
    class_list = [name.strip() for name in class_names.split(",")]
    # Keep a BGR copy for the OpenCV-based supervision annotators
    cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
    model_inputs = processor(text=input_text, images=input_image, return_tensors="pt").to(torch.bfloat16).to(model.device)
    input_len = model_inputs["input_ids"].shape[-1]

    with torch.inference_mode():
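        # Greedy decoding (do_sample=False); slice off the prompt so only new tokens are decoded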
        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
        generation = generation[0][input_len:]
        result = processor.decode(generation, skip_special_tokens=True)

    # Parse the model's "<loc....> label" output string into a supervision Detections object
    detections = sv.Detections.from_lmm(
        sv.LMM.PALIGEMMA,
        result,
        resolution_wh=(input_image.width, input_image.height),
        classes=class_list
    )

    # Overlay boxes, labels, and (for segmentation outputs) masks on the BGR copy
    annotated_image = BOX_ANNOTATOR.annotate(
        scene=cv_image.copy(),
        detections=detections
    )
    annotated_image = LABEL_ANNOTATOR.annotate(
        scene=annotated_image,
        detections=detections
    )
    annotated_image = MASK_ANNOTATOR.annotate(
        scene=annotated_image,
        detections=detections
    )

    # Convert BGR back to RGB and wrap as a PIL image for Gradio display
    annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    annotated_image = Image.fromarray(annotated_image)

    return annotated_image, result

with gr.Blocks() as app:
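    # Header: project links and a short model description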
    gr.Markdown( """
    ## PaliGemma 2 Detection with Supervision - Demo \n\n

    <div style="display: flex; gap: 10px;">
    <a href="https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md">
        <img src="https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white" alt="Github">
    </a>
    <a href="https://huggingface.co/blog/paligemma">
        <img src="https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black" alt="Huggingface">
    </a>
    <a href="https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab">
    </a>
    <a href="https://arxiv.org/abs/2412.03555">
        <img src="https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white" alt="Paper">
    </a>
    <a href="https://supervision.roboflow.com/">
        <img src="https://img.shields.io/badge/Supervision-6706CE?style=flat&logo=Roboflow&logoColor=white" alt="Supervision">
    </a>
    </div>

    \n\n
    PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and 
    built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343) 
    vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile 
    model for transfer to a wide range of vision-language tasks such as image and short video captioning, visual question 
    answering, text reading, object detection, and object segmentation.

    This space shows how to use PaliGemma 2 for object detection with supervision.
    Provide an image, a prompt such as `detect person ; dog`, and the matching class names, then press Submit to see the annotated detections.
    """)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Prompt (e.g. 'detect person ; dog')")
            class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names (comma-separated, matching the prompt)")
        with gr.Column():
            annotated_image = gr.Image(type="pil", label="Annotated Image")
            detection_result = gr.Textbox(label="Detection Result")
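    # Wire the submit button to the inference function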
    gr.Button("Submit").click(
        fn=process_image,
        inputs=[input_image, input_text, class_names],
        outputs=[annotated_image, detection_result]
    )

if __name__ == "__main__":
    app.launch()