import gradio as gr

from demo import query_image
|
description = """
Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
with Vision Transformers</a>.
\n\nYou can use OWL-ViT to query images with free-form text descriptions of any object.
To use it, simply upload an image and enter comma-separated text descriptions of the objects you want to find in the image. You
can also use the score threshold slider to filter out low-probability predictions.
\n\nOWL-ViT is trained on text templates,
so you can often get better predictions by querying the image with the text templates used during training, e.g. *"photo of a star-spangled banner"* or
*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper for the full list of text templates used to augment the training data.
\n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
"""
|
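# A minimal sketch of what `query_image` in demo.py might look like, assuming the
# OWL-ViT API from Hugging Face transformers (OwlViTProcessor /
# OwlViTForObjectDetection). The real implementation lives in demo.py; this
# version uses a hypothetical name so it does not shadow the import above.
def _query_image_sketch(img, text_queries, score_threshold=0.1):
    import torch
    from transformers import OwlViTProcessor, OwlViTForObjectDetection

    processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
    model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

    # The demo passes the queries in as a single comma-separated string;
    # OWL-ViT expects a list of text queries for the image.
    queries = [q.strip() for q in text_queries.split(",")]
    inputs = processor(text=queries, images=img, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Rescale the predicted boxes to the original image size and drop
    # predictions below the score threshold.
    target_sizes = torch.tensor([img.shape[:2]])  # gr.Image yields an H x W x C array
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=score_threshold, target_sizes=target_sizes
    )[0]
    return results["boxes"], results["scores"], results["labels"]
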
demo = gr.Interface(
    query_image,
    # The description above mentions a score threshold slider, so query_image is
    # assumed to take (image, comma-separated queries, score threshold) and to
    # return a single annotated image.
    inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1, label="Score threshold")],
    outputs="image",
    title="Zero-Shot Object Detection with OWL-ViT",
    description=description,
    examples=[
        ["./demo_images/cats.png", "cats,ears", 0.1],
        ["./demo_images/demo1.jpg", "bear,soil,sea", 0.1],
        ["./demo_images/demo2.jpg", "dog,ear,leg,eyes,tail", 0.1],
        ["./demo_images/tanager.jpg", "wing,eyes,back,legs,tail", 0.1],
    ],
)
|
# Bind to all interfaces so the demo is reachable from outside the host or
# container (e.g. when running in Docker or on Hugging Face Spaces).
demo.launch(server_name="0.0.0.0", debug=True)