Spaces:

shengqiangShi
/

SV3

Sleeping

App Files Files Community

shengqiangShi committed on Mar 30, 2024

Commit

a64bccf

1 Parent(s): 6fa71e4

Add application file

Browse files

Files changed (2) hide show

app.py +65 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import gradio as gr
+from transformers import Owlv2Processor, Owlv2ForObjectDetection
+import spaces
+# Use GPU if available
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device)
+processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
+@spaces.GPU
+def query_image(img, text_queries, score_threshold):
+    text_queries = text_queries
+    text_queries = text_queries.split(",")
+    size = max(img.shape[:2])
+    target_sizes = torch.Tensor([[size, size]])
+    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    outputs.logits = outputs.logits.cpu()
+    outputs.pred_boxes = outputs.pred_boxes.cpu()
+    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)
+    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+    result_labels = []
+    for box, score, label in zip(boxes, scores, labels):
+        box = [int(i) for i in box.tolist()]
+        if score < score_threshold:
+            continue
+        result_labels.append((box, text_queries[label.item()]))
+    return img, result_labels
+# HTML/Markdown blurb passed to gr.Interface as the page description.
+description = """
+Try this demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlv2">OWLv2</a>,
+introduced in <a href="https://arxiv.org/abs/2306.09683">Scaling Open-Vocabulary Object Detection</a>.
+\n\n Compared to OWLVIT, OWLv2 performs better both in yield and performance (average precision).
+You can use OWLv2 to query images with text descriptions of any object.
+To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
+can also use the score threshold slider to set a threshold to filter out low probability predictions.
+\n\nOWL-ViT is trained on text templates,
+hence you can get better predictions by querying the image with text templates used in training the original model: e.g. *"photo of a star-spangled banner"*,
+*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
+\n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
+"""
+# Wire query_image into a Gradio UI. The three inputs map positionally onto
+# (img, text_queries, score_threshold); the slider spans 0..1 and starts at 0.1.
+demo = gr.Interface(
+    query_image,
+    inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1)],
+    outputs="annotatedimage",
+    title="Zero-Shot Object Detection with OWLv2",
+    description=description,
+    # Each example row is [image path, comma-separated queries, score threshold].
+    # NOTE(review): this commit adds only app.py and requirements.txt -- confirm
+    # that the assets/ images referenced below exist in the repository.
+    examples=[
+        ["assets/astronaut.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
+        ["assets/coffee.png", "coffee mug, spoon, plate", 0.1],
+        ["assets/butterflies.jpeg", "orange butterfly", 0.3],
+    ],
+)
+# Start the web server when the module is executed.
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+numpy>=1.18.5
+torch>=1.7.0
+torchvision>=0.8.1
+git+https://github.com/huggingface/transformers.git
+scipy
+spaces