Tobias Cornille committed on
Commit
d24572a
·
unverified ·
1 Parent(s): d688b67

Update GroundingDINO

Browse files
Files changed (37) hide show
  1. GroundingDINO/LICENSE +1 -1
  2. GroundingDINO/README.md +245 -41
  3. GroundingDINO/groundingdino/config/{GroundingDINO_SwinB.cfg.py → GroundingDINO_SwinB_cfg.py} +0 -0
  4. GroundingDINO/groundingdino/config/__init__.py +0 -0
  5. GroundingDINO/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  6. GroundingDINO/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
  7. GroundingDINO/groundingdino/datasets/cocogrounding_eval.py +269 -0
  8. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc +0 -0
  9. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc +0 -0
  10. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc +0 -0
  11. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc +0 -0
  12. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc +0 -0
  13. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc +0 -0
  14. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc +0 -0
  15. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc +0 -0
  16. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
  17. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc +0 -0
  18. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc +0 -0
  19. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc +0 -0
  20. GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py +25 -8
  21. GroundingDINO/groundingdino/models/__pycache__/__init__.cpython-310.pyc +0 -0
  22. GroundingDINO/groundingdino/models/__pycache__/registry.cpython-310.pyc +0 -0
  23. GroundingDINO/groundingdino/util/__pycache__/__init__.cpython-310.pyc +0 -0
  24. GroundingDINO/groundingdino/util/__pycache__/box_ops.cpython-310.pyc +0 -0
  25. GroundingDINO/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc +0 -0
  26. GroundingDINO/groundingdino/util/__pycache__/inference.cpython-310.pyc +0 -0
  27. GroundingDINO/groundingdino/util/__pycache__/misc.cpython-310.pyc +0 -0
  28. GroundingDINO/groundingdino/util/__pycache__/slconfig.cpython-310.pyc +0 -0
  29. GroundingDINO/groundingdino/util/__pycache__/utils.cpython-310.pyc +0 -0
  30. GroundingDINO/groundingdino/util/__pycache__/visualizer.cpython-310.pyc +0 -0
  31. GroundingDINO/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc +0 -0
  32. GroundingDINO/groundingdino/util/get_tokenlizer.py +5 -2
  33. GroundingDINO/groundingdino/util/inference.py +180 -7
  34. GroundingDINO/groundingdino/util/slconfig.py +2 -2
  35. GroundingDINO/groundingdino/util/utils.py +3 -1
  36. GroundingDINO/requirements.txt +2 -2
  37. GroundingDINO/setup.py +13 -1
GroundingDINO/LICENSE CHANGED
@@ -186,7 +186,7 @@
186
  same "printed page" as the copyright notice for easier
187
  identification within third-party archives.
188
 
189
- Copyright 2020 - present, Facebook, Inc
190
 
191
  Licensed under the Apache License, Version 2.0 (the "License");
192
  you may not use this file except in compliance with the License.
 
186
  same "printed page" as the copyright notice for easier
187
  identification within third-party archives.
188
 
189
+ Copyright 2023 - present, IDEA Research.
190
 
191
  Licensed under the Apache License, Version 2.0 (the "License");
192
  you may not use this file except in compliance with the License.
GroundingDINO/README.md CHANGED
@@ -1,78 +1,269 @@
1
- # Grounding DINO
 
 
2
 
3
- ---
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  [![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
6
- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8)
7
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)
8
- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
 
 
9
  [![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) \
12
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
13
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) \
14
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)
15
 
16
 
17
 
18
- Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now!
19
 
20
 
21
- ## Highlight
22
 
23
  - **Open-Set Detection.** Detect **everything** with language!
24
- - **High Performancce.** COCO zero-shot **52.5 AP** (training without COCO data!). COCO fine-tune **63.0 AP**.
25
  - **Flexible.** Collaboration with Stable Diffusion for Image Editting.
26
 
27
- ## News
28
- [2023/03/28] A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)] \
29
- [2023/03/28] Add a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space! \
30
- [2023/03/27] Support CPU-only mode. Now the model can run on machines without GPUs.\
31
- [2023/03/25] A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available at Colab. [[SkalskiP](https://github.com/SkalskiP)] \
32
- [2023/03/22] Code is available Now!
 
 
 
 
 
 
 
 
 
33
 
34
  <details open>
35
  <summary><font size="4">
36
  Description
37
  </font></summary>
 
38
  <img src=".asset/hero_figure.png" alt="ODinW" width="100%">
 
 
39
  </details>
40
 
 
 
 
 
 
 
 
 
 
 
41
 
42
-
43
- ## TODO
44
 
45
  - [x] Release inference code and demo.
46
  - [x] Release checkpoints.
47
- - [ ] Grounding DINO with Stable Diffusion and GLIGEN demos.
48
  - [ ] Release training codes.
49
 
50
- ## Install
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- If you have a CUDA environment, please make sure the environment variable `CUDA_HOME` is set. It will be compiled under CPU-only mode if no CUDA available.
53
 
54
  ```bash
55
  pip install -e .
56
  ```
57
 
58
- ## Demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
 
60
  ```bash
61
- CUDA_VISIBLE_DEVICES=6 python demo/inference_on_a_image.py \
62
- -c /path/to/config \
63
- -p /path/to/checkpoint \
64
- -i .asset/cats.png \
65
- -o "outputs/0" \
66
- -t "cat ear." \
67
- [--cpu-only] # open it for cpu mode
 
68
  ```
 
 
69
  See the `demo/inference_on_a_image.py` for more details.
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  **Web UI**
72
 
73
  We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See the file `demo/gradio_app.py` for more details.
74
 
75
- ## Checkpoints
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  <!-- insert a table -->
78
  <table>
@@ -94,13 +285,22 @@ We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See
94
  <td>Swin-T</td>
95
  <td>O365,GoldG,Cap4M</td>
96
  <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
97
- <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">Github link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
98
  <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
99
  </tr>
 
 
 
 
 
 
 
 
 
100
  </tbody>
101
  </table>
102
 
103
- ## Results
104
 
105
  <details open>
106
  <summary><font size="4">
@@ -120,24 +320,27 @@ ODinW Object Detection Results
120
  <summary><font size="4">
121
  Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
122
  </font></summary>
 
123
  <img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
124
  </details>
125
 
 
126
  <details open>
127
  <summary><font size="4">
128
- Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing
129
  </font></summary>
 
130
  <img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
131
  </details>
132
 
133
- ## Model
134
 
135
  Includes: a text backbone, an image backbone, a feature enhancer, a language-guided query selection, and a cross-modality decoder.
136
 
137
  ![arch](.asset/arch.png)
138
 
139
 
140
- ## Acknowledgement
141
 
142
  Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!
143
 
@@ -146,14 +349,15 @@ We also thank great previous work including DETR, Deformable DETR, SMCA, Conditi
146
  Thanks [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.
147
 
148
 
149
- ## Citation
150
 
151
  If you find our work helpful for your research, please consider citing the following BibTeX entry.
152
 
153
  ```bibtex
154
- @inproceedings{ShilongLiu2023GroundingDM,
155
- title={Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection},
156
- author={Shilong Liu and Zhaoyang Zeng and Tianhe Ren and Feng Li and Hao Zhang and Jie Yang and Chunyuan Li and Jianwei Yang and Hang Su and Jun Zhu and Lei Zhang},
 
157
  year={2023}
158
  }
159
  ```
 
1
+ <div align="center">
2
+ <img src="./.asset/grounding_dino_logo.png" width="30%">
3
+ </div>
4
 
5
+ # :sauropod: Grounding DINO
6
 
7
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
8
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)
9
+
10
+
11
+ **[IDEA-CVR, IDEA-Research](https://github.com/IDEA-Research)**
12
+
13
+ [Shilong Liu](http://www.lsl.zone/), [Zhaoyang Zeng](https://scholar.google.com/citations?user=U_cvvUwAAAAJ&hl=zh-CN&oi=ao), [Tianhe Ren](https://rentainhe.github.io/), [Feng Li](https://scholar.google.com/citations?user=ybRe9GcAAAAJ&hl=zh-CN), [Hao Zhang](https://scholar.google.com/citations?user=B8hPxMQAAAAJ&hl=zh-CN), [Jie Yang](https://github.com/yangjie-cv), [Chunyuan Li](https://scholar.google.com/citations?user=Zd7WmXUAAAAJ&hl=zh-CN&oi=ao), [Jianwei Yang](https://jwyang.github.io/), [Hang Su](https://scholar.google.com/citations?hl=en&user=dxN1_X0AAAAJ&view_op=list_works&sortby=pubdate), [Jun Zhu](https://scholar.google.com/citations?hl=en&user=axsP38wAAAAJ), [Lei Zhang](https://www.leizhang.org/)<sup>:email:</sup>.
14
+
15
+
16
+ [[`Paper`](https://arxiv.org/abs/2303.05499)] [[`Demo`](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)] [[`BibTex`](#black_nib-citation)]
17
+
18
+
19
+ PyTorch implementation and pretrained models for Grounding DINO. For details, see the paper **[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)**.
20
+
21
+ ## :sun_with_face: Helpful Tutorial
22
+
23
+ - :grapes: [[Read our arXiv Paper](https://arxiv.org/abs/2303.05499)]
24
+ - :apple: [[Watch our simple introduction video on YouTube](https://youtu.be/wxWDt5UiwY8)]
25
+ - :blossom: &nbsp;[[Try the Colab Demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)]
26
+ - :sunflower: [[Try our Official Huggingface Demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)]
27
+ - :maple_leaf: [[Watch the Step by Step Tutorial about GroundingDINO by Roboflow AI](https://youtu.be/cMa77r3YrDk)]
28
+ - :mushroom: [[GroundingDINO: Automated Dataset Annotation and Evaluation by Roboflow AI](https://youtu.be/C4NqaRBz_Kw)]
29
+ - :hibiscus: [[Accelerate Image Annotation with SAM and GroundingDINO by Roboflow AI](https://youtu.be/oEQYStnF2l8)]
30
+ - :white_flower: [[Autodistill: Train YOLOv8 with ZERO Annotations based on Grounding-DINO and Grounded-SAM by Roboflow AI](https://github.com/autodistill/autodistill)]
31
+
32
+ <!-- Grounding DINO Methods |
33
  [![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
34
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8) -->
35
+
36
+ <!-- Grounding DINO Demos |
37
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) -->
38
+ <!-- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
39
  [![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)
40
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/oEQYStnF2l8)
41
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/C4NqaRBz_Kw) -->
42
+
43
+ ## :sparkles: Highlight Projects
44
+
45
+ - [Semantic-SAM: a universal image segmentation model to enable segment and recognize anything at any desired granularity.](https://github.com/UX-Decoder/Semantic-SAM)
46
+ - [DetGPT: Detect What You Need via Reasoning](https://github.com/OptimalScale/DetGPT)
47
+ - [Grounded-SAM: Marrying Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)
48
+ - [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb)
49
+ - [Grounding DINO with GLIGEN for Controllable Image Editing](demo/image_editing_with_groundingdino_gligen.ipynb)
50
+ - [OpenSeeD: A Simple and Strong Openset Segmentation Model](https://github.com/IDEA-Research/OpenSeeD)
51
+ - [SEEM: Segment Everything Everywhere All at Once](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once)
52
+ - [X-GPT: Conversational Visual Agent supported by X-Decoder](https://github.com/microsoft/X-Decoder/tree/xgpt)
53
+ - [GLIGEN: Open-Set Grounded Text-to-Image Generation](https://github.com/gligen/GLIGEN)
54
+ - [LLaVA: Large Language and Vision Assistant](https://github.com/haotian-liu/LLaVA)
55
 
56
+ <!-- Extensions | [Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything); [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb); [Grounding DINO with GLIGEN](demo/image_editing_with_groundingdino_gligen.ipynb) -->
 
 
 
57
 
58
 
59
 
60
+ <!-- Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now! -->
61
 
62
 
63
+ ## :bulb: Highlight
64
 
65
  - **Open-Set Detection.** Detect **everything** with language!
66
+ - **High Performance.** COCO zero-shot **52.5 AP** (training without COCO data!). COCO fine-tune **63.0 AP**.
67
  - **Flexible.** Collaboration with Stable Diffusion for Image Editing.
68
 
69
+
70
+
71
+
72
+ ## :fire: News
73
+ - **`2023/07/18`**: We release [Semantic-SAM](https://github.com/UX-Decoder/Semantic-SAM), a universal image segmentation model to enable segment and recognize anything at any desired granularity. **Code** and **checkpoint** are available!
74
+ - **`2023/06/17`**: We provide an example to evaluate Grounding DINO on COCO zero-shot performance.
75
+ - **`2023/04/15`**: Refer to [CV in the Wild Readings](https://github.com/Computer-Vision-in-the-Wild/CVinW_Readings) for those who are interested in open-set recognition!
76
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editing.
77
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editing.
78
+ - **`2023/04/06`**: We build a new demo by marrying GroundingDINO with [Segment-Anything](https://github.com/facebookresearch/segment-anything), named **[Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)**, which aims to support segmentation in GroundingDINO.
79
+ - **`2023/03/28`**: A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)]
80
+ - **`2023/03/28`**: Add a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space!
81
+ - **`2023/03/27`**: Support CPU-only mode. Now the model can run on machines without GPUs.
82
+ - **`2023/03/25`**: A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available at Colab. [[SkalskiP](https://github.com/SkalskiP)]
83
+ - **`2023/03/22`**: Code is available Now!
84
 
85
  <details open>
86
  <summary><font size="4">
87
  Description
88
  </font></summary>
89
+ <a href="https://arxiv.org/abs/2303.05499">Paper</a> introduction.
90
  <img src=".asset/hero_figure.png" alt="ODinW" width="100%">
91
+ Marrying <a href="https://github.com/IDEA-Research/GroundingDINO">Grounding DINO</a> and <a href="https://github.com/gligen/GLIGEN">GLIGEN</a>
92
+ <img src="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/GD_GLIGEN.png" alt="gd_gligen" width="100%">
93
  </details>
94
 
95
+ ## :star: Explanations/Tips for Grounding DINO Inputs and Outputs
96
+ - Grounding DINO accepts an `(image, text)` pair as inputs.
97
+ - It outputs `900` (by default) object boxes. Each box has similarity scores across all input words. (as shown in Figures below.)
98
+ - By default, we choose the boxes whose highest similarity is higher than the `box_threshold`.
99
+ - We extract the words whose similarities are higher than the `text_threshold` as predicted labels.
100
+ - If you want to obtain objects of specific phrases, like the `dogs` in the sentence `two dogs with a stick.`, you can select the boxes with highest text similarities with `dogs` as final outputs.
101
+ - Note that each word can be split into **more than one** token by different tokenizers. The number of words in a sentence may not equal the number of text tokens.
102
+ - We suggest separating different category names with `.` for Grounding DINO.
103
+ ![model_explain1](.asset/model_explan1.PNG)
104
+ ![model_explain2](.asset/model_explan2.PNG)
105
 
106
+ ## :label: TODO
 
107
 
108
  - [x] Release inference code and demo.
109
  - [x] Release checkpoints.
110
+ - [x] Grounding DINO with Stable Diffusion and GLIGEN demos.
111
  - [ ] Release training codes.
112
 
113
+ ## :hammer_and_wrench: Install
114
+
115
+ **Note:**
116
+
117
+ 0. If you have a CUDA environment, please make sure the environment variable `CUDA_HOME` is set. It will be compiled under CPU-only mode if no CUDA available.
118
+
119
+ Please make sure to follow the installation steps strictly; otherwise the program may produce:
120
+ ```bash
121
+ NameError: name '_C' is not defined
122
+ ```
123
+
124
+ If this happens, please reinstall GroundingDINO by re-cloning the repository and repeating all the installation steps.
125
+
126
+ #### how to check cuda:
127
+ ```bash
128
+ echo $CUDA_HOME
129
+ ```
130
+ If it prints nothing, then you haven't set up the path.
131
+
132
+ Run this so the environment variable will be set under current shell.
133
+ ```bash
134
+ export CUDA_HOME=/path/to/cuda-11.3
135
+ ```
136
+
137
+ Note that the CUDA version should match your CUDA runtime, since multiple CUDA versions may be installed at the same time.
138
+
139
+ If you want to set the CUDA_HOME permanently, store it using:
140
+
141
+ ```bash
142
+ echo 'export CUDA_HOME=/path/to/cuda' >> ~/.bashrc
143
+ ```
144
+ after that, source the bashrc file and check CUDA_HOME:
145
+ ```bash
146
+ source ~/.bashrc
147
+ echo $CUDA_HOME
148
+ ```
149
+
150
+ In this example, /path/to/cuda-11.3 should be replaced with the path where your CUDA toolkit is installed. You can find this by typing **which nvcc** in your terminal:
151
+
152
+ For instance,
153
+ if the output is /usr/local/cuda/bin/nvcc, then:
154
+ ```bash
155
+ export CUDA_HOME=/usr/local/cuda
156
+ ```
157
+ **Installation:**
158
+
159
+ 1. Clone the GroundingDINO repository from GitHub.
160
+
161
+ ```bash
162
+ git clone https://github.com/IDEA-Research/GroundingDINO.git
163
+ ```
164
+
165
+ 2. Change the current directory to the GroundingDINO folder.
166
+
167
+ ```bash
168
+ cd GroundingDINO/
169
+ ```
170
 
171
+ 3. Install the required dependencies in the current directory.
172
 
173
  ```bash
174
  pip install -e .
175
  ```
176
 
177
+ 4. Download pre-trained model weights.
178
+
179
+ ```bash
180
+ mkdir weights
181
+ cd weights
182
+ wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
183
+ cd ..
184
+ ```
185
+
186
+ ## :arrow_forward: Demo
187
+ Check your GPU ID (only if you're using a GPU)
188
+
189
+ ```bash
190
+ nvidia-smi
191
+ ```
192
+ Replace `{GPU ID}`, `image_you_want_to_detect.jpg`, and `"dir you want to save the output"` with appropriate values in the following command
193
+ ```bash
194
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
195
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
196
+ -p weights/groundingdino_swint_ogc.pth \
197
+ -i image_you_want_to_detect.jpg \
198
+ -o "dir you want to save the output" \
199
+ -t "chair" \
200
+ [--cpu-only] # open it for cpu mode
201
+ ```
202
 
203
+ If you would like to specify the phrases to detect, here is a demo:
204
  ```bash
205
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
206
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
207
+ -p ./groundingdino_swint_ogc.pth \
208
+ -i .asset/cat_dog.jpeg \
209
+ -o logs/1111 \
210
+ -t "There is a cat and a dog in the image ." \
211
+ --token_spans "[[[9, 10], [11, 14]], [[19, 20], [21, 24]]]" \
212
+ [--cpu-only] # open it for cpu mode
213
  ```
214
+ The token_spans specify the start and end positions of each phrase. For example, the first phrase is `[[9, 10], [11, 14]]`. `"There is a cat and a dog in the image ."[9:10] = 'a'`, `"There is a cat and a dog in the image ."[11:14] = 'cat'`. Hence it refers to the phrase `a cat` . Similarly, the `[[19, 20], [21, 24]]` refers to the phrase `a dog`.
215
+
216
  See the `demo/inference_on_a_image.py` for more details.
217
 
218
+ **Running with Python:**
219
+
220
+ ```python
221
+ from groundingdino.util.inference import load_model, load_image, predict, annotate
222
+ import cv2
223
+
224
+ model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
225
+ IMAGE_PATH = "weights/dog-3.jpeg"
226
+ TEXT_PROMPT = "chair . person . dog ."
227
+ BOX_TRESHOLD = 0.35
228
+ TEXT_TRESHOLD = 0.25
229
+
230
+ image_source, image = load_image(IMAGE_PATH)
231
+
232
+ boxes, logits, phrases = predict(
233
+ model=model,
234
+ image=image,
235
+ caption=TEXT_PROMPT,
236
+ box_threshold=BOX_TRESHOLD,
237
+ text_threshold=TEXT_TRESHOLD
238
+ )
239
+
240
+ annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
241
+ cv2.imwrite("annotated_image.jpg", annotated_frame)
242
+ ```
243
  **Web UI**
244
 
245
  We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See the file `demo/gradio_app.py` for more details.
246
 
247
+ **Notebooks**
248
+
249
+ - We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editing.
250
+ - We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editing.
251
+
252
+ ## COCO Zero-shot Evaluations
253
+
254
+ We provide an example to evaluate Grounding DINO zero-shot performance on COCO. The results should be **48.5**.
255
+
256
+ ```bash
257
+ CUDA_VISIBLE_DEVICES=0 \
258
+ python demo/test_ap_on_coco.py \
259
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
260
+ -p weights/groundingdino_swint_ogc.pth \
261
+ --anno_path /path/to/annotations/ie/instances_val2017.json \
262
+ --image_dir /path/to/imagedir/ie/val2017
263
+ ```
264
+
265
+
266
+ ## :luggage: Checkpoints
267
 
268
  <!-- insert a table -->
269
  <table>
 
285
  <td>Swin-T</td>
286
  <td>O365,GoldG,Cap4M</td>
287
  <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
288
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
289
  <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
290
  </tr>
291
+ <tr>
292
+ <th>2</th>
293
+ <td>GroundingDINO-B</td>
294
+ <td>Swin-B</td>
295
+ <td>COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO</td>
296
+ <td>56.7 </td>
297
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth">HF link</a></td>
298
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinB_cfg.py">link</a></td>
299
+ </tr>
300
  </tbody>
301
  </table>
302
 
303
+ ## :medal_military: Results
304
 
305
  <details open>
306
  <summary><font size="4">
 
320
  <summary><font size="4">
321
  Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
322
  </font></summary>
323
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_stablediffusion.ipynb">notebook</a> for more details.
324
  <img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
325
  </details>
326
 
327
+
328
  <details open>
329
  <summary><font size="4">
330
+ Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing.
331
  </font></summary>
332
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_gligen.ipynb">notebook</a> for more details.
333
  <img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
334
  </details>
335
 
336
+ ## :sauropod: Model: Grounding DINO
337
 
338
  Includes: a text backbone, an image backbone, a feature enhancer, a language-guided query selection, and a cross-modality decoder.
339
 
340
  ![arch](.asset/arch.png)
341
 
342
 
343
+ ## :hearts: Acknowledgement
344
 
345
  Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!
346
 
 
349
  Thanks [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.
350
 
351
 
352
+ ## :black_nib: Citation
353
 
354
  If you find our work helpful for your research, please consider citing the following BibTeX entry.
355
 
356
  ```bibtex
357
+ @article{liu2023grounding,
358
+ title={Grounding dino: Marrying dino with grounded pre-training for open-set object detection},
359
+ author={Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and others},
360
+ journal={arXiv preprint arXiv:2303.05499},
361
  year={2023}
362
  }
363
  ```
GroundingDINO/groundingdino/config/{GroundingDINO_SwinB.cfg.py → GroundingDINO_SwinB_cfg.py} RENAMED
File without changes
GroundingDINO/groundingdino/config/__init__.py ADDED
File without changes
GroundingDINO/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (189 Bytes). View file
 
GroundingDINO/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
GroundingDINO/groundingdino/datasets/cocogrounding_eval.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # Grounding DINO. Modified by Shilong Liu.
3
+ # url: https://github.com/IDEA-Research/GroundingDINO
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
8
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9
+ """
10
+ COCO evaluator that works in distributed mode.
11
+
12
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
13
+ The difference is that there is less copy-pasting from pycocotools
14
+ at the end of the file, as python3 can suppress prints with contextlib
15
+ """
16
+ import contextlib
17
+ import copy
18
+ import os
19
+
20
+ import numpy as np
21
+ import pycocotools.mask as mask_util
22
+ import torch
23
+ from pycocotools.coco import COCO
24
+ from pycocotools.cocoeval import COCOeval
25
+
26
+ from groundingdino.util.misc import all_gather
27
+
28
+
29
class CocoGroundingEvaluator(object):
    """Collect per-image COCO evaluation results for one or more IoU types.

    One ``COCOeval`` object is created per entry of ``iou_types``. Batches of
    predictions are fed in through :meth:`update`; the per-batch results are
    kept in ``self.eval_imgs`` until :meth:`synchronize_between_processes`
    merges them and hands them to the ``COCOeval`` objects.
    """

    def __init__(self, coco_gt, iou_types, useCats=True):
        """
        Args:
            coco_gt: ground-truth COCO API object; a deep copy is stored.
            iou_types: list or tuple of IoU type names ("bbox", "segm",
                "keypoints" are the ones :meth:`prepare` understands).
            useCats: forwarded to every ``COCOeval`` object and to its params.
        """
        assert isinstance(iou_types, (list, tuple))
        # Work on a private copy so the caller's object is never shared.
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
            self.coco_eval[iou_type].useCats = useCats

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}
        self.useCats = useCats

    def update(self, predictions):
        """Evaluate one batch of predictions and store the per-image results.

        Args:
            predictions: dict mapping image id to that image's prediction dict.
        """
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)

            # suppress pycocotools prints
            with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
                coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()

            coco_eval = self.coco_eval[iou_type]
            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            coco_eval.params.useCats = self.useCats
            # The returned image ids are not needed here; discarding them keeps
            # the batch ids above untouched for the next IoU type.
            _, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        """Concatenate the stored batches and merge them into each ``COCOeval``."""
        for iou_type in self.iou_types:
            # Batches are joined along axis 2 (the image axis of ``evaluate``'s output).
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        """Call ``accumulate()`` on every ``COCOeval`` object."""
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        """Print the IoU type and call ``summarize()`` for every ``COCOeval`` object."""
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        """Convert ``predictions`` into a list of COCO result dicts for ``iou_type``.

        Raises:
            ValueError: if ``iou_type`` is not "bbox", "segm" or "keypoints".
        """
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        """Build one result dict (image_id, category_id, bbox, score) per predicted box.

        Images whose prediction is empty are skipped. Boxes are converted with
        ``convert_to_xywh`` before being written out.
        """
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        """Build one result dict (image_id, category_id, segmentation, score) per mask.

        Masks are binarised at 0.5 and run-length encoded with pycocotools.
        Images whose prediction is empty are skipped.
        """
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            masks = prediction["masks"] > 0.5
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
                for mask in masks
            ]
            # "counts" comes back as bytes; decode it to a str in the result dict.
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "segmentation": rle,
                        "score": scores[k],
                    }
                    for k, rle in enumerate(rles)
                ]
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        """Build one result dict (image_id, category_id, keypoints, score) per instance.

        Keypoints are flattened from dimension 1 onwards into a flat list per
        instance. Images whose prediction is empty are skipped.
        """
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "keypoints": keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results
174
+
175
+
176
def convert_to_xywh(boxes):
    """Turn boxes given as (xmin, ymin, xmax, ymax) columns into (xmin, ymin, width, height)."""
    left, top, right, bottom = boxes.unbind(1)
    widths = right - left
    heights = bottom - top
    return torch.stack([left, top, widths, heights], dim=1)
179
+
180
+
181
def merge(img_ids, eval_imgs):
    """Gather image ids and evaluation arrays via ``all_gather`` and de-duplicate them.

    Returns the sorted unique image ids and the evaluation array restricted
    (along its last axis) to the first occurrence of each id.
    """
    # presumably all_gather returns one entry per process -- confirm in util.misc
    gathered_ids = all_gather(img_ids)
    gathered_evals = all_gather(eval_imgs)

    flat_ids = np.array([img_id for chunk in gathered_ids for img_id in chunk])
    stacked_evals = np.concatenate(list(gathered_evals), 2)

    # keep only unique (and in sorted order) images
    unique_ids, first_pos = np.unique(flat_ids, return_index=True)
    return unique_ids, stacked_evals[..., first_pos]
201
+
202
+
203
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    """Write the merged image ids and flattened evaluation results into ``coco_eval``."""
    merged_ids, merged_evals = merge(img_ids, eval_imgs)

    coco_eval.evalImgs = list(merged_evals.flatten())
    coco_eval.params.imgIds = list(merged_ids)
    # Snapshot the params that now describe the stored results.
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
211
+
212
+
213
+ #################################################################
214
+ # From pycocotools, just removed the prints and fixed
215
+ # a Python3 bug about unicode not defined
216
+ #################################################################
217
+
218
+
219
def evaluate(self):
    """
    Run per-image evaluation on the images listed in ``self.params``.

    ``self`` is expected to be a ``COCOeval``-like object. Unlike the
    pycocotools method this was copied from, the per-image results are not
    stored on ``self.evalImgs``; they are returned to the caller.

    :return: tuple ``(imgIds, evalImgs)`` where ``imgIds`` is the sorted unique
        list of evaluated image ids and ``evalImgs`` is an array of shape
        ``(len(catIds), len(areaRng), len(imgIds))``
    :raises ValueError: if ``params.iouType`` is not "segm", "bbox" or "keypoints"
    """
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = "segm" if p.useSegm == 1 else "bbox"
        print("useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType in ("segm", "bbox"):
        computeIoU = self.computeIoU
    elif p.iouType == "keypoints":
        computeIoU = self.computeOks
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError("Unknown iou type {}".format(p.iouType))
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }

    evaluateImg = self.evaluateImg
    maxDet = p.maxDets[-1]
    evalImgs = [
        evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    return p.imgIds, evalImgs
265
+
266
+
267
+ #################################################################
268
+ # end of straight copy from pycocotools, just removing the prints
269
+ #################################################################
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (258 Bytes). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc ADDED
Binary file (7.23 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc ADDED
Binary file (7.78 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc ADDED
Binary file (11.8 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (19.3 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.58 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (257 Bytes). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc ADDED
Binary file (6.25 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc ADDED
Binary file (5.16 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc ADDED
Binary file (20.6 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py CHANGED
@@ -206,6 +206,21 @@ class GroundingDINO(nn.Module):
206
  nn.init.xavier_uniform_(proj[0].weight, gain=1)
207
  nn.init.constant_(proj[0].bias, 0)
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def init_ref_points(self, use_num_queries):
210
  self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
211
 
@@ -228,7 +243,6 @@ class GroundingDINO(nn.Module):
228
  captions = kw["captions"]
229
  else:
230
  captions = [t["caption"] for t in targets]
231
- len(captions)
232
 
233
  # encoder texts
234
  tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to(
@@ -283,14 +297,14 @@ class GroundingDINO(nn.Module):
283
  }
284
 
285
  # import ipdb; ipdb.set_trace()
286
-
287
  if isinstance(samples, (list, torch.Tensor)):
288
  samples = nested_tensor_from_tensor_list(samples)
289
- features, poss = self.backbone(samples)
 
290
 
291
  srcs = []
292
  masks = []
293
- for l, feat in enumerate(features):
294
  src, mask = feat.decompose()
295
  srcs.append(self.input_proj[l](src))
296
  masks.append(mask)
@@ -299,7 +313,7 @@ class GroundingDINO(nn.Module):
299
  _len_srcs = len(srcs)
300
  for l in range(_len_srcs, self.num_feature_levels):
301
  if l == _len_srcs:
302
- src = self.input_proj[l](features[-1].tensors)
303
  else:
304
  src = self.input_proj[l](srcs[-1])
305
  m = samples.mask
@@ -307,11 +321,11 @@ class GroundingDINO(nn.Module):
307
  pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
308
  srcs.append(src)
309
  masks.append(mask)
310
- poss.append(pos_l)
311
 
312
  input_query_bbox = input_query_label = attn_mask = dn_meta = None
313
  hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(
314
- srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict
315
  )
316
 
317
  # deformable-detr-like anchor update
@@ -345,7 +359,9 @@ class GroundingDINO(nn.Module):
345
  # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict)
346
  # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord}
347
  # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal}
348
-
 
 
349
  return out
350
 
351
  @torch.jit.unused
@@ -393,3 +409,4 @@ def build_groundingdino(args):
393
  )
394
 
395
  return model
 
 
206
  nn.init.xavier_uniform_(proj[0].weight, gain=1)
207
  nn.init.constant_(proj[0].bias, 0)
208
 
209
def set_image_tensor(self, samples: NestedTensor):
    """Run the backbone on ``samples`` and cache its two outputs on the module.

    A list or plain tensor is first wrapped with
    ``nested_tensor_from_tensor_list``. The results are stored as
    ``self.features`` and ``self.poss``.
    """
    needs_wrapping = isinstance(samples, (list, torch.Tensor))
    if needs_wrapping:
        samples = nested_tensor_from_tensor_list(samples)
    backbone_out = self.backbone(samples)
    self.features, self.poss = backbone_out
213
+
214
def unset_image_tensor(self):
    """Remove the cached ``features`` and ``poss`` attributes when they exist."""
    for cached_name in ("features", "poss"):
        if hasattr(self, cached_name):
            delattr(self, cached_name)
219
+
220
def set_image_features(self, features , poss):
    """Store externally computed ``features`` and ``poss`` on the module as-is."""
    self.features, self.poss = features, poss
223
+
224
  def init_ref_points(self, use_num_queries):
225
  self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
226
 
 
243
  captions = kw["captions"]
244
  else:
245
  captions = [t["caption"] for t in targets]
 
246
 
247
  # encoder texts
248
  tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to(
 
297
  }
298
 
299
  # import ipdb; ipdb.set_trace()
 
300
  if isinstance(samples, (list, torch.Tensor)):
301
  samples = nested_tensor_from_tensor_list(samples)
302
+ if not hasattr(self, 'features') or not hasattr(self, 'poss'):
303
+ self.set_image_tensor(samples)
304
 
305
  srcs = []
306
  masks = []
307
+ for l, feat in enumerate(self.features):
308
  src, mask = feat.decompose()
309
  srcs.append(self.input_proj[l](src))
310
  masks.append(mask)
 
313
  _len_srcs = len(srcs)
314
  for l in range(_len_srcs, self.num_feature_levels):
315
  if l == _len_srcs:
316
+ src = self.input_proj[l](self.features[-1].tensors)
317
  else:
318
  src = self.input_proj[l](srcs[-1])
319
  m = samples.mask
 
321
  pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
322
  srcs.append(src)
323
  masks.append(mask)
324
+ self.poss.append(pos_l)
325
 
326
  input_query_bbox = input_query_label = attn_mask = dn_meta = None
327
  hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(
328
+ srcs, masks, input_query_bbox, self.poss, input_query_label, attn_mask, text_dict
329
  )
330
 
331
  # deformable-detr-like anchor update
 
359
  # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict)
360
  # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord}
361
  # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal}
362
+ unset_image_tensor = kw.get('unset_image_tensor', True)
363
+ if unset_image_tensor:
364
+ self.unset_image_tensor() ## If necessary
365
  return out
366
 
367
  @torch.jit.unused
 
409
  )
410
 
411
  return model
412
+
GroundingDINO/groundingdino/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (502 Bytes). View file
 
GroundingDINO/groundingdino/models/__pycache__/registry.cpython-310.pyc ADDED
Binary file (2.11 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (187 Bytes). View file
 
GroundingDINO/groundingdino/util/__pycache__/box_ops.cpython-310.pyc ADDED
Binary file (3.85 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc ADDED
Binary file (1.13 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/inference.cpython-310.pyc ADDED
Binary file (8 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/misc.cpython-310.pyc ADDED
Binary file (20.3 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/slconfig.cpython-310.pyc ADDED
Binary file (13.2 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/utils.cpython-310.pyc ADDED
Binary file (19.9 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/visualizer.cpython-310.pyc ADDED
Binary file (7.84 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc ADDED
Binary file (3.12 kB). View file
 
GroundingDINO/groundingdino/util/get_tokenlizer.py CHANGED
@@ -1,5 +1,5 @@
1
  from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
2
-
3
 
4
  def get_tokenlizer(text_encoder_type):
5
  if not isinstance(text_encoder_type, str):
@@ -8,6 +8,8 @@ def get_tokenlizer(text_encoder_type):
8
  text_encoder_type = text_encoder_type.text_encoder_type
9
  elif text_encoder_type.get("text_encoder_type", False):
10
  text_encoder_type = text_encoder_type.get("text_encoder_type")
 
 
11
  else:
12
  raise ValueError(
13
  "Unknown type of text_encoder_type: {}".format(type(text_encoder_type))
@@ -19,8 +21,9 @@ def get_tokenlizer(text_encoder_type):
19
 
20
 
21
  def get_pretrained_language_model(text_encoder_type):
22
- if text_encoder_type == "bert-base-uncased":
23
  return BertModel.from_pretrained(text_encoder_type)
24
  if text_encoder_type == "roberta-base":
25
  return RobertaModel.from_pretrained(text_encoder_type)
 
26
  raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type))
 
1
  from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
2
+ import os
3
 
4
  def get_tokenlizer(text_encoder_type):
5
  if not isinstance(text_encoder_type, str):
 
8
  text_encoder_type = text_encoder_type.text_encoder_type
9
  elif text_encoder_type.get("text_encoder_type", False):
10
  text_encoder_type = text_encoder_type.get("text_encoder_type")
11
+ elif os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type):
12
+ pass
13
  else:
14
  raise ValueError(
15
  "Unknown type of text_encoder_type: {}".format(type(text_encoder_type))
 
21
 
22
 
23
  def get_pretrained_language_model(text_encoder_type):
24
+ if text_encoder_type == "bert-base-uncased" or (os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type)):
25
  return BertModel.from_pretrained(text_encoder_type)
26
  if text_encoder_type == "roberta-base":
27
  return RobertaModel.from_pretrained(text_encoder_type)
28
+
29
  raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type))
GroundingDINO/groundingdino/util/inference.py CHANGED
@@ -6,6 +6,7 @@ import supervision as sv
6
  import torch
7
  from PIL import Image
8
  from torchvision.ops import box_convert
 
9
 
10
  import groundingdino.datasets.transforms as T
11
  from groundingdino.models import build_model
@@ -13,6 +14,10 @@ from groundingdino.util.misc import clean_state_dict
13
  from groundingdino.util.slconfig import SLConfig
14
  from groundingdino.util.utils import get_phrases_from_posmap
15
 
 
 
 
 
16
 
17
  def preprocess_caption(caption: str) -> str:
18
  result = caption.lower().strip()
@@ -51,7 +56,8 @@ def predict(
51
  caption: str,
52
  box_threshold: float,
53
  text_threshold: float,
54
- device: str = "cuda"
 
55
  ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
56
  caption = preprocess_caption(caption=caption)
57
 
@@ -70,17 +76,40 @@ def predict(
70
 
71
  tokenizer = model.tokenizer
72
  tokenized = tokenizer(caption)
73
-
74
- phrases = [
75
- get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
76
- for logit
77
- in logits
78
- ]
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  return boxes, logits.max(dim=1)[0], phrases
81
 
82
 
83
  def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
84
  h, w, _ = image_source.shape
85
  boxes = boxes * torch.Tensor([w, h, w, h])
86
  xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
@@ -96,3 +125,147 @@ def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor
96
  annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
97
  annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
98
  return annotated_frame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import torch
7
  from PIL import Image
8
  from torchvision.ops import box_convert
9
+ import bisect
10
 
11
  import groundingdino.datasets.transforms as T
12
  from groundingdino.models import build_model
 
14
  from groundingdino.util.slconfig import SLConfig
15
  from groundingdino.util.utils import get_phrases_from_posmap
16
 
17
+ # ----------------------------------------------------------------------------------------------------------------------
18
+ # OLD API
19
+ # ----------------------------------------------------------------------------------------------------------------------
20
+
21
 
22
  def preprocess_caption(caption: str) -> str:
23
  result = caption.lower().strip()
 
56
  caption: str,
57
  box_threshold: float,
58
  text_threshold: float,
59
+ device: str = "cuda",
60
+ remove_combined: bool = False
61
  ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
62
  caption = preprocess_caption(caption=caption)
63
 
 
76
 
77
  tokenizer = model.tokenizer
78
  tokenized = tokenizer(caption)
79
+
80
+ if remove_combined:
81
+ sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
82
+
83
+ phrases = []
84
+ for logit in logits:
85
+ max_idx = logit.argmax()
86
+ insert_idx = bisect.bisect_left(sep_idx, max_idx)
87
+ right_idx = sep_idx[insert_idx]
88
+ left_idx = sep_idx[insert_idx - 1]
89
+ phrases.append(get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, left_idx, right_idx).replace('.', ''))
90
+ else:
91
+ phrases = [
92
+ get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
93
+ for logit
94
+ in logits
95
+ ]
96
 
97
  return boxes, logits.max(dim=1)[0], phrases
98
 
99
 
100
  def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
101
+ """
102
+ This function annotates an image with bounding boxes and labels.
103
+
104
+ Parameters:
105
+ image_source (np.ndarray): The source image to be annotated.
106
+ boxes (torch.Tensor): A tensor containing bounding box coordinates.
107
+ logits (torch.Tensor): A tensor containing confidence scores for each bounding box.
108
+ phrases (List[str]): A list of labels for each bounding box.
109
+
110
+ Returns:
111
+ np.ndarray: The annotated image.
112
+ """
113
  h, w, _ = image_source.shape
114
  boxes = boxes * torch.Tensor([w, h, w, h])
115
  xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
 
125
  annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
126
  annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
127
  return annotated_frame
128
+
129
+
130
+ # ----------------------------------------------------------------------------------------------------------------------
131
+ # NEW API
132
+ # ----------------------------------------------------------------------------------------------------------------------
133
+
134
+
135
class Model:
    """Wrapper that loads a Grounding DINO model and runs detection on BGR images."""

    def __init__(
        self,
        model_config_path: str,
        model_checkpoint_path: str,
        device: str = "cuda"
    ):
        """Load the model from the given config and checkpoint and move it to ``device``."""
        self.model = load_model(
            model_config_path=model_config_path,
            model_checkpoint_path=model_checkpoint_path,
            device=device
        ).to(device)
        self.device = device

    def predict_with_caption(
        self,
        image: np.ndarray,
        caption: str,
        box_threshold: float = 0.35,
        text_threshold: float = 0.25
    ) -> Tuple[sv.Detections, List[str]]:
        """
        Detect objects described by a free-form caption.

        Returns the detections (boxes scaled to the source image size) together
        with the phrase predicted for each detection.

        Example:

        import cv2

        image = cv2.imread(IMAGE_PATH)

        model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
        detections, labels = model.predict_with_caption(
            image=image,
            caption=caption,
            box_threshold=BOX_THRESHOLD,
            text_threshold=TEXT_THRESHOLD
        )

        import supervision as sv

        box_annotator = sv.BoxAnnotator()
        annotated_image = box_annotator.annotate(scene=image, detections=detections, labels=labels)
        """
        processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
        boxes, logits, phrases = predict(
            model=self.model,
            image=processed_image,
            caption=caption,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            device=self.device)
        source_h, source_w, _ = image.shape
        detections = Model.post_process_result(
            source_h=source_h,
            source_w=source_w,
            boxes=boxes,
            logits=logits)
        return detections, phrases

    def predict_with_classes(
        self,
        image: np.ndarray,
        classes: List[str],
        box_threshold: float,
        text_threshold: float
    ) -> sv.Detections:
        """
        Detect objects from a list of class names.

        The class names are joined with ". " into one caption; each predicted
        phrase is then mapped back to a class index and stored in
        ``detections.class_id``.

        Example:

        import cv2

        image = cv2.imread(IMAGE_PATH)

        model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
        detections = model.predict_with_classes(
            image=image,
            classes=CLASSES,
            box_threshold=BOX_THRESHOLD,
            text_threshold=TEXT_THRESHOLD
        )

        import supervision as sv

        box_annotator = sv.BoxAnnotator()
        annotated_image = box_annotator.annotate(scene=image, detections=detections)
        """
        caption = ". ".join(classes)
        processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
        boxes, logits, phrases = predict(
            model=self.model,
            image=processed_image,
            caption=caption,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            device=self.device)
        source_h, source_w, _ = image.shape
        detections = Model.post_process_result(
            source_h=source_h,
            source_w=source_w,
            boxes=boxes,
            logits=logits)
        class_id = Model.phrases2classes(phrases=phrases, classes=classes)
        detections.class_id = class_id
        return detections

    @staticmethod
    def preprocess_image(image_bgr: np.ndarray) -> torch.Tensor:
        """Convert a BGR array to RGB, then resize, tensorise and normalise it."""
        transform = T.Compose(
            [
                T.RandomResize([800], max_size=1333),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            ]
        )
        image_pillow = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
        image_transformed, _ = transform(image_pillow, None)
        return image_transformed

    @staticmethod
    def post_process_result(
            source_h: int,
            source_w: int,
            boxes: torch.Tensor,
            logits: torch.Tensor
    ) -> sv.Detections:
        """Scale cxcywh boxes by the source width/height and wrap them as xyxy detections."""
        boxes = boxes * torch.Tensor([source_w, source_h, source_w, source_h])
        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
        confidence = logits.numpy()
        return sv.Detections(xyxy=xyxy, confidence=confidence)

    @staticmethod
    def phrases2classes(phrases: List[str], classes: List[str]) -> np.ndarray:
        """Map each phrase to the index of the first class name it contains.

        Matching is case-insensitive: ``predict`` lower-cases the caption, so a
        class such as "Dog" would otherwise never match its own phrase.
        Phrases that contain no class name map to ``None``.
        """
        lowered_classes = [class_.lower() for class_ in classes]
        class_ids = []
        for phrase in phrases:
            phrase_lower = phrase.lower()
            for index, class_ in enumerate(lowered_classes):
                if class_ in phrase_lower:
                    class_ids.append(index)
                    break
            else:
                class_ids.append(None)
        return np.array(class_ids)
GroundingDINO/groundingdino/util/slconfig.py CHANGED
@@ -2,13 +2,13 @@
2
  # Modified from mmcv
3
  # ==========================================================
4
  import ast
 
5
  import os.path as osp
6
  import shutil
7
  import sys
8
  import tempfile
9
  from argparse import Action
10
  from importlib import import_module
11
- import platform
12
 
13
  from addict import Dict
14
  from yapf.yapflib.yapf_api import FormatCode
@@ -81,7 +81,7 @@ class SLConfig(object):
81
  with tempfile.TemporaryDirectory() as temp_config_dir:
82
  temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py")
83
  temp_config_name = osp.basename(temp_config_file.name)
84
- if platform.system() == 'Windows':
85
  temp_config_file.close()
86
  shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name))
87
  temp_module_name = osp.splitext(temp_config_name)[0]
 
2
  # Modified from mmcv
3
  # ==========================================================
4
  import ast
5
+ import os
6
  import os.path as osp
7
  import shutil
8
  import sys
9
  import tempfile
10
  from argparse import Action
11
  from importlib import import_module
 
12
 
13
  from addict import Dict
14
  from yapf.yapflib.yapf_api import FormatCode
 
81
  with tempfile.TemporaryDirectory() as temp_config_dir:
82
  temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py")
83
  temp_config_name = osp.basename(temp_config_file.name)
84
+ if os.name == 'nt':
85
  temp_config_file.close()
86
  shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name))
87
  temp_module_name = osp.splitext(temp_config_name)[0]
GroundingDINO/groundingdino/util/utils.py CHANGED
@@ -597,10 +597,12 @@ def targets_to(targets: List[Dict[str, Any]], device):
597
 
598
 
599
  def get_phrases_from_posmap(
600
- posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer
601
  ):
602
  assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor"
603
  if posmap.dim() == 1:
 
 
604
  non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
605
  token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
606
  return tokenizer.decode(token_ids)
 
597
 
598
 
599
  def get_phrases_from_posmap(
600
+ posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer, left_idx: int = 0, right_idx: int = 255
601
  ):
602
  assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor"
603
  if posmap.dim() == 1:
604
+ posmap[0: left_idx + 1] = False
605
+ posmap[right_idx:] = False
606
  non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
607
  token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
608
  return tokenizer.decode(token_ids)
GroundingDINO/requirements.txt CHANGED
@@ -6,5 +6,5 @@ yapf
6
  timm
7
  numpy
8
  opencv-python
9
- supervision==0.3.2
10
- pycocotools
 
6
  timm
7
  numpy
8
  opencv-python
9
+ supervision
10
+ pycocotools
GroundingDINO/setup.py CHANGED
@@ -24,6 +24,18 @@ import glob
24
  import os
25
  import subprocess
26
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  import torch
28
  from setuptools import find_packages, setup
29
  from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
@@ -70,7 +82,7 @@ def get_extensions():
70
  extra_compile_args = {"cxx": []}
71
  define_macros = []
72
 
73
- if torch.cuda.is_available() and CUDA_HOME is not None:
74
  print("Compiling with CUDA")
75
  extension = CUDAExtension
76
  sources += source_cuda
 
24
  import os
25
  import subprocess
26
 
27
+ import subprocess
28
+ import sys
29
+
30
def install_torch():
    """Install ``torch`` with pip (using the current interpreter) if it cannot be imported."""
    try:
        import torch  # noqa: F401  -- only probing for availability
    except ImportError:
        pip_command = [sys.executable, "-m", "pip", "install", "torch"]
        subprocess.check_call(pip_command)
35
+
36
+ # Call the function to ensure torch is installed
37
+ install_torch()
38
+
39
  import torch
40
  from setuptools import find_packages, setup
41
  from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
 
82
  extra_compile_args = {"cxx": []}
83
  define_macros = []
84
 
85
+ if CUDA_HOME is not None and (torch.cuda.is_available() or "TORCH_CUDA_ARCH_LIST" in os.environ):
86
  print("Compiling with CUDA")
87
  extension = CUDAExtension
88
  sources += source_cuda