Upload 226 files
This view is limited to 50 files because it contains too many changes.
- ViTPose/ckpts/vitpose-s-coco_25.pth +3 -0
- ViTPose/easy_ViTPose/.dockerignore +2 -0
- ViTPose/easy_ViTPose/.gitignore +13 -0
- ViTPose/easy_ViTPose/.ipynb_checkpoints/README-checkpoint.md +275 -0
- ViTPose/easy_ViTPose/.ipynb_checkpoints/colab_demo-checkpoint.ipynb +0 -0
- ViTPose/easy_ViTPose/.ipynb_checkpoints/evaluation_on_coco-checkpoint.py +92 -0
- ViTPose/easy_ViTPose/.ipynb_checkpoints/inference-checkpoint.py +188 -0
- ViTPose/easy_ViTPose/.ipynb_checkpoints/requirements_gpu-checkpoint.txt +3 -0
- ViTPose/easy_ViTPose/Dockerfile +11 -0
- ViTPose/easy_ViTPose/LICENSE +201 -0
- ViTPose/easy_ViTPose/README.md +275 -0
- ViTPose/easy_ViTPose/colab_demo.ipynb +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose.egg-info/PKG-INFO +7 -0
- ViTPose/easy_ViTPose/easy_ViTPose.egg-info/SOURCES.txt +56 -0
- ViTPose/easy_ViTPose/easy_ViTPose.egg-info/dependency_links.txt +1 -0
- ViTPose/easy_ViTPose/easy_ViTPose.egg-info/top_level.txt +1 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/ViTPose_Inference-checkpoint.ipynb +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/__init__-checkpoint.py +5 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/config-checkpoint.yaml +15 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/inference-checkpoint.py +337 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/testVITPOSE-checkpoint.jpg +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/train-checkpoint.py +174 -0
- ViTPose/easy_ViTPose/easy_ViTPose/ViTPose_Inference.ipynb +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/__init__.py +5 -0
- ViTPose/easy_ViTPose/easy_ViTPose/config.yaml +15 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_common-checkpoint.py +195 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_small_coco_256x192-checkpoint.py +173 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_wholebody-checkpoint.py +20 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_aic.py +20 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_ap10k.py +22 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_apt36k.py +22 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_coco.py +18 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_coco_25.py +20 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_common.py +195 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_mpii.py +18 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_small_coco_256x192.py +173 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_wholebody.py +20 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/__init__.py +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_coco_25.cpython-39.pyc +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_common.cpython-39.pyc +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_small_coco_256x192.cpython-39.pyc +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/__init__.cpython-39.pyc +0 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/300w.py +384 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aflw.py +83 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aic.py +140 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aic_info.py +140 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/animalpose.py +166 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/ap10k.py +142 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/ap10k_info.py +142 -0
- ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/atrw.py +144 -0
ViTPose/ckpts/vitpose-s-coco_25.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5af7cbeb123e2a60bf25d981d4b89dab281f3fca18b7956b49a7a685b6311bfe
size 97235808
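The checkpoint above is stored as a Git LFS pointer, so a plain clone only contains this small stub. A minimal sketch of fetching the actual ~97 MB weights, assuming `git-lfs` is installed and the path matches the pointer above:

```bash
# Fetch only the ViTPose-S COCO-25 checkpoint tracked by LFS
git lfs install
git lfs pull --include="ViTPose/ckpts/vitpose-s-coco_25.pth"
```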
ViTPose/easy_ViTPose/.dockerignore
ADDED
@@ -0,0 +1,2 @@
Dockerfile
models
ViTPose/easy_ViTPose/.gitignore
ADDED
@@ -0,0 +1,13 @@
**/*.pt
**/*.pth
**/*.onnx
**/__pycache__
**/coco/
.DS_Store
runs
ckpts
annotations
examples
outputs
.ipynb_checkpoints
easy_ViTPose.egg-info
ViTPose/easy_ViTPose/.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,275 @@
# easy_ViTPose
<p align="center">
<img src="https://user-images.githubusercontent.com/24314647/236082274-b25a70c8-9267-4375-97b0-eddf60a7dfc6.png" width=375> easy_ViTPose
</p>

## Accurate 2D human and animal pose estimation

<a target="_blank" href="https://colab.research.google.com/github/JunkyByte/easy_ViTPose/blob/main/colab_demo.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

### Easy-to-use SOTA `ViTPose` [Y. Xu et al., 2022] models for fast inference.
We provide all the original ViTPose models, converted for inference, with a single, consistent dataset output format.

In addition, we provide a COCO-25 model trained on the original COCO dataset plus the foot keypoints from https://cmu-perceptual-computing-lab.github.io/foot_keypoint_dataset/.
Finetuning is not currently supported; check de43d54cad87404cf0ad4a7b5da6bacf4240248b and earlier commits for a working version of `train.py`.

> [!WARNING]
> Ultralytics `yolov8` has an issue with wrong bounding boxes when using `mps`; upgrade to the latest version! (Works correctly on 8.2.48)

## Results

https://github.com/JunkyByte/easy_ViTPose/assets/24314647/e9a82c17-6e99-4111-8cc8-5257910cb87e

https://github.com/JunkyByte/easy_ViTPose/assets/24314647/63af44b1-7245-4703-8906-3f034a43f9e3

(Credits dance: https://www.youtube.com/watch?v=p-rSdt0aFuw )
(Credits zebras: https://www.youtube.com/watch?v=y-vELRYS8Yk )

## Features
- Image / video / webcam support
- Video support using the SORT algorithm to track bounding boxes between frames
- Torch / ONNX / TensorRT inference
- Runs the original ViTPose checkpoints from [ViTAE-Transformer/ViTPose](https://github.com/ViTAE-Transformer/ViTPose)
- 4 ViTPose architectures with different sizes and performance (s: small, b: base, l: large, h: huge)
- Multiple skeletons and datasets: (AIC / MPII / COCO / COCO + FEET / COCO WHOLEBODY / APT36k / AP10k)
- Human / animal pose estimation
- CPU / GPU / Metal support
- Show and save images / videos, and output to JSON

We run YOLOv8 for detection; it does not cover every animal class out of the box. You can finetune a custom YOLO model to detect the animal you are interested in.
If you do, please open an issue, as we might want to integrate other detection models.

### Benchmark
You can expect realtime >30 fps with modern NVIDIA GPUs and Apple Silicon (using Metal!).

### Skeleton reference
There are multiple skeletons for the different datasets. Check the definitions in [visualization.py](https://github.com/JunkyByte/easy_ViTPose/blob/main/easy_ViTPose/vit_utils/visualization.py).

## Installation and Usage
> [!IMPORTANT]
> Install `torch>2.0` with CUDA / MPS support by yourself,
> and also check `requirements_gpu.txt`.

```bash
git clone git@github.com:JunkyByte/easy_ViTPose.git
cd easy_ViTPose/
pip install -e .
pip install -r requirements.txt
```

### Download models
- Download the models from [Huggingface](https://huggingface.co/JunkyByte/easy_ViTPose)

We provide torch models for every dataset and architecture.
If you want to run ONNX / TensorRT inference, download the appropriate torch checkpoint and use `export.py` to convert it.
You can use the `ultralytics` `yolo export` command to export YOLO to ONNX and TensorRT as well.

#### Export to onnx and tensorrt
```bash
$ python export.py --help
usage: export.py [-h] --model-ckpt MODEL_CKPT --model-name {s,b,l,h} [--output OUTPUT] [--dataset DATASET]

optional arguments:
  -h, --help            show this help message and exit
  --model-ckpt MODEL_CKPT
                        The torch model that shall be used for conversion
  --model-name {s,b,l,h}
                        [s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]
  --output OUTPUT       File (without extension) or dir path for checkpoint output
  --dataset DATASET     Name of the dataset. If None it's extracted from the file name. ["coco", "coco_25",
                        "wholebody", "mpii", "ap10k", "apt36k", "aic"]
```

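As a concrete example, converting the ViTPose-S COCO-25 torch checkpoint shipped in this upload could look like the sketch below; the output directory is an assumption, and the exact artifacts produced (ONNX and/or TensorRT engine) depend on `export.py`'s defaults:

```bash
# Convert the torch checkpoint; --dataset could also be inferred from the file name
python export.py --model-ckpt ViTPose/ckpts/vitpose-s-coco_25.pth --model-name s --dataset coco_25 --output ./ckpts/
```
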
### Run inference
To run inference from the command line you can use the `inference.py` script as follows:
```bash
$ python inference.py --help
usage: inference.py [-h] [--input INPUT] [--output-path OUTPUT_PATH] --model MODEL [--yolo YOLO] [--dataset DATASET]
                    [--det-class DET_CLASS] [--model-name {s,b,l,h}] [--yolo-size YOLO_SIZE]
                    [--conf-threshold CONF_THRESHOLD] [--rotate {0,90,180,270}] [--yolo-step YOLO_STEP]
                    [--single-pose] [--show] [--show-yolo] [--show-raw-yolo] [--save-img] [--save-json]

optional arguments:
  -h, --help            show this help message and exit
  --input INPUT         path to image / video or webcam ID (=cv2)
  --output-path OUTPUT_PATH
                        output path, if the path provided is a directory output files are "input_name
                        +_result{extension}".
  --model MODEL         checkpoint path of the model
  --yolo YOLO           checkpoint path of the yolo model
  --dataset DATASET     Name of the dataset. If None it's extracted from the file name. ["coco", "coco_25",
                        "wholebody", "mpii", "ap10k", "apt36k", "aic"]
  --det-class DET_CLASS
                        ["human", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
                        "animals"]
  --model-name {s,b,l,h}
                        [s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]
  --yolo-size YOLO_SIZE
                        YOLOv8 image size during inference
  --conf-threshold CONF_THRESHOLD
                        Minimum confidence for keypoints to be drawn. [0, 1] range
  --rotate {0,90,180,270}
                        Rotate the image by [90, 180, 270] degrees counterclockwise
  --yolo-step YOLO_STEP
                        The tracker can be used to predict the bboxes instead of yolo for performance, this flag
                        specifies how often yolo is applied (e.g. 1 applies yolo every frame). This does not have any
                        effect when is_video is False
  --single-pose         Do not use SORT tracker because a single pose is expected in the video
  --show                preview result during inference
  --show-yolo           draw yolo results
  --show-raw-yolo       draw yolo results before SORT is applied for tracking (only valid during video inference)
  --save-img            save image results
  --save-json           save json results
```

You can run inference from code as follows:
```python
import cv2
from easy_ViTPose import VitInference

# Image to run inference on, in RGB format
img = cv2.imread('./examples/img1.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Set is_video=True to enable tracking during video inference.
# Be sure to call VitInference.reset() to reset the tracker after each video.
# There are a few flags that allow you to customize VitInference; check the class definition.
model_path = './ckpts/vitpose-s-coco_25.pth'
yolo_path = './yolov8s.pth'

# If you want to use MPS (on recent MacBooks) use the torch checkpoints for both ViTPose and YOLO.
# If device is None it will try cuda -> mps -> cpu (otherwise specify 'cpu', 'mps' or 'cuda').
# The dataset and det_class parameters can be inferred from the ckpt name, but you can specify them explicitly.
model = VitInference(model_path, yolo_path, model_name='s', yolo_size=320, is_video=False, device=None)

# Infer keypoints: the output is a dict where keys are person ids and values are keypoints (np.ndarray (25, 3): (y, x, score)).
# If is_video=True the IDs will be consistent across the ordered video frames.
keypoints = model.inference(img)

# Call model.reset() after each video

img = model.draw(show_yolo=True)  # Returns an RGB image with the drawings
cv2.imshow('image', cv2.cvtColor(img, cv2.COLOR_RGB2BGR)); cv2.waitKey(0)
```
> [!NOTE]
> If the input file is a video, [SORT](https://github.com/abewley/sort) is used to track people across frames and output consistent IDs.

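For video files, a minimal sketch of the loop looks like this, using only the API documented above (`is_video=True`, `inference`, `draw`, `reset`); the input paths are placeholders:

```python
import cv2
from easy_ViTPose import VitInference

# is_video=True enables SORT tracking so person ids stay consistent across frames
model = VitInference('./ckpts/vitpose-s-coco_25.pth', './yolov8s.pth',
                     model_name='s', is_video=True)

cap = cv2.VideoCapture('./examples/video.mp4')  # placeholder path
all_keypoints = []
while True:
    ret, frame = cap.read()
    if not ret:
        break
    # VitInference expects RGB input, OpenCV decodes BGR
    keypoints = model.inference(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    all_keypoints.append(keypoints)  # dict: track id -> (K, 3) array of (y, x, score)
    cv2.imshow('preview', cv2.cvtColor(model.draw(show_yolo=True), cv2.COLOR_RGB2BGR))
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
model.reset()  # reset the tracker before processing another video
```
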
### OUTPUT json format
The output format of the json files:

```
{
    "keypoints":
    [  # The list of frames, len(json['keypoints']) == len(video)
        {  # For each frame, a dict
            "0": [  # keys are the tracked person ids, values are their keypoints
                [121.19, 458.15, 0.99],  # Each keypoint is (y, x, score)
                [110.02, 469.43, 0.98],
                [110.86, 445.04, 0.99],
            ],
            "1": [
                ...
            ],
        },
        {
            "0": [
                [122.19, 458.15, 0.91],
                [105.02, 469.43, 0.95],
                [122.86, 445.04, 0.99],
            ],
            "1": [
                ...
            ]
        }
    ],
    "skeleton":
    {  # Skeleton reference: key is the joint index, value its name
        "0": "nose",
        "1": "left_eye",
        "2": "right_eye",
        "3": "left_ear",
        "4": "right_ear",
        "5": "neck",
        ...
    }
}
```

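As a quick illustration of consuming this file, the sketch below loads a saved result and prints the nose position of every tracked person in every frame; the filename is a placeholder for whatever `--save-json` produced:

```python
import json

with open('video_result.json') as f:  # placeholder name
    data = json.load(f)

# Map joint names back to indices using the skeleton reference
name_to_idx = {name: int(idx) for idx, name in data['skeleton'].items()}
nose = name_to_idx['nose']

for frame_i, people in enumerate(data['keypoints']):
    for person_id, kpts in people.items():
        y, x, score = kpts[nose]  # keypoints are stored as (y, x, score)
        print(f'frame {frame_i}: person {person_id} nose at ({x:.1f}, {y:.1f}), score {score:.2f}')
```
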
## Finetuning
Finetuning is possible but not officially supported right now. If you would like to finetune and need help, open an issue.
You can check `train.py`, `datasets/COCO.py` and `config.yaml` for details.

---

## Evaluation on COCO dataset
1. Download the COCO dataset images and labels
   - 2017 Val images [5K/1GB]: http://images.cocodataset.org/zips/val2017.zip <br>
     The extracted directory looks like this:
     ```
     val2017/
     ├── 000000000139.jpg
     ├── 000000000285.jpg
     ├── 000000000632.jpg
     └── ...
     ```
   - 2017 Train/Val annotations [241MB]: http://images.cocodataset.org/annotations/annotations_trainval2017.zip <br>
     The extracted directory looks like this:
     ```
     annotations/
     ├── person_keypoints_val2017.json
     ├── person_keypoints_train2017.json
     └── ...
     ```

2. Run the following command (an example invocation is shown after the block):

```bash
$ python evaluation_on_coco.py

Command line arguments:
--model_path: Path to the pretrained ViTPose model

--yolo_path: Path to the YOLOv8 model

--img_folder_path: Path to the directory containing the COCO val images (/val2017 extracted in step 1).

--annFile: Path to the json file with COCO keypoint annotations for the val set (annotations/person_keypoints_val2017.json extracted in step 1)
```

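A full invocation might look like the following; the checkpoint names are assumptions, while the COCO paths match the layout from step 1:

```bash
python evaluation_on_coco.py \
    --model_path ckpts/vitpose-s-coco_25.pth \
    --model-name s \
    --yolo_path yolov8s.pt \
    --img_folder_path ./val2017 \
    --annFile ./annotations/person_keypoints_val2017.json
```
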
---


## Docker
The system may be built into a container using Docker. This is intended to demonstrate containerized inference; adapt it to your own needs by changing models and skeletons:

`docker build . -t easy_vitpose`

The image is based on NVIDIA's PyTorch image, which is about 20 GB in size.
If you have a compatible GPU set up with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html),
ViTPose will run with hardware acceleration.

To test an example, create a folder called `cats` with a picture of a cat saved as `image.jpg`.
Run `./models/download.sh` to fetch the large yolov8 and ap10k ViTPose models. Then run inference using the following command (replace with the correct `cats` and `models` paths):

`docker run --gpus all --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v ./models:/models -v ~/cats:/cats easy_vitpose python inference.py --det-class cat --input /cats/image.jpg --output-path /cats --save-img --model /models/vitpose-l-ap10k.onnx --yolo /models/yolov8l.pt`

The result image can be viewed in your `cats` folder.

## TODO:
- refactor finetuning (currently not available)
- benchmark and check bottlenecks of the inference pipeline
- parallel batched inference
- other minor fixes
- yolo version for animal pose, check https://github.com/JunkyByte/easy_ViTPose/pull/18
- solve cuda exceptions on script exit when using tensorrt (no idea how)
- add info about inferred settings during inference, better output of inference status (device etc.)
- check if it is possible to make colab work without a runtime restart

Feel free to open issues, pull requests and contribute on these TODOs.

## Reference
Thanks to the ViTPose authors and their official implementation [ViTAE-Transformer/ViTPose](https://github.com/ViTAE-Transformer/ViTPose).
The SORT code is taken from [abewley/sort](https://github.com/abewley/sort).
ViTPose/easy_ViTPose/.ipynb_checkpoints/colab_demo-checkpoint.ipynb
ADDED
The diff for this file is too large to render.
ViTPose/easy_ViTPose/.ipynb_checkpoints/evaluation_on_coco-checkpoint.py
ADDED
@@ -0,0 +1,92 @@
# Reference: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb

import argparse
import json
import os

import cv2
from tqdm.auto import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

from easy_ViTPose.inference import VitInference


def parse_arguments():
    parser = argparse.ArgumentParser(description='Argument parser for COCO evaluation')
    parser.add_argument('--model_path', type=str,
                        help='Path to the ViTPose model')
    parser.add_argument('--model-name', type=str, choices=['s', 'b', 'l', 'h'],
                        help='[s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]')
    parser.add_argument('--yolo_path', type=str,
                        help='Path to the YOLOv8 model')
    parser.add_argument('--img_folder_path', type=str,
                        help='Path to the folder containing images')
    parser.add_argument('--annFile', type=str,
                        help='Path to the COCO annotations file')
    return parser.parse_args()


def evaluation_on_coco(model_path, model_name, yolo_path, img_folder_path, annFile):
    # Collect the image IDs of the val set from the ground-truth annotations
    with open(annFile) as f:
        gt_annotations = json.load(f)

    image_ids = set()
    for ann in gt_annotations['images']:
        image_ids.add(ann['id'])

    model = VitInference(model_path, yolo_path, model_name=model_name,
                         yolo_size=640, is_video=False, device=None)
    results_list = []

    for image_id in tqdm(image_ids):
        # Run inference on each val image (COCO file names are the zero-padded image id)
        img_path = os.path.join(img_folder_path, str(image_id).zfill(12) + '.jpg')
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        frame_keypoints = model.inference(img)

        # Convert every detected person into a COCO keypoint-detection result entry
        for key in frame_keypoints:
            results_element = {}
            results_element['image_id'] = image_id
            results_element['category_id'] = 1
            results_element['score'] = model._scores_bbox[key]
            results_element['bbox'] = []
            keypoints = []
            for k in frame_keypoints[key]:
                # Model output is (y, x, score); COCO expects flat (x, y, visibility) triplets
                keypoints.append(float(round(k[1], 0)))
                keypoints.append(float(round(k[0], 0)))
                keypoints.append(0)
            results_element['keypoints'] = keypoints
            results_list.append(results_element)

    # Save the detection results in COCO results format
    file_path = 'results.json'
    with open(file_path, 'w') as json_file:
        json.dump(results_list, json_file, indent=4)

    # Initialize the COCO ground truth and detections APIs
    annType = 'keypoints'
    cocoGt = COCO(annFile)
    cocoDt = cocoGt.loadRes(file_path)

    # Run the keypoint evaluation
    cocoEval = COCOeval(cocoGt, cocoDt, annType)
    cocoEval.params.imgIds = [int(i) for i in image_ids]
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()


if __name__ == '__main__':
    args = parse_arguments()
    evaluation_on_coco(args.model_path, args.model_name, args.yolo_path,
                       args.img_folder_path, args.annFile)
ViTPose/easy_ViTPose/.ipynb_checkpoints/inference-checkpoint.py
ADDED
@@ -0,0 +1,188 @@
import argparse
import json
import os
import time

from PIL import Image
import cv2
import numpy as np
import torch
import tqdm

from easy_ViTPose.vit_utils.inference import NumpyEncoder, VideoReader
from easy_ViTPose.inference import VitInference
from easy_ViTPose.vit_utils.visualization import joints_dict

try:
    import onnxruntime  # noqa: F401
    has_onnx = True
except ModuleNotFoundError:
    has_onnx = False


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True,
                        help='path to image / video or webcam ID (=cv2)')
    parser.add_argument('--output-path', type=str, default='',
                        help='output path, if the path provided is a directory '
                             'output files are "input_name + _result{extension}".')
    parser.add_argument('--model', type=str, required=True,
                        help='checkpoint path of the model')
    parser.add_argument('--yolo', type=str, required=False, default=None,
                        help='checkpoint path of the yolo model')
    parser.add_argument('--dataset', type=str, required=False, default=None,
                        help='Name of the dataset. If None it is extracted from the file name. '
                             '["coco", "coco_25", "wholebody", "mpii", "ap10k", "apt36k", "aic"]')
    parser.add_argument('--det-class', type=str, required=False, default=None,
                        help='["human", "cat", "dog", "horse", "sheep", '
                             '"cow", "elephant", "bear", "zebra", "giraffe", "animals"]')
    parser.add_argument('--model-name', type=str, required=False, choices=['s', 'b', 'l', 'h'],
                        help='[s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]')
    parser.add_argument('--yolo-size', type=int, required=False, default=320,
                        help='YOLOv8 image size during inference')
    parser.add_argument('--conf-threshold', type=float, required=False, default=0.5,
                        help='Minimum confidence for keypoints to be drawn. [0, 1] range')
    parser.add_argument('--rotate', type=int, choices=[0, 90, 180, 270],
                        required=False, default=0,
                        help='Rotate the image by [90, 180, 270] degrees counterclockwise')
    parser.add_argument('--yolo-step', type=int,
                        required=False, default=1,
                        help='The tracker can be used to predict the bboxes instead of yolo for performance, '
                             'this flag specifies how often yolo is applied (e.g. 1 applies yolo every frame). '
                             'This does not have any effect when is_video is False')
    parser.add_argument('--single-pose', default=False, action='store_true',
                        help='Do not use SORT tracker because a single pose is expected in the video')
    parser.add_argument('--show', default=False, action='store_true',
                        help='preview result during inference')
    parser.add_argument('--show-yolo', default=False, action='store_true',
                        help='draw yolo results')
    parser.add_argument('--show-raw-yolo', default=False, action='store_true',
                        help='draw yolo results before SORT is applied for tracking'
                             ' (only valid during video inference)')
    parser.add_argument('--save-img', default=False, action='store_true',
                        help='save image results')
    parser.add_argument('--save-json', default=False, action='store_true',
                        help='save json results')
    args = parser.parse_args()

    use_mps = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    use_cuda = torch.cuda.is_available()

    # Load YOLO: default to the bundled yolov8s (ONNX on CPU-only setups, torch otherwise)
    yolo = args.yolo
    if yolo is None:
        yolo = 'easy_ViTPose/' + ('yolov8s' + ('.onnx' if has_onnx and not (use_mps or use_cuda) else '.pt'))
    input_path = args.input

    # Load the image / video reader
    try:  # Check if the input is a webcam ID
        int(input_path)
        is_video = True
    except ValueError:
        assert os.path.isfile(input_path), 'The input file does not exist'
        is_video = input_path[input_path.rfind('.') + 1:].lower() in ['mp4', 'mov']

    ext = '.mp4' if is_video else '.png'
    assert not (args.save_img or args.save_json) or args.output_path, \
        'Specify an output path if using save-img or save-json flags'
    output_path = args.output_path
    if output_path:
        if os.path.isdir(output_path):
            og_ext = input_path[input_path.rfind('.'):]
            save_name_img = os.path.basename(input_path).replace(og_ext, f"_result{ext}")
            save_name_json = os.path.basename(input_path).replace(og_ext, "_result.json")
            output_path_img = os.path.join(output_path, save_name_img)
            output_path_json = os.path.join(output_path, save_name_json)
        else:
            output_path_img = output_path + f'{ext}'
            output_path_json = output_path + '.json'

    wait = 0
    total_frames = 1
    if is_video:
        reader = VideoReader(input_path, args.rotate)
        cap = cv2.VideoCapture(input_path)  # type: ignore
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.release()
        wait = 15
        if args.save_img:
            cap = cv2.VideoCapture(input_path)  # type: ignore
            fps = cap.get(cv2.CAP_PROP_FPS)
            ret, frame = cap.read()
            cap.release()
            assert ret
            assert fps > 0
            output_size = frame.shape[:2][::-1]

            # Check if we have X264 support, otherwise fall back to MJPG
            try:
                temp_video = cv2.VideoWriter('/tmp/checkcodec.mp4',
                                             cv2.VideoWriter_fourcc(*'h264'), 30, (32, 32))
                opened = temp_video.isOpened()
            except Exception:
                opened = False
            codec = 'h264' if opened else 'MJPG'
            out_writer = cv2.VideoWriter(output_path_img,
                                         cv2.VideoWriter_fourcc(*codec),  # More efficient codec
                                         fps, output_size)  # type: ignore
    else:
        reader = [np.array(Image.open(input_path).rotate(args.rotate))]  # type: ignore

    # Initialize model
    model = VitInference(args.model, yolo, args.model_name,
                         args.det_class, args.dataset,
                         args.yolo_size, is_video=is_video,
                         single_pose=args.single_pose,
                         yolo_step=args.yolo_step)  # type: ignore
    print(f">>> Model loaded: {args.model}")

    print(f'>>> Running inference on {input_path}')
    keypoints = []
    fps = []
    tot_time = 0.
    for (ith, img) in tqdm.tqdm(enumerate(reader), total=total_frames):
        t0 = time.time()

        # Run inference
        frame_keypoints = model.inference(img)
        keypoints.append(frame_keypoints)

        delta = time.time() - t0
        tot_time += delta
        fps.append(delta)

        # Draw the poses and save the output img
        if args.show or args.save_img:
            # Draw result and transform to BGR
            img = model.draw(args.show_yolo, args.show_raw_yolo, args.conf_threshold)[..., ::-1]

            if args.save_img:
                # TODO: If exists add (1), (2), ...
                if is_video:
                    out_writer.write(img)
                else:
                    print('>>> Saving output image')
                    cv2.imwrite(output_path_img, img)

            if args.show:
                cv2.imshow('preview', img)
                cv2.waitKey(wait)

    if is_video:
        tot_poses = sum(len(k) for k in keypoints)
        print(f'>>> Mean inference FPS: {1 / np.mean(fps):.2f}')
        print(f'>>> Total poses predicted: {tot_poses} mean per frame: '
              f'{(tot_poses / (ith + 1)):.2f}')
        print(f'>>> Mean FPS per pose: {(tot_poses / tot_time):.2f}')

    if args.save_json:
        print('>>> Saving output json')
        with open(output_path_json, 'w') as f:
            out = {'keypoints': keypoints,
                   'skeleton': joints_dict()[model.dataset]['keypoints']}
            json.dump(out, f, cls=NumpyEncoder)

    if is_video and args.save_img:
        out_writer.release()
    cv2.destroyAllWindows()
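Two example invocations of the script above; the checkpoint paths are placeholders based on the files in this upload:

```bash
# Webcam preview (the input is the cv2 device ID)
python inference.py --input 0 --model ViTPose/ckpts/vitpose-s-coco_25.pth --model-name s --show

# Video with tracking, saving the drawn video and the keypoints json
python inference.py --input ./examples/video.mp4 --model ViTPose/ckpts/vitpose-s-coco_25.pth \
    --model-name s --yolo yolov8s.pt --save-img --save-json --output-path ./outputs/
```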
ViTPose/easy_ViTPose/.ipynb_checkpoints/requirements_gpu-checkpoint.txt
ADDED
@@ -0,0 +1,3 @@
onnxruntime-gpu>=1.13.0
tensorrt>=8.5.1.7
torch-tensorrt>=1.4.0
ViTPose/easy_ViTPose/Dockerfile
ADDED
@@ -0,0 +1,11 @@
FROM nvcr.io/nvidia/pytorch:24.07-py3
COPY . /easy_ViTPose
WORKDIR /easy_ViTPose/
ENV DEBIAN_FRONTEND=noninteractive

RUN pip uninstall -y $(pip list --format=freeze | grep opencv) && \
    rm -rf /usr/local/lib/python3.10/dist-packages/cv2/
RUN pip install -e . && pip install -r requirements.txt && pip install -r requirements_gpu.txt

# OpenCV dependency
RUN apt-get update && apt-get install -y libgl1
ViTPose/easy_ViTPose/LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ViTPose/easy_ViTPose/README.md
ADDED
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# easy_ViTPose
|
2 |
+
<p align="center">
|
3 |
+
<img src="https://user-images.githubusercontent.com/24314647/236082274-b25a70c8-9267-4375-97b0-eddf60a7dfc6.png" width=375> easy_ViTPose
|
4 |
+
</p>
|
5 |
+
|
6 |
+
## Accurate 2d human and animal pose estimation
|
7 |
+
|
8 |
+
<a target="_blank" href="https://colab.research.google.com/github/JunkyByte/easy_ViTPose/blob/main/colab_demo.ipynb">
|
9 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
10 |
+
</a>
|
11 |
+
|
12 |
+
### Easy to use SOTA `ViTPose` [Y. Xu et al., 2022] models for fast inference.
|
13 |
+
We provide all the VitPose original models, converted for inference, with single dataset format output.
|
14 |
+
|
15 |
+
In addition to that we also provide a Coco-25 model, trained on the original coco dataset + feet https://cmu-perceptual-computing-lab.github.io/foot_keypoint_dataset/
|
16 |
+
Finetuning is not currently supported, you can check de43d54cad87404cf0ad4a7b5da6bacf4240248b and previous commits for a working state of `train.py`
|
17 |
+
|
18 |
+
> [!WARNING]
|
19 |
+
> Ultralytics `yolov8` has issue with wrong bounding boxes when using `mps`, upgrade to latest version! (Works correctly on 8.2.48)
|
20 |
+
|
21 |
+
## Results
|
22 |
+

|
23 |
+
|
24 |
+
https://github.com/JunkyByte/easy_ViTPose/assets/24314647/e9a82c17-6e99-4111-8cc8-5257910cb87e
|
25 |
+
|
26 |
+
https://github.com/JunkyByte/easy_ViTPose/assets/24314647/63af44b1-7245-4703-8906-3f034a43f9e3
|
27 |
+
|
28 |
+
(Credits dance: https://www.youtube.com/watch?v=p-rSdt0aFuw )
|
29 |
+
(Credits zebras: https://www.youtube.com/watch?v=y-vELRYS8Yk )
|
30 |
+
|
31 |
+
## Features
|
32 |
+
- Image / Video / Webcam support
|
33 |
+
- Video support using SORT algorithm to track bboxes between frames
|
34 |
+
- Torch / ONNX / Tensorrt inference
|
35 |
+
- Runs the original VitPose checkpoints from [ViTAE-Transformer/ViTPose](https://github.com/ViTAE-Transformer/ViTPose)
|
36 |
+
- 4 ViTPose architectures with different sizes and performances (s: small, b: base, l: large, h: huge)
|
37 |
+
- Multi skeleton and dataset: (AIC / MPII / COCO / COCO + FEET / COCO WHOLEBODY / APT36k / AP10k)
|
38 |
+
- Human / Animal pose estimation
|
39 |
+
- cpu / gpu / metal support
|
40 |
+
- show and save images / videos and output to json
|
41 |
+
|
42 |
+
We run YOLOv8 for detection, it does not provide complete animal detection. You can finetune a custom yolo model to detect the animal you are interested in,
|
43 |
+
if you do please open an issue, we might want to integrate other models for detection.
|
44 |
+
|
45 |
+
### Benchmark:
|
46 |
+
You can expect realtime >30 fps with modern nvidia gpus and apple silicon (using metal!).
|
47 |
+
|
48 |
+
### Skeleton reference
|
49 |
+
There are multiple skeletons for different dataset. Check the definition here [visualization.py](https://github.com/JunkyByte/easy_ViTPose/blob/main/easy_ViTPose/vit_utils/visualization.py).
|
50 |
+
|
51 |
+
## Installation and Usage
|
52 |
+
> [!IMPORTANT]
|
53 |
+
> Install `torch>2.0 with cuda / mps support` by yourself.
|
54 |
+
> also check `requirements_gpu.txt`.
|
55 |
+
|
56 |
+
```bash
|
57 |
+
git clone [email protected]:JunkyByte/easy_ViTPose.git
|
58 |
+
cd easy_ViTPose/
|
59 |
+
pip install -e .
|
60 |
+
pip install -r requirements.txt
|
61 |
+
```
|
62 |
+
|
63 |
+
### Download models
|
64 |
+
- Download the models from [Huggingface](https://huggingface.co/JunkyByte/easy_ViTPose)
|
65 |
+
We provide torch models for every dataset and architecture.
|
66 |
+
If you want to run onnx / tensorrt inference download the appropriate torch ckpt and use `export.py` to convert it.
|
67 |
+
You can use `ultralytics` `yolo export` command to export yolo to onnx and tensorrt as well.
|
68 |
+
|
69 |
+
#### Export to onnx and tensorrt
|
70 |
+
```bash
|
71 |
+
$ python export.py --help
|
72 |
+
usage: export.py [-h] --model-ckpt MODEL_CKPT --model-name {s,b,l,h} [--output OUTPUT] [--dataset DATASET]
|
73 |
+
|
74 |
+
optional arguments:
|
75 |
+
-h, --help show this help message and exit
|
76 |
+
--model-ckpt MODEL_CKPT
|
77 |
+
The torch model that shall be used for conversion
|
78 |
+
--model-name {s,b,l,h}
|
79 |
+
[s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]
|
80 |
+
--output OUTPUT File (without extension) or dir path for checkpoint output
|
81 |
+
--dataset DATASET Name of the dataset. If None it"s extracted from the file name. ["coco", "coco_25",
|
82 |
+
"wholebody", "mpii", "ap10k", "apt36k", "aic"]
|
83 |
+
```
|
84 |
+
|
85 |
+
### Run inference
|
86 |
+
To run inference from command line you can use the `inference.py` script as follows:
|
87 |
+
```bash
|
88 |
+
$ python inference.py --help
|
89 |
+
usage: inference.py [-h] [--input INPUT] [--output-path OUTPUT_PATH] --model MODEL [--yolo YOLO] [--dataset DATASET]
|
90 |
+
[--det-class DET_CLASS] [--model-name {s,b,l,h}] [--yolo-size YOLO_SIZE]
|
91 |
+
[--conf-threshold CONF_THRESHOLD] [--rotate {0,90,180,270}] [--yolo-step YOLO_STEP]
|
92 |
+
[--single-pose] [--show] [--show-yolo] [--show-raw-yolo] [--save-img] [--save-json]
|
93 |
+
|
94 |
+
optional arguments:
|
95 |
+
-h, --help show this help message and exit
|
96 |
+
--input INPUT path to image / video or webcam ID (=cv2)
|
97 |
+
--output-path OUTPUT_PATH
|
98 |
+
output path, if the path provided is a directory output files are "input_name
|
99 |
+
+_result{extension}".
|
100 |
+
--model MODEL checkpoint path of the model
|
101 |
+
--yolo YOLO checkpoint path of the yolo model
|
102 |
+
--dataset DATASET Name of the dataset. If None it"s extracted from the file name. ["coco", "coco_25",
|
103 |
+
"wholebody", "mpii", "ap10k", "apt36k", "aic"]
|
104 |
+
--det-class DET_CLASS
|
105 |
+
["human", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
|
106 |
+
"animals"]
|
107 |
+
--model-name {s,b,l,h}
|
108 |
+
[s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]
|
109 |
+
--yolo-size YOLO_SIZE
|
110 |
+
YOLOv8 image size during inference
|
111 |
+
--conf-threshold CONF_THRESHOLD
|
112 |
+
Minimum confidence for keypoints to be drawn. [0, 1] range
|
113 |
+
--rotate {0,90,180,270}
|
114 |
+
Rotate the image of [90, 180, 270] degress counterclockwise
|
115 |
+
--yolo-step YOLO_STEP
|
116 |
+
The tracker can be used to predict the bboxes instead of yolo for performance, this flag
|
117 |
+
specifies how often yolo is applied (e.g. 1 applies yolo every frame). This does not have any
|
118 |
+
effect when is_video is False
|
119 |
+
--single-pose Do not use SORT tracker because single pose is expected in the video
|
120 |
+
--show preview result during inference
|
121 |
+
--show-yolo draw yolo results
|
122 |
+
--show-raw-yolo draw yolo result before that SORT is applied for tracking (only valid during video inference)
|
123 |
+
--save-img save image results
|
124 |
+
--save-json save json results
|
125 |
+
```
|
126 |
+
|
127 |
+
You can run inference from code as follows:
|
128 |
+
```python
|
129 |
+
import cv2
|
130 |
+
from easy_ViTPose import VitInference
|
131 |
+
|
132 |
+
# Image to run inference RGB format
|
133 |
+
img = cv2.imread('./examples/img1.jpg')
|
134 |
+
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
135 |
+
|
136 |
+
# set is_video=True to enable tracking in video inference
|
137 |
+
# be sure to use VitInference.reset() function to reset the tracker after each video
|
138 |
+
# There are a few flags that allows to customize VitInference, be sure to check the class definition
|
139 |
+
model_path = './ckpts/vitpose-s-coco_25.pth'
|
140 |
+
yolo_path = './yolov8s.pth'
|
141 |
+
|
142 |
+
# If you want to use MPS (on new macbooks) use the torch checkpoints for both ViTPose and Yolo
|
143 |
+
# If device is None will try to use cuda -> mps -> cpu (otherwise specify 'cpu', 'mps' or 'cuda')
|
144 |
+
# dataset and det_class parameters can be inferred from the ckpt name, but you can specify them.
|
145 |
+
model = VitInference(model_path, yolo_path, model_name='s', yolo_size=320, is_video=False, device=None)
|
146 |
+
|
147 |
+
# Infer keypoints, output is a dict where keys are person ids and values are keypoints (np.ndarray (25, 3): (y, x, score))
|
148 |
+
# If is_video=True the IDs will be consistent among the ordered video frames.
|
149 |
+
keypoints = model.inference(img)
|
150 |
+
|
151 |
+
# call model.reset() after each video
|
152 |
+
|
153 |
+
img = model.draw(show_yolo=True) # Returns RGB image with drawings
|
154 |
+
cv2.imshow('image', cv2.cvtColor(img, cv2.COLOR_RGB2BGR)); cv2.waitKey(0)
|
155 |
+
```
|
156 |
+
> [!NOTE]
|
157 |
+
> If the input file is a video [SORT](https://github.com/abewley/sort) is used to track people IDs and output consistent identifications.
|
158 |
+
|
### OUTPUT json format
The output format of the json files:

```
{
    "keypoints":
    [  # The list of frames, len(json['keypoints']) == len(video)
        {  # For each frame a dict
            "0": [  # keys are the ids used to track people, values are the keypoints
                [121.19, 458.15, 0.99],  # Each keypoint is (y, x, score)
                [110.02, 469.43, 0.98],
                [110.86, 445.04, 0.99],
            ],
            "1": [
                ...
            ],
        },
        {
            "0": [
                [122.19, 458.15, 0.91],
                [105.02, 469.43, 0.95],
                [122.86, 445.04, 0.99],
            ],
            "1": [
                ...
            ]
        }
    ],
    "skeleton":
    {  # Skeleton reference, key the idx, value the name
        "0": "nose",
        "1": "left_eye",
        "2": "right_eye",
        "3": "left_ear",
        "4": "right_ear",
        "5": "neck",
        ...
    }
}
```

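A short sketch of how the saved json can be consumed (the output file name is illustrative):

```python
import json

with open('./results/video.json') as f:  # illustrative path of a saved result
    data = json.load(f)

skeleton = data['skeleton']  # e.g. {"0": "nose", "1": "left_eye", ...}
for frame_idx, frame in enumerate(data['keypoints']):
    for person_id, kpts in frame.items():
        for joint_idx, (y, x, score) in enumerate(kpts):
            joint_name = skeleton[str(joint_idx)]
            if score > 0.5:  # filter low-confidence points before using them
                print(frame_idx, person_id, joint_name, x, y, score)
```
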
## Finetuning
Finetuning is possible but not officially supported right now. If you would like to finetune and need help, open an issue.
You can check `train.py`, `datasets/COCO.py` and `config.yaml` for details; a sketch of how a run would be launched follows.

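The entry point `train.py` is driven by `config.yaml`; assuming the default flags it exposes, a finetuning run would be launched roughly like this (not officially supported, so expect to adapt it):

```bash
python train.py --config-path config.yaml --model-name s
```
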
---

## Evaluation on COCO dataset
1. Download COCO dataset images and labels
   - 2017 Val images [5K/1GB]: http://images.cocodataset.org/zips/val2017.zip <br>
     The extracted directory looks like this:
     ```
     val2017/
     ├── 000000000139.jpg
     ├── 000000000285.jpg
     ├── 000000000632.jpg
     └── ...
     ```
   - 2017 Train/Val annotations [241MB]: http://images.cocodataset.org/annotations/annotations_trainval2017.zip <br>
     The extracted directory looks like this:
     ```
     annotations/
     ├── person_keypoints_val2017.json
     ├── person_keypoints_train2017.json
     └── ...
     ```

2. Run the following command:

```bash
$ python evaluation_on_coco.py

Command line arguments:
--model_path: Path to the pretrained ViTPose model
--yolo_path: Path to the YOLOv8 model
--img_folder_path: Path to the directory containing COCO val images (/val2017 extracted in step 1).
--annFile: Path to the json file with COCO keypoints for the val set (annotations/person_keypoints_val2017.json extracted in step 1)
```

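For example, with the files from step 1 extracted in the current directory (paths are illustrative):

```bash
python evaluation_on_coco.py \
    --model_path ./ckpts/vitpose-s-coco_25.pth \
    --yolo_path ./yolov8s.pt \
    --img_folder_path ./val2017 \
    --annFile ./annotations/person_keypoints_val2017.json
```
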
---

## Docker
The system may be built as a container using Docker. This is intended to demonstrate containerized inference; adapt it to your own needs by changing models and skeletons:

`docker build . -t easy_vitpose`

The image is based on NVIDIA's PyTorch image, which is about 20 GB in size.
If you have a compatible GPU set up with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html),
ViTPose will run with hardware acceleration.

To test an example, create a folder called `cats` with a picture of a cat as `image.jpg`.
Run `./models/download.sh` to fetch the large yolov8 and ap10k ViTPose models. Then run inference using the following command (replace with the correct `cats` and `models` paths):

`docker run --gpus all --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v ./models:/models -v ~/cats:/cats easy_vitpose python inference.py --det-class cat --input /cats/image.jpg --output-path /cats --save-img --model /models/vitpose-l-ap10k.onnx --yolo /models/yolov8l.pt`

The result image may be viewed in your `cats` folder.

## TODO:
- refactor finetuning (currently not available)
- benchmark and check bottlenecks of the inference pipeline
- parallel batched inference
- other minor fixes
- yolo version for animal pose, check https://github.com/JunkyByte/easy_ViTPose/pull/18
- solve cuda exceptions on script exit when using tensorrt (no idea how)
- add info about inferred settings during inference, better output of inference status (device etc.)
- check if it is possible to make colab work without a runtime restart

Feel free to open issues, pull requests and contribute to these TODOs.

## Reference
Thanks to the ViTPose authors and their official implementation [ViTAE-Transformer/ViTPose](https://github.com/ViTAE-Transformer/ViTPose).
The SORT code is taken from [abewley/sort](https://github.com/abewley/sort).
ViTPose/easy_ViTPose/colab_demo.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
ViTPose/easy_ViTPose/easy_ViTPose.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,7 @@
Metadata-Version: 2.1
Name: easy_ViTPose
Version: 1.1
Home-page: https://github.com/JunkyByte/easy_ViTPose
Author: JunkyByte
Author-email: [email protected]
License-File: LICENSE
ViTPose/easy_ViTPose/easy_ViTPose.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,56 @@
LICENSE
README.md
setup.py
easy_ViTPose/__init__.py
easy_ViTPose/inference.py
easy_ViTPose/sort.py
easy_ViTPose/train.py
easy_ViTPose.egg-info/PKG-INFO
easy_ViTPose.egg-info/SOURCES.txt
easy_ViTPose.egg-info/dependency_links.txt
easy_ViTPose.egg-info/top_level.txt
easy_ViTPose/configs/ViTPose_aic.py
easy_ViTPose/configs/ViTPose_ap10k.py
easy_ViTPose/configs/ViTPose_apt36k.py
easy_ViTPose/configs/ViTPose_coco.py
easy_ViTPose/configs/ViTPose_coco_25.py
easy_ViTPose/configs/ViTPose_common.py
easy_ViTPose/configs/ViTPose_mpii.py
easy_ViTPose/configs/ViTPose_wholebody.py
easy_ViTPose/configs/__init__.py
easy_ViTPose/datasets/COCO.py
easy_ViTPose/datasets/HumanPoseEstimation.py
easy_ViTPose/datasets/__init__.py
easy_ViTPose/vit_models/__init__.py
easy_ViTPose/vit_models/model.py
easy_ViTPose/vit_models/optimizer.py
easy_ViTPose/vit_models/backbone/__init__.py
easy_ViTPose/vit_models/backbone/vit.py
easy_ViTPose/vit_models/head/__init__.py
easy_ViTPose/vit_models/head/topdown_heatmap_base_head.py
easy_ViTPose/vit_models/head/topdown_heatmap_simple_head.py
easy_ViTPose/vit_models/losses/__init__.py
easy_ViTPose/vit_models/losses/classfication_loss.py
easy_ViTPose/vit_models/losses/heatmap_loss.py
easy_ViTPose/vit_models/losses/mesh_loss.py
easy_ViTPose/vit_models/losses/mse_loss.py
easy_ViTPose/vit_models/losses/multi_loss_factory.py
easy_ViTPose/vit_models/losses/regression_loss.py
easy_ViTPose/vit_utils/__init__.py
easy_ViTPose/vit_utils/dist_util.py
easy_ViTPose/vit_utils/inference.py
easy_ViTPose/vit_utils/logging.py
easy_ViTPose/vit_utils/top_down_eval.py
easy_ViTPose/vit_utils/train_valid_fn.py
easy_ViTPose/vit_utils/transform.py
easy_ViTPose/vit_utils/util.py
easy_ViTPose/vit_utils/visualization.py
easy_ViTPose/vit_utils/nms/__init__.py
easy_ViTPose/vit_utils/nms/nms.py
easy_ViTPose/vit_utils/nms/nms_ori.py
easy_ViTPose/vit_utils/nms/setup_linux.py
easy_ViTPose/vit_utils/post_processing/__init__.py
easy_ViTPose/vit_utils/post_processing/group.py
easy_ViTPose/vit_utils/post_processing/nms.py
easy_ViTPose/vit_utils/post_processing/one_euro_filter.py
easy_ViTPose/vit_utils/post_processing/post_transforms.py
ViTPose/easy_ViTPose/easy_ViTPose.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
ViTPose/easy_ViTPose/easy_ViTPose.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
easy_ViTPose
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/ViTPose_Inference-checkpoint.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/__init__-checkpoint.py
ADDED
@@ -0,0 +1,5 @@
from .inference import VitInference

__all__ = [
    'VitInference'
]
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/config-checkpoint.yaml
ADDED
@@ -0,0 +1,15 @@
# Train config ---------------------------------------
log_level: logging.INFO
seed: 0
gpu_ids: 0
deterministic: True
cudnn_benchmark: True  # Use cudnn
resume_from: "C:/Users/user/ViTPose/ckpts/vitpose-s-coco_25.pth"  # CKPT path
#resume_from: False
gpu_ids: [0]
launcher: 'none'  # When distributed training ['none', 'pytorch', 'slurm', 'mpi']
use_amp: False
validate: True
autoscale_lr: False
dist_params:
  ...
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/inference-checkpoint.py
ADDED
@@ -0,0 +1,337 @@
1 |
+
import abc
|
2 |
+
import os
|
3 |
+
from typing import Optional
|
4 |
+
import typing
|
5 |
+
|
6 |
+
import cv2
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from ultralytics import YOLO
|
11 |
+
|
12 |
+
from .configs.ViTPose_common import data_cfg
|
13 |
+
from .sort import Sort
|
14 |
+
from .vit_models.model import ViTPose
|
15 |
+
from .vit_utils.inference import draw_bboxes, pad_image
|
16 |
+
from .vit_utils.top_down_eval import keypoints_from_heatmaps
|
17 |
+
from .vit_utils.util import dyn_model_import, infer_dataset_by_path
|
18 |
+
from .vit_utils.visualization import draw_points_and_skeleton, joints_dict
|
19 |
+
|
20 |
+
try:
|
21 |
+
import torch_tensorrt
|
22 |
+
except ModuleNotFoundError:
|
23 |
+
pass
|
24 |
+
|
25 |
+
try:
|
26 |
+
import onnxruntime
|
27 |
+
except ModuleNotFoundError:
|
28 |
+
pass
|
29 |
+
|
30 |
+
__all__ = ['VitInference']
|
31 |
+
np.bool = np.bool_
|
32 |
+
MEAN = [0.485, 0.456, 0.406]
|
33 |
+
STD = [0.229, 0.224, 0.225]
|
34 |
+
|
35 |
+
|
36 |
+
DETC_TO_YOLO_YOLOC = {
|
37 |
+
'human': [0],
|
38 |
+
'cat': [15],
|
39 |
+
'dog': [16],
|
40 |
+
'horse': [17],
|
41 |
+
'sheep': [18],
|
42 |
+
'cow': [19],
|
43 |
+
'elephant': [20],
|
44 |
+
'bear': [21],
|
45 |
+
'zebra': [22],
|
46 |
+
'giraffe': [23],
|
47 |
+
'animals': [15, 16, 17, 18, 19, 20, 21, 22, 23]
|
48 |
+
}
|
49 |
+
|
50 |
+
|
51 |
+
class VitInference:
|
52 |
+
"""
|
53 |
+
Class for performing inference using ViTPose models with YOLOv8 human detection and SORT tracking.
|
54 |
+
|
55 |
+
Args:
|
56 |
+
model (str): Path to the ViT model file (.pth, .onnx, .engine).
|
57 |
+
yolo (str): Path of the YOLOv8 model to load.
|
58 |
+
model_name (str, optional): Name of the ViT model architecture to use.
|
59 |
+
Valid values are 's', 'b', 'l', 'h'.
|
60 |
+
Defaults to None, is necessary when using .pth checkpoints.
|
61 |
+
det_class (str, optional): the detection class. if None it is inferred by the dataset.
|
62 |
+
valid values are 'human', 'cat', 'dog', 'horse', 'sheep',
|
63 |
+
'cow', 'elephant', 'bear', 'zebra', 'giraffe',
|
64 |
+
'animals' (which is all previous but human)
|
65 |
+
dataset (str, optional): Name of the dataset. If None it's extracted from the file name.
|
66 |
+
Valid values are 'coco', 'coco_25', 'wholebody', 'mpii',
|
67 |
+
'ap10k', 'apt36k', 'aic'
|
68 |
+
yolo_size (int, optional): Size of the input image for YOLOv8 model. Defaults to 320.
|
69 |
+
device (str, optional): Device to use for inference. Defaults to 'cuda' if available, else 'cpu'.
|
70 |
+
is_video (bool, optional): Flag indicating if the input is video. Defaults to False.
|
71 |
+
single_pose (bool, optional): Flag indicating if the video (on images this flag has no effect)
|
72 |
+
will contain a single pose.
|
73 |
+
In this case the SORT tracker is not used (increasing performance)
|
74 |
+
but people id tracking
|
75 |
+
won't be consistent among frames.
|
76 |
+
yolo_step (int, optional): The tracker can be used to predict the bboxes instead of yolo for performance,
|
77 |
+
this flag specifies how often yolo is applied (e.g. 1 applies yolo every frame).
|
78 |
+
This does not have any effect when is_video is False.
|
79 |
+
"""
|
80 |
+
|
81 |
+
def __init__(self, model: str,
|
82 |
+
yolo: str,
|
83 |
+
model_name: Optional[str] = None,
|
84 |
+
det_class: Optional[str] = None,
|
85 |
+
dataset: Optional[str] = None,
|
86 |
+
yolo_size: Optional[int] = 320,
|
87 |
+
device: Optional[str] = None,
|
88 |
+
is_video: Optional[bool] = False,
|
89 |
+
single_pose: Optional[bool] = False,
|
90 |
+
yolo_step: Optional[int] = 1):
|
91 |
+
assert os.path.isfile(model), f'The model file {model} does not exist'
|
92 |
+
assert os.path.isfile(yolo), f'The YOLOv8 model {yolo} does not exist'
|
93 |
+
|
94 |
+
# Device priority is cuda / mps / cpu
|
95 |
+
if device is None:
|
96 |
+
if torch.cuda.is_available():
|
97 |
+
device = 'cuda'
|
98 |
+
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
99 |
+
device = 'mps'
|
100 |
+
else:
|
101 |
+
device = 'cpu'
|
102 |
+
|
103 |
+
self.device = device
|
104 |
+
self.yolo = YOLO(yolo, task='detect')
|
105 |
+
self.yolo_size = yolo_size
|
106 |
+
self.yolo_step = yolo_step
|
107 |
+
self.is_video = is_video
|
108 |
+
self.single_pose = single_pose
|
109 |
+
self.reset()
|
110 |
+
|
111 |
+
# State saving during inference
|
112 |
+
self.save_state = True # Can be disabled manually
|
113 |
+
self._img = None
|
114 |
+
self._yolo_res = None
|
115 |
+
self._tracker_res = None
|
116 |
+
self._keypoints = None
|
117 |
+
|
118 |
+
# Use extension to decide which kind of model has been loaded
|
119 |
+
use_onnx = model.endswith('.onnx')
|
120 |
+
use_trt = model.endswith('.engine')
|
121 |
+
|
122 |
+
|
123 |
+
# Extract dataset name
|
124 |
+
if dataset is None:
|
125 |
+
dataset = infer_dataset_by_path(model)
|
126 |
+
|
127 |
+
assert dataset in ['mpii', 'coco', 'coco_25', 'wholebody', 'aic', 'ap10k', 'apt36k'], \
|
128 |
+
'The specified dataset is not valid'
|
129 |
+
|
130 |
+
# Dataset can now be set for visualization
|
131 |
+
self.dataset = dataset
|
132 |
+
|
133 |
+
# if we picked the dataset switch to correct yolo classes if not set
|
134 |
+
if det_class is None:
|
135 |
+
det_class = 'animals' if dataset in ['ap10k', 'apt36k'] else 'human'
|
136 |
+
self.yolo_classes = DETC_TO_YOLO_YOLOC[det_class]
|
137 |
+
|
138 |
+
assert model_name in [None, 's', 'b', 'l', 'h'], \
|
139 |
+
f'The model name {model_name} is not valid'
|
140 |
+
|
141 |
+
# onnx / trt models do not require model_cfg specification
|
142 |
+
if model_name is None:
|
143 |
+
assert use_onnx or use_trt, \
|
144 |
+
'Specify the model_name if not using onnx / trt'
|
145 |
+
else:
|
146 |
+
# Dynamically import the model class
|
147 |
+
model_cfg = dyn_model_import(self.dataset, model_name)
|
148 |
+
|
149 |
+
self.target_size = data_cfg['image_size']
|
150 |
+
if use_onnx:
|
151 |
+
self._ort_session = onnxruntime.InferenceSession(model,
|
152 |
+
providers=['CUDAExecutionProvider',
|
153 |
+
'CPUExecutionProvider'])
|
154 |
+
inf_fn = self._inference_onnx
|
155 |
+
else:
|
156 |
+
self._vit_pose = ViTPose(model_cfg)
|
157 |
+
self._vit_pose.eval()
|
158 |
+
|
159 |
+
if use_trt:
|
160 |
+
self._vit_pose = torch.jit.load(model)
|
161 |
+
else:
|
162 |
+
ckpt = torch.load(model, map_location='cpu', weights_only=True)
|
163 |
+
if 'state_dict' in ckpt:
|
164 |
+
self._vit_pose.load_state_dict(ckpt['state_dict'])
|
165 |
+
else:
|
166 |
+
self._vit_pose.load_state_dict(ckpt)
|
167 |
+
self._vit_pose.to(torch.device(device))
|
168 |
+
|
169 |
+
inf_fn = self._inference_torch
|
170 |
+
|
171 |
+
# Override _inference abstract with selected engine
|
172 |
+
self._inference = inf_fn # type: ignore
|
173 |
+
|
174 |
+
def reset(self):
|
175 |
+
"""
|
176 |
+
Reset the inference class to be ready for a new video.
|
177 |
+
This will reset the internal counter of frames, on videos
|
178 |
+
this is necessary to reset the tracker.
|
179 |
+
"""
|
180 |
+
min_hits = 3 if self.yolo_step == 1 else 1
|
181 |
+
use_tracker = self.is_video and not self.single_pose
|
182 |
+
self.tracker = Sort(max_age=self.yolo_step,
|
183 |
+
min_hits=min_hits,
|
184 |
+
iou_threshold=0.3) if use_tracker else None # TODO: Params
|
185 |
+
self.frame_counter = 0
|
186 |
+
|
187 |
+
@classmethod
|
188 |
+
def postprocess(cls, heatmaps, org_w, org_h):
|
189 |
+
"""
|
190 |
+
Postprocess the heatmaps to obtain keypoints and their probabilities.
|
191 |
+
|
192 |
+
Args:
|
193 |
+
heatmaps (ndarray): Heatmap predictions from the model.
|
194 |
+
org_w (int): Original width of the image.
|
195 |
+
org_h (int): Original height of the image.
|
196 |
+
|
197 |
+
Returns:
|
198 |
+
ndarray: Processed keypoints with probabilities.
|
199 |
+
"""
|
200 |
+
points, prob = keypoints_from_heatmaps(heatmaps=heatmaps,
|
201 |
+
center=np.array([[org_w // 2,
|
202 |
+
org_h // 2]]),
|
203 |
+
scale=np.array([[org_w, org_h]]),
|
204 |
+
unbiased=True, use_udp=True)
|
205 |
+
return np.concatenate([points[:, :, ::-1], prob], axis=2)
|
206 |
+
|
207 |
+
@abc.abstractmethod
|
208 |
+
def _inference(self, img: np.ndarray) -> np.ndarray:
|
209 |
+
"""
|
210 |
+
Abstract method for performing inference on an image.
|
211 |
+
It is overloaded by each inference engine.
|
212 |
+
|
213 |
+
Args:
|
214 |
+
img (ndarray): Input image for inference.
|
215 |
+
|
216 |
+
Returns:
|
217 |
+
ndarray: Inference results.
|
218 |
+
"""
|
219 |
+
raise NotImplementedError
|
220 |
+
|
221 |
+
def inference(self, img: np.ndarray) -> dict[typing.Any, typing.Any]:
|
222 |
+
"""
|
223 |
+
Perform inference on the input image.
|
224 |
+
|
225 |
+
Args:
|
226 |
+
img (ndarray): Input image for inference in RGB format.
|
227 |
+
|
228 |
+
Returns:
|
229 |
+
dict[typing.Any, typing.Any]: Inference results.
|
230 |
+
"""
|
231 |
+
|
232 |
+
# First use YOLOv8 for detection
|
233 |
+
res_pd = np.empty((0, 5))
|
234 |
+
results = None
|
235 |
+
if (self.tracker is None or
|
236 |
+
(self.frame_counter % self.yolo_step == 0 or self.frame_counter < 3)):
|
237 |
+
results = self.yolo(img[..., ::-1], verbose=False, imgsz=self.yolo_size,
|
238 |
+
device=self.device if self.device != 'cuda' else 0,
|
239 |
+
classes=self.yolo_classes)[0]
|
240 |
+
res_pd = np.array([r[:5].tolist() for r in # TODO: Confidence threshold
|
241 |
+
results.boxes.data.cpu().numpy() if r[4] > 0.35]).reshape((-1, 5))
|
242 |
+
self.frame_counter += 1
|
243 |
+
|
244 |
+
frame_keypoints = {}
|
245 |
+
scores_bbox = {}
|
246 |
+
ids = None
|
247 |
+
if self.tracker is not None:
|
248 |
+
res_pd = self.tracker.update(res_pd)
|
249 |
+
ids = res_pd[:, 5].astype(int).tolist()
|
250 |
+
|
251 |
+
# Prepare boxes for inference
|
252 |
+
bboxes = res_pd[:, :4].round().astype(int)
|
253 |
+
scores = res_pd[:, 4].tolist()
|
254 |
+
pad_bbox = 10
|
255 |
+
|
256 |
+
if ids is None:
|
257 |
+
ids = range(len(bboxes))
|
258 |
+
|
259 |
+
for bbox, id, score in zip(bboxes, ids, scores):
|
260 |
+
# TODO: Slightly bigger bbox
|
261 |
+
bbox[[0, 2]] = np.clip(bbox[[0, 2]] + [-pad_bbox, pad_bbox], 0, img.shape[1])
|
262 |
+
bbox[[1, 3]] = np.clip(bbox[[1, 3]] + [-pad_bbox, pad_bbox], 0, img.shape[0])
|
263 |
+
|
264 |
+
# Crop image and pad to 3/4 aspect ratio
|
265 |
+
img_inf = img[bbox[1]:bbox[3], bbox[0]:bbox[2]]
|
266 |
+
img_inf, (left_pad, top_pad) = pad_image(img_inf, 3 / 4)
|
267 |
+
|
268 |
+
keypoints = self._inference(img_inf)[0]
|
269 |
+
# Transform keypoints to original image
|
270 |
+
keypoints[:, :2] += bbox[:2][::-1] - [top_pad, left_pad]
|
271 |
+
frame_keypoints[id] = keypoints
|
272 |
+
scores_bbox[id] = score # Replace this with avg_keypoint_conf*person_obj_conf. For now, only person_obj_conf from yolo is being used.
|
273 |
+
|
274 |
+
if self.save_state:
|
275 |
+
self._img = img
|
276 |
+
self._yolo_res = results
|
277 |
+
self._tracker_res = (bboxes, ids, scores)
|
278 |
+
self._keypoints = frame_keypoints
|
279 |
+
self._scores_bbox = scores_bbox
|
280 |
+
|
281 |
+
return frame_keypoints
|
282 |
+
|
283 |
+
def draw(self, show_yolo=True, show_raw_yolo=False, confidence_threshold=0.5):
|
284 |
+
"""
|
285 |
+
Draw keypoints and bounding boxes on the image.
|
286 |
+
|
287 |
+
Args:
|
288 |
+
show_yolo (bool, optional): Whether to show YOLOv8 bounding boxes. Default is True.
|
289 |
+
show_raw_yolo (bool, optional): Whether to show raw YOLOv8 bounding boxes. Default is False.
|
290 |
+
|
291 |
+
Returns:
|
292 |
+
ndarray: Image with keypoints and bounding boxes drawn.
|
293 |
+
"""
|
294 |
+
img = self._img.copy()
|
295 |
+
bboxes, ids, scores = self._tracker_res
|
296 |
+
|
297 |
+
if self._yolo_res is not None and (show_raw_yolo or (self.tracker is None and show_yolo)):
|
298 |
+
img = np.array(self._yolo_res.plot())[..., ::-1]
|
299 |
+
|
300 |
+
if show_yolo and self.tracker is not None:
|
301 |
+
img = draw_bboxes(img, bboxes, ids, scores)
|
302 |
+
|
303 |
+
img = np.array(img)[..., ::-1] # RGB to BGR for cv2 modules
|
304 |
+
for idx, k in self._keypoints.items():
|
305 |
+
img = draw_points_and_skeleton(img.copy(), k,
|
306 |
+
joints_dict()[self.dataset]['skeleton'],
|
307 |
+
person_index=idx,
|
308 |
+
points_color_palette='gist_rainbow',
|
309 |
+
skeleton_color_palette='jet',
|
310 |
+
points_palette_samples=10,
|
311 |
+
confidence_threshold=confidence_threshold)
|
312 |
+
return img[..., ::-1] # Return RGB as original
|
313 |
+
|
314 |
+
def pre_img(self, img):
|
315 |
+
org_h, org_w = img.shape[:2]
|
316 |
+
img_input = cv2.resize(img, self.target_size, interpolation=cv2.INTER_LINEAR) / 255
|
317 |
+
img_input = ((img_input - MEAN) / STD).transpose(2, 0, 1)[None].astype(np.float32)
|
318 |
+
return img_input, org_h, org_w
|
319 |
+
|
320 |
+
@torch.no_grad()
|
321 |
+
def _inference_torch(self, img: np.ndarray) -> np.ndarray:
|
322 |
+
# Prepare input data
|
323 |
+
img_input, org_h, org_w = self.pre_img(img)
|
324 |
+
img_input = torch.from_numpy(img_input).to(torch.device(self.device))
|
325 |
+
|
326 |
+
# Feed to model
|
327 |
+
heatmaps = self._vit_pose(img_input).detach().cpu().numpy()
|
328 |
+
return self.postprocess(heatmaps, org_w, org_h)
|
329 |
+
|
330 |
+
def _inference_onnx(self, img: np.ndarray) -> np.ndarray:
|
331 |
+
# Prepare input data
|
332 |
+
img_input, org_h, org_w = self.pre_img(img)
|
333 |
+
|
334 |
+
# Feed to model
|
335 |
+
ort_inputs = {self._ort_session.get_inputs()[0].name: img_input}
|
336 |
+
heatmaps = self._ort_session.run(None, ort_inputs)[0]
|
337 |
+
return self.postprocess(heatmaps, org_w, org_h)
|
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/testVITPOSE-checkpoint.jpg
ADDED
ViTPose/easy_ViTPose/easy_ViTPose/.ipynb_checkpoints/train-checkpoint.py
ADDED
@@ -0,0 +1,174 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
import argparse
|
3 |
+
import copy
|
4 |
+
import os
|
5 |
+
import os.path as osp
|
6 |
+
import time
|
7 |
+
import warnings
|
8 |
+
import click
|
9 |
+
import yaml
|
10 |
+
|
11 |
+
from glob import glob
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torch.distributed as dist
|
15 |
+
|
16 |
+
from vit_utils.util import init_random_seed, set_random_seed
|
17 |
+
from vit_utils.dist_util import get_dist_info, init_dist
|
18 |
+
from vit_utils.logging import get_root_logger
|
19 |
+
|
20 |
+
import configs.ViTPose_small_coco_256x192 as s_cfg
|
21 |
+
import configs.ViTPose_base_coco_256x192 as b_cfg
|
22 |
+
import configs.ViTPose_large_coco_256x192 as l_cfg
|
23 |
+
import configs.ViTPose_huge_coco_256x192 as h_cfg
|
24 |
+
|
25 |
+
from vit_models.model import ViTPose
|
26 |
+
from datasets.COCO import COCODataset
|
27 |
+
from vit_utils.train_valid_fn import train_model
|
28 |
+
|
29 |
+
CUR_PATH = osp.dirname(__file__)
|
30 |
+
|
31 |
+
@click.command()
|
32 |
+
@click.option('--config-path', type=click.Path(exists=True), default='config.yaml', required=True, help='train config file path')
|
33 |
+
@click.option('--model-name', type=str, default='b', required=True, help='[b: ViT-B, l: ViT-L, h: ViT-H]')
|
34 |
+
def main(config_path, model_name):
|
35 |
+
|
36 |
+
cfg = {'b':b_cfg,
|
37 |
+
's':s_cfg,
|
38 |
+
'l':l_cfg,
|
39 |
+
'h':h_cfg}.get(model_name.lower())
|
40 |
+
# Load config.yaml
|
41 |
+
with open(config_path, 'r') as f:
|
42 |
+
cfg_yaml = yaml.load(f, Loader=yaml.SafeLoader)
|
43 |
+
|
44 |
+
for k, v in cfg_yaml.items():
|
45 |
+
if hasattr(cfg, k):
|
46 |
+
raise ValueError(f"Already exists {k} in config")
|
47 |
+
else:
|
48 |
+
cfg.__setattr__(k, v)
|
49 |
+
|
50 |
+
# set cudnn_benchmark
|
51 |
+
if cfg.cudnn_benchmark:
|
52 |
+
torch.backends.cudnn.benchmark = True
|
53 |
+
|
54 |
+
# Set work directory (session-level)
|
55 |
+
if not hasattr(cfg, 'work_dir'):
|
56 |
+
cfg.__setattr__('work_dir', f"{CUR_PATH}/runs/train")
|
57 |
+
|
58 |
+
if not osp.exists(cfg.work_dir):
|
59 |
+
os.makedirs(cfg.work_dir)
|
60 |
+
session_list = sorted(glob(f"{cfg.work_dir}/*"))
|
61 |
+
if len(session_list) == 0:
|
62 |
+
session = 1
|
63 |
+
else:
|
64 |
+
session = int(os.path.basename(session_list[-1])) + 1
|
65 |
+
session_dir = osp.join(cfg.work_dir, str(session).zfill(3))
|
66 |
+
os.makedirs(session_dir)
|
67 |
+
cfg.__setattr__('work_dir', session_dir)
|
68 |
+
|
69 |
+
|
70 |
+
if cfg.autoscale_lr:
|
71 |
+
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
|
72 |
+
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
|
73 |
+
|
74 |
+
# init distributed env first, since logger depends on the dist info.
|
75 |
+
if cfg.launcher == 'none':
|
76 |
+
distributed = False
|
77 |
+
if len(cfg.gpu_ids) > 1:
|
78 |
+
warnings.warn(
|
79 |
+
f"We treat {cfg['gpu_ids']} as gpu-ids, and reset to "
|
80 |
+
f"{cfg['gpu_ids'][0:1]} as gpu-ids to avoid potential error in "
|
81 |
+
"non-distribute training time.")
|
82 |
+
cfg.gpu_ids = cfg.gpu_ids[0:1]
|
83 |
+
else:
|
84 |
+
distributed = True
|
85 |
+
init_dist(cfg.launcher, **cfg.dist_params)
|
86 |
+
# re-set gpu_ids with distributed training mode
|
87 |
+
_, world_size = get_dist_info()
|
88 |
+
cfg.gpu_ids = range(world_size)
|
89 |
+
|
90 |
+
# init the logger before other steps
|
91 |
+
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
|
92 |
+
log_file = osp.join(session_dir, f'{timestamp}.log')
|
93 |
+
logger = get_root_logger(log_file=log_file)
|
94 |
+
|
95 |
+
# init the meta dict to record some important information such as
|
96 |
+
# environment info and seed, which will be logged
|
97 |
+
meta = dict()
|
98 |
+
|
99 |
+
# log some basic info
|
100 |
+
logger.info(f'Distributed training: {distributed}')
|
101 |
+
|
102 |
+
# set random seeds
|
103 |
+
seed = init_random_seed(cfg.seed)
|
104 |
+
logger.info(f"Set random seed to {seed}, "
|
105 |
+
f"deterministic: {cfg.deterministic}")
|
106 |
+
set_random_seed(seed, deterministic=cfg.deterministic)
|
107 |
+
meta['seed'] = seed
|
108 |
+
|
109 |
+
# Set model
|
110 |
+
model = ViTPose(cfg.model)
|
111 |
+
if cfg.resume_from:
|
112 |
+
# Load ckpt partially
|
113 |
+
ckpt_state = torch.load(cfg.resume_from)['state_dict']
|
114 |
+
ckpt_state.pop('keypoint_head.final_layer.bias')
|
115 |
+
ckpt_state.pop('keypoint_head.final_layer.weight')
|
116 |
+
model.load_state_dict(ckpt_state, strict=False)
|
117 |
+
|
118 |
+
# freeze the backbone, leave the head to be finetuned
|
119 |
+
model.backbone.frozen_stages = model.backbone.depth - 1
|
120 |
+
model.backbone.freeze_ffn = True
|
121 |
+
model.backbone.freeze_attn = True
|
122 |
+
model.backbone._freeze_stages()
|
123 |
+
|
124 |
+
# Set dataset
|
125 |
+
datasets_train = COCODataset(
|
126 |
+
root_path=cfg.data_root,
|
127 |
+
data_version="feet_train",
|
128 |
+
is_train=True,
|
129 |
+
use_gt_bboxes=True,
|
130 |
+
image_width=192,
|
131 |
+
image_height=256,
|
132 |
+
scale=True,
|
133 |
+
scale_factor=0.35,
|
134 |
+
flip_prob=0.5,
|
135 |
+
rotate_prob=0.5,
|
136 |
+
rotation_factor=45.,
|
137 |
+
half_body_prob=0.3,
|
138 |
+
use_different_joints_weight=True,
|
139 |
+
heatmap_sigma=3,
|
140 |
+
soft_nms=False
|
141 |
+
)
|
142 |
+
|
143 |
+
datasets_valid = COCODataset(
|
144 |
+
root_path=cfg.data_root,
|
145 |
+
data_version="feet_val",
|
146 |
+
is_train=False,
|
147 |
+
use_gt_bboxes=True,
|
148 |
+
image_width=192,
|
149 |
+
image_height=256,
|
150 |
+
scale=False,
|
151 |
+
scale_factor=0.35,
|
152 |
+
flip_prob=0.5,
|
153 |
+
rotate_prob=0.5,
|
154 |
+
rotation_factor=45.,
|
155 |
+
half_body_prob=0.3,
|
156 |
+
use_different_joints_weight=True,
|
157 |
+
heatmap_sigma=3,
|
158 |
+
soft_nms=False
|
159 |
+
)
|
160 |
+
|
161 |
+
train_model(
|
162 |
+
model=model,
|
163 |
+
datasets_train=datasets_train,
|
164 |
+
datasets_valid=datasets_valid,
|
165 |
+
cfg=cfg,
|
166 |
+
distributed=distributed,
|
167 |
+
validate=cfg.validate,
|
168 |
+
timestamp=timestamp,
|
169 |
+
meta=meta
|
170 |
+
)
|
171 |
+
|
172 |
+
|
173 |
+
if __name__ == '__main__':
|
174 |
+
main()
|
ViTPose/easy_ViTPose/easy_ViTPose/ViTPose_Inference.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
ViTPose/easy_ViTPose/easy_ViTPose/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .inference import VitInference

__all__ = [
    'VitInference'
]
ViTPose/easy_ViTPose/easy_ViTPose/config.yaml
ADDED
@@ -0,0 +1,15 @@
# Train config ---------------------------------------
log_level: logging.INFO
seed: 0
gpu_ids: 0
deterministic: True
cudnn_benchmark: True  # Use cudnn
resume_from: "C:/Users/user/ViTPose/ckpts/vitpose-s-coco_25.pth"  # CKPT path
#resume_from: False
gpu_ids: [0]
launcher: 'none'  # When distributed training ['none', 'pytorch', 'slurm', 'mpi']
use_amp: False
validate: True
autoscale_lr: False
dist_params:
  ...
ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_common-checkpoint.py
ADDED
@@ -0,0 +1,195 @@
1 |
+
# Common configuration
|
2 |
+
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
|
3 |
+
constructor='LayerDecayOptimizerConstructor',
|
4 |
+
paramwise_cfg=dict(
|
5 |
+
num_layers=12,
|
6 |
+
layer_decay_rate=1 - 2e-4,
|
7 |
+
custom_keys={
|
8 |
+
'bias': dict(decay_multi=0.),
|
9 |
+
'pos_embed': dict(decay_mult=0.),
|
10 |
+
'relative_position_bias_table': dict(decay_mult=0.),
|
11 |
+
'norm': dict(decay_mult=0.)
|
12 |
+
}
|
13 |
+
)
|
14 |
+
)
|
15 |
+
|
16 |
+
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
|
17 |
+
|
18 |
+
# learning policy
|
19 |
+
lr_config = dict(
|
20 |
+
policy='step',
|
21 |
+
warmup='linear',
|
22 |
+
warmup_iters=300,
|
23 |
+
warmup_ratio=0.001,
|
24 |
+
step=[3])
|
25 |
+
|
26 |
+
total_epochs = 4
|
27 |
+
target_type = 'GaussianHeatmap'
|
28 |
+
|
29 |
+
data_cfg = dict(
|
30 |
+
image_size=[192, 256],
|
31 |
+
heatmap_size=[48, 64],
|
32 |
+
soft_nms=False,
|
33 |
+
nms_thr=1.0,
|
34 |
+
oks_thr=0.9,
|
35 |
+
vis_thr=0.2,
|
36 |
+
use_gt_bbox=False,
|
37 |
+
det_bbox_thr=0.0,
|
38 |
+
bbox_file='data/coco/person_detection_results/'
|
39 |
+
'COCO_val2017_detections_AP_H_56_person.json',
|
40 |
+
)
|
41 |
+
|
42 |
+
data_root = '/home/adryw/dataset/COCO17'
|
43 |
+
data = dict(
|
44 |
+
samples_per_gpu=64,
|
45 |
+
workers_per_gpu=6,
|
46 |
+
val_dataloader=dict(samples_per_gpu=128),
|
47 |
+
test_dataloader=dict(samples_per_gpu=128),
|
48 |
+
train=dict(
|
49 |
+
type='TopDownCocoDataset',
|
50 |
+
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
|
51 |
+
img_prefix=f'{data_root}/train2017/',
|
52 |
+
data_cfg=data_cfg),
|
53 |
+
val=dict(
|
54 |
+
type='TopDownCocoDataset',
|
55 |
+
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
|
56 |
+
img_prefix=f'{data_root}/val2017/',
|
57 |
+
data_cfg=data_cfg),
|
58 |
+
test=dict(
|
59 |
+
type='TopDownCocoDataset',
|
60 |
+
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
|
61 |
+
img_prefix=f'{data_root}/val2017/',
|
62 |
+
data_cfg=data_cfg)
|
63 |
+
)
|
64 |
+
|
65 |
+
model_small = dict(
|
66 |
+
type='TopDown',
|
67 |
+
pretrained=None,
|
68 |
+
backbone=dict(
|
69 |
+
type='ViT',
|
70 |
+
img_size=(256, 192),
|
71 |
+
patch_size=16,
|
72 |
+
embed_dim=384,
|
73 |
+
depth=12,
|
74 |
+
num_heads=12,
|
75 |
+
ratio=1,
|
76 |
+
use_checkpoint=False,
|
77 |
+
mlp_ratio=4,
|
78 |
+
qkv_bias=True,
|
79 |
+
drop_path_rate=0.1,
|
80 |
+
),
|
81 |
+
keypoint_head=dict(
|
82 |
+
type='TopdownHeatmapSimpleHead',
|
83 |
+
in_channels=384,
|
84 |
+
num_deconv_layers=2,
|
85 |
+
num_deconv_filters=(256, 256),
|
86 |
+
num_deconv_kernels=(4, 4),
|
87 |
+
extra=dict(final_conv_kernel=1, ),
|
88 |
+
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
|
89 |
+
train_cfg=dict(),
|
90 |
+
test_cfg=dict(
|
91 |
+
flip_test=True,
|
92 |
+
post_process='default',
|
93 |
+
shift_heatmap=False,
|
94 |
+
target_type=target_type,
|
95 |
+
modulate_kernel=11,
|
96 |
+
use_udp=True))
|
97 |
+
|
98 |
+
model_base = dict(
|
99 |
+
type='TopDown',
|
100 |
+
pretrained=None,
|
101 |
+
backbone=dict(
|
102 |
+
type='ViT',
|
103 |
+
img_size=(256, 192),
|
104 |
+
patch_size=16,
|
105 |
+
embed_dim=768,
|
106 |
+
depth=12,
|
107 |
+
num_heads=12,
|
108 |
+
ratio=1,
|
109 |
+
use_checkpoint=False,
|
110 |
+
mlp_ratio=4,
|
111 |
+
qkv_bias=True,
|
112 |
+
drop_path_rate=0.3,
|
113 |
+
),
|
114 |
+
keypoint_head=dict(
|
115 |
+
type='TopdownHeatmapSimpleHead',
|
116 |
+
in_channels=768,
|
117 |
+
num_deconv_layers=2,
|
118 |
+
num_deconv_filters=(256, 256),
|
119 |
+
num_deconv_kernels=(4, 4),
|
120 |
+
extra=dict(final_conv_kernel=1, ),
|
121 |
+
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
|
122 |
+
train_cfg=dict(),
|
123 |
+
test_cfg=dict(
|
124 |
+
flip_test=True,
|
125 |
+
post_process='default',
|
126 |
+
shift_heatmap=False,
|
127 |
+
target_type=target_type,
|
128 |
+
modulate_kernel=11,
|
129 |
+
use_udp=True))
|
130 |
+
|
131 |
+
model_large = dict(
|
132 |
+
type='TopDown',
|
133 |
+
pretrained=None,
|
134 |
+
backbone=dict(
|
135 |
+
type='ViT',
|
136 |
+
img_size=(256, 192),
|
137 |
+
patch_size=16,
|
138 |
+
embed_dim=1024,
|
139 |
+
depth=24,
|
140 |
+
num_heads=16,
|
141 |
+
ratio=1,
|
142 |
+
use_checkpoint=False,
|
143 |
+
mlp_ratio=4,
|
144 |
+
qkv_bias=True,
|
145 |
+
drop_path_rate=0.5,
|
146 |
+
),
|
147 |
+
keypoint_head=dict(
|
148 |
+
type='TopdownHeatmapSimpleHead',
|
149 |
+
in_channels=1024,
|
150 |
+
num_deconv_layers=2,
|
151 |
+
num_deconv_filters=(256, 256),
|
152 |
+
num_deconv_kernels=(4, 4),
|
153 |
+
extra=dict(final_conv_kernel=1, ),
|
154 |
+
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
|
155 |
+
train_cfg=dict(),
|
156 |
+
test_cfg=dict(
|
157 |
+
flip_test=True,
|
158 |
+
post_process='default',
|
159 |
+
shift_heatmap=False,
|
160 |
+
target_type=target_type,
|
161 |
+
modulate_kernel=11,
|
162 |
+
use_udp=True))
|
163 |
+
|
164 |
+
model_huge = dict(
|
165 |
+
type='TopDown',
|
166 |
+
pretrained=None,
|
167 |
+
backbone=dict(
|
168 |
+
type='ViT',
|
169 |
+
img_size=(256, 192),
|
170 |
+
patch_size=16,
|
171 |
+
embed_dim=1280,
|
172 |
+
depth=32,
|
173 |
+
num_heads=16,
|
174 |
+
ratio=1,
|
175 |
+
use_checkpoint=False,
|
176 |
+
mlp_ratio=4,
|
177 |
+
qkv_bias=True,
|
178 |
+
drop_path_rate=0.55,
|
179 |
+
),
|
180 |
+
keypoint_head=dict(
|
181 |
+
type='TopdownHeatmapSimpleHead',
|
182 |
+
in_channels=1280,
|
183 |
+
num_deconv_layers=2,
|
184 |
+
num_deconv_filters=(256, 256),
|
185 |
+
num_deconv_kernels=(4, 4),
|
186 |
+
extra=dict(final_conv_kernel=1, ),
|
187 |
+
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
|
188 |
+
train_cfg=dict(),
|
189 |
+
test_cfg=dict(
|
190 |
+
flip_test=True,
|
191 |
+
post_process='default',
|
192 |
+
shift_heatmap=False,
|
193 |
+
target_type=target_type,
|
194 |
+
modulate_kernel=11,
|
195 |
+
use_udp=True))
|
ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_small_coco_256x192-checkpoint.py
ADDED
@@ -0,0 +1,173 @@
1 |
+
_base_ = [
|
2 |
+
'../../../../_base_/default_runtime.py',
|
3 |
+
'../../../../_base_/datasets/coco.py'
|
4 |
+
]
|
5 |
+
evaluation = dict(interval=10, metric='mAP', save_best='AP')
|
6 |
+
|
7 |
+
optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
|
8 |
+
constructor='LayerDecayOptimizerConstructor',
|
9 |
+
paramwise_cfg=dict(
|
10 |
+
num_layers=12,
|
11 |
+
layer_decay_rate=0.8,
|
12 |
+
custom_keys={
|
13 |
+
'bias': dict(decay_multi=0.),
|
14 |
+
'pos_embed': dict(decay_mult=0.),
|
15 |
+
'relative_position_bias_table': dict(decay_mult=0.),
|
16 |
+
'norm': dict(decay_mult=0.)
|
17 |
+
}
|
18 |
+
)
|
19 |
+
)
|
20 |
+
|
21 |
+
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
|
22 |
+
|
23 |
+
# learning policy
|
24 |
+
lr_config = dict(
|
25 |
+
policy='step',
|
26 |
+
warmup='linear',
|
27 |
+
warmup_iters=500,
|
28 |
+
warmup_ratio=0.001,
|
29 |
+
step=[170, 200])
|
30 |
+
total_epochs = 210
|
31 |
+
target_type = 'GaussianHeatmap'
|
32 |
+
channel_cfg = dict(
|
33 |
+
num_output_channels=17,
|
34 |
+
dataset_joints=17,
|
35 |
+
dataset_channel=[
|
36 |
+
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
|
37 |
+
],
|
38 |
+
inference_channel=[
|
39 |
+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
40 |
+
])
|
41 |
+
|
42 |
+
# model settings
|
43 |
+
model = dict(
|
44 |
+
type='TopDown',
|
45 |
+
pretrained=None,
|
46 |
+
backbone=dict(
|
47 |
+
type='ViT',
|
48 |
+
img_size=(256, 192),
|
49 |
+
patch_size=16,
|
50 |
+
embed_dim=384,
|
51 |
+
depth=12,
|
52 |
+
num_heads=12,
|
53 |
+
ratio=1,
|
54 |
+
use_checkpoint=False,
|
55 |
+
mlp_ratio=4,
|
56 |
+
qkv_bias=True,
|
57 |
+
drop_path_rate=0.1,
|
58 |
+
),
|
59 |
+
keypoint_head=dict(
|
60 |
+
type='TopdownHeatmapSimpleHead',
|
61 |
+
in_channels=384,
|
62 |
+
num_deconv_layers=2,
|
63 |
+
num_deconv_filters=(256, 256),
|
64 |
+
num_deconv_kernels=(4, 4),
|
65 |
+
extra=dict(final_conv_kernel=1, ),
|
66 |
+
out_channels=channel_cfg['num_output_channels'],
|
67 |
+
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
|
68 |
+
train_cfg=dict(),
|
69 |
+
test_cfg=dict(
|
70 |
+
flip_test=True,
|
71 |
+
post_process='default',
|
72 |
+
shift_heatmap=False,
|
73 |
+
target_type=target_type,
|
74 |
+
modulate_kernel=11,
|
75 |
+
use_udp=True))
|
76 |
+
|
77 |
+
data_cfg = dict(
|
78 |
+
image_size=[192, 256],
|
79 |
+
heatmap_size=[48, 64],
|
80 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
81 |
+
num_joints=channel_cfg['dataset_joints'],
|
82 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
83 |
+
inference_channel=channel_cfg['inference_channel'],
|
84 |
+
soft_nms=False,
|
85 |
+
nms_thr=1.0,
|
86 |
+
oks_thr=0.9,
|
87 |
+
vis_thr=0.9,
|
88 |
+
use_gt_bbox=False,
|
89 |
+
det_bbox_thr=0.0,
|
90 |
+
bbox_file='data/coco/person_detection_results/'
|
91 |
+
'COCO_val2017_detections_AP_H_56_person.json',
|
92 |
+
)
|
93 |
+
|
94 |
+
train_pipeline = [
|
95 |
+
dict(type='LoadImageFromFile'),
|
96 |
+
dict(type='TopDownRandomFlip', flip_prob=0.5),
|
97 |
+
dict(
|
98 |
+
type='TopDownHalfBodyTransform',
|
99 |
+
num_joints_half_body=8,
|
100 |
+
prob_half_body=0.3),
|
101 |
+
dict(
|
102 |
+
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
|
103 |
+
dict(type='TopDownAffine', use_udp=True),
|
104 |
+
dict(type='ToTensor'),
|
105 |
+
dict(
|
106 |
+
type='NormalizeTensor',
|
107 |
+
mean=[0.485, 0.456, 0.406],
|
108 |
+
std=[0.229, 0.224, 0.225]),
|
109 |
+
dict(
|
110 |
+
type='TopDownGenerateTarget',
|
111 |
+
sigma=2,
|
112 |
+
encoding='UDP',
|
113 |
+
target_type=target_type),
|
114 |
+
dict(
|
115 |
+
type='Collect',
|
116 |
+
keys=['img', 'target', 'target_weight'],
|
117 |
+
meta_keys=[
|
118 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
119 |
+
'rotation', 'bbox_score', 'flip_pairs'
|
120 |
+
]),
|
121 |
+
]
|
122 |
+
|
123 |
+
val_pipeline = [
|
124 |
+
dict(type='LoadImageFromFile'),
|
125 |
+
dict(type='TopDownAffine', use_udp=True),
|
126 |
+
dict(type='ToTensor'),
|
127 |
+
dict(
|
128 |
+
type='NormalizeTensor',
|
129 |
+
mean=[0.485, 0.456, 0.406],
|
130 |
+
std=[0.229, 0.224, 0.225]),
|
131 |
+
dict(
|
132 |
+
type='Collect',
|
133 |
+
keys=['img'],
|
134 |
+
meta_keys=[
|
135 |
+
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
|
136 |
+
'flip_pairs'
|
137 |
+
]),
|
138 |
+
]
|
139 |
+
|
140 |
+
test_pipeline = val_pipeline
|
141 |
+
|
142 |
+
data_root = r'D:\ViTPose\Evaluating'
|
143 |
+
data = dict(
|
144 |
+
samples_per_gpu=4,
|
145 |
+
workers_per_gpu=4,
|
146 |
+
val_dataloader=dict(samples_per_gpu=4),
|
147 |
+
test_dataloader=dict(samples_per_gpu=4),
|
148 |
+
train=dict(
|
149 |
+
type='TopDownCocoDataset',
|
150 |
+
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
|
151 |
+
img_prefix=f'{data_root}/train2017/',
|
152 |
+
data_cfg=data_cfg,
|
153 |
+
pipeline=train_pipeline,
|
154 |
+
# dataset_info={{_base_.dataset_info}}
|
155 |
+
),
|
156 |
+
val=dict(
|
157 |
+
type='TopDownCocoDataset',
|
158 |
+
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
|
159 |
+
img_prefix=f'{data_root}/val2017/',
|
160 |
+
data_cfg=data_cfg,
|
161 |
+
pipeline=val_pipeline,
|
162 |
+
# dataset_info={{_base_.dataset_info}}
|
163 |
+
),
|
164 |
+
test=dict(
|
165 |
+
type='TopDownCocoDataset',
|
166 |
+
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
|
167 |
+
img_prefix=f'{data_root}/val2017/',
|
168 |
+
data_cfg=data_cfg,
|
169 |
+
pipeline=test_pipeline,
|
170 |
+
#dataset_info={{_base_.dataset_info}}
|
171 |
+
),
|
172 |
+
)
|
173 |
+
|
ViTPose/easy_ViTPose/easy_ViTPose/configs/.ipynb_checkpoints/ViTPose_wholebody-checkpoint.py
ADDED
@@ -0,0 +1,20 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[
        list(range(133)),
    ],
    inference_channel=list(range(133)))

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_aic.py
ADDED
@@ -0,0 +1,20 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_ap10k.py
ADDED
@@ -0,0 +1,22 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_apt36k.py
ADDED
@@ -0,0 +1,22 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_coco.py
ADDED
@@ -0,0 +1,18 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=list(range(17)),
    inference_channel=list(range(17)))

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_coco_25.py
ADDED
@@ -0,0 +1,20 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=25,
    dataset_joints=25,
    dataset_channel=[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                      16, 17, 18, 19, 20, 21, 22, 23, 24], ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                       16, 17, 18, 19, 20, 21, 22, 23, 24])

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_common.py
ADDED
@@ -0,0 +1,195 @@
1 |
+
# Common configuration
|
2 |
+
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
|
3 |
+
constructor='LayerDecayOptimizerConstructor',
|
4 |
+
paramwise_cfg=dict(
|
5 |
+
num_layers=12,
|
6 |
+
layer_decay_rate=1 - 2e-4,
|
7 |
+
custom_keys={
|
8 |
+
'bias': dict(decay_multi=0.),
|
9 |
+
'pos_embed': dict(decay_mult=0.),
|
10 |
+
'relative_position_bias_table': dict(decay_mult=0.),
|
11 |
+
'norm': dict(decay_mult=0.)
|
12 |
+
}
|
13 |
+
)
|
14 |
+
)
|
15 |
+
|
16 |
+
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
|
17 |
+
|
18 |
+
# learning policy
|
19 |
+
lr_config = dict(
|
20 |
+
policy='step',
|
21 |
+
warmup='linear',
|
22 |
+
warmup_iters=300,
|
23 |
+
warmup_ratio=0.001,
|
24 |
+
step=[3])
|
25 |
+
|
26 |
+
total_epochs = 4
|
27 |
+
target_type = 'GaussianHeatmap'
|
28 |
+
|
29 |
+
data_cfg = dict(
|
30 |
+
image_size=[192, 256],
|
31 |
+
heatmap_size=[48, 64],
|
32 |
+
soft_nms=False,
|
33 |
+
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
)

data_root = '/home/adryw/dataset/COCO17'
data = dict(
    samples_per_gpu=64,
    workers_per_gpu=6,
    val_dataloader=dict(samples_per_gpu=128),
    test_dataloader=dict(samples_per_gpu=128),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg)
)

model_small = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT', img_size=(256, 192), patch_size=16, embed_dim=384,
        depth=12, num_heads=12, ratio=1, use_checkpoint=False, mlp_ratio=4,
        qkv_bias=True, drop_path_rate=0.1),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=384, num_deconv_layers=2,
        num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True, post_process='default', shift_heatmap=False,
        target_type=target_type, modulate_kernel=11, use_udp=True))

model_base = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT', img_size=(256, 192), patch_size=16, embed_dim=768,
        depth=12, num_heads=12, ratio=1, use_checkpoint=False, mlp_ratio=4,
        qkv_bias=True, drop_path_rate=0.3),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=768, num_deconv_layers=2,
        num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True, post_process='default', shift_heatmap=False,
        target_type=target_type, modulate_kernel=11, use_udp=True))

model_large = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT', img_size=(256, 192), patch_size=16, embed_dim=1024,
        depth=24, num_heads=16, ratio=1, use_checkpoint=False, mlp_ratio=4,
        qkv_bias=True, drop_path_rate=0.5),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=1024, num_deconv_layers=2,
        num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True, post_process='default', shift_heatmap=False,
        target_type=target_type, modulate_kernel=11, use_udp=True))

model_huge = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT', img_size=(256, 192), patch_size=16, embed_dim=1280,
        depth=32, num_heads=16, ratio=1, use_checkpoint=False, mlp_ratio=4,
        qkv_bias=True, drop_path_rate=0.55),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=1280, num_deconv_layers=2,
        num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True, post_process='default', shift_heatmap=False,
        target_type=target_type, modulate_kernel=11, use_udp=True))

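The four model_* variants above share the same head and test-time settings and differ only in backbone width, depth, head count and drop-path rate. As a rough illustration (my own back-of-the-envelope estimate, not part of the uploaded files), the transformer size grows as about 12 * depth * embed_dim**2 when mlp_ratio=4:

# Rough sanity check (my approximation, not from the config): a ViT block with
# mlp_ratio=4 has about 12 * embed_dim**2 weights (4*d^2 for the attention
# projections + 8*d^2 for the MLP), so the backbone scales as 12 * depth * d^2.
variants = {
    'small': dict(embed_dim=384,  depth=12),
    'base':  dict(embed_dim=768,  depth=12),
    'large': dict(embed_dim=1024, depth=24),
    'huge':  dict(embed_dim=1280, depth=32),
}
for name, cfg in variants.items():
    approx_params = 12 * cfg['depth'] * cfg['embed_dim'] ** 2
    print(f"{name:>5}: ~{approx_params / 1e6:.0f}M backbone parameters")
# small: ~21M, base: ~85M, large: ~302M, huge: ~629M (ignoring the patch
# embedding, position embeddings, norms and the deconv head)
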
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_mpii.py
ADDED
@@ -0,0 +1,18 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']

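ViTPose_mpii.py relies on "from .ViTPose_common import *" pulling data_cfg and the model_* dicts into its own namespace, then patches the head width in place through globals(). A minimal self-contained sketch of that pattern (hypothetical stand-in dicts, not the real configs):

# Sketch of the per-dataset override pattern used above: a shared module
# defines model_small/.../model_huge, each dataset config then rewrites the
# head's out_channels in place via globals().
model_small = dict(keypoint_head=dict(type='TopdownHeatmapSimpleHead'))
model_base = dict(keypoint_head=dict(type='TopdownHeatmapSimpleHead'))

channel_cfg = dict(num_output_channels=16)   # MPII has 16 joints

for name in ['small', 'base']:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = \
        channel_cfg['num_output_channels']

assert model_small['keypoint_head']['out_channels'] == 16
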
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_small_coco_256x192.py
ADDED
@@ -0,0 +1,173 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=12,
        layer_decay_rate=0.8,
        custom_keys={
            'bias': dict(decay_multi=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])

# model settings
model = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT', img_size=(256, 192), patch_size=16, embed_dim=384,
        depth=12, num_heads=12, ratio=1, use_checkpoint=False, mlp_ratio=4,
        qkv_bias=True, drop_path_rate=0.1),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=384, num_deconv_layers=2,
        num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        out_channels=channel_cfg['num_output_channels'],
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True, post_process='default', shift_heatmap=False,
        target_type=target_type, modulate_kernel=11, use_udp=True))

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.9,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(type='TopDownHalfBodyTransform', num_joints_half_body=8,
         prob_half_body=0.3),
    dict(type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(type='NormalizeTensor', mean=[0.485, 0.456, 0.406],
         std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2, encoding='UDP',
         target_type=target_type),
    dict(type='Collect',
         keys=['img', 'target', 'target_weight'],
         meta_keys=[
             'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
             'rotation', 'bbox_score', 'flip_pairs'
         ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(type='NormalizeTensor', mean=[0.485, 0.456, 0.406],
         std=[0.229, 0.224, 0.225]),
    dict(type='Collect',
         keys=['img'],
         meta_keys=[
             'image_file', 'center', 'scale', 'rotation', 'bbox_score',
             'flip_pairs'
         ]),
]

test_pipeline = val_pipeline

data_root = r'D:\ViTPose\Evaluating'
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=4),
    test_dataloader=dict(samples_per_gpu=4),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg,
        pipeline=train_pipeline,
        # dataset_info={{_base_.dataset_info}}
    ),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        # dataset_info={{_base_.dataset_info}}
    ),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        # dataset_info={{_base_.dataset_info}}
    ),
)

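In ViTPose_small_coco_256x192.py the 192x256 crop, the 16-pixel patches and the two deconvolution layers are consistent with the 48x64 heatmap_size. A quick sketch of that arithmetic, under the assumption that each deconv layer in TopdownHeatmapSimpleHead uses the usual stride of 2:

# Sketch (assumption: stride-2 deconvolutions) of how image_size, patch_size
# and the two deconv layers relate to heatmap_size in the config above.
image_w, image_h = 192, 256                              # data_cfg image_size (width, height)
patch = 16                                               # backbone patch_size
tokens_w, tokens_h = image_w // patch, image_h // patch  # 12 x 16 patch grid
heatmap_w = tokens_w * 2 ** 2                            # two stride-2 deconvs -> x4 upsampling
heatmap_h = tokens_h * 2 ** 2
assert [heatmap_w, heatmap_h] == [48, 64]                # matches heatmap_size=[48, 64]
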
ViTPose/easy_ViTPose/easy_ViTPose/configs/ViTPose_wholebody.py
ADDED
@@ -0,0 +1,20 @@
from .ViTPose_common import *

# Channel configuration
channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[
        list(range(133)),
    ],
    inference_channel=list(range(133)))

# Set models channels
data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
data_cfg['num_joints'] = channel_cfg['dataset_joints']
data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
data_cfg['inference_channel'] = channel_cfg['inference_channel']

names = ['small', 'base', 'large', 'huge']
for name in names:
    globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']

ViTPose/easy_ViTPose/easy_ViTPose/configs/__init__.py
ADDED
File without changes
ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_coco_25.cpython-39.pyc
ADDED
Binary file (697 Bytes)

ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_common.cpython-39.pyc
ADDED
Binary file (2.88 kB)

ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/ViTPose_small_coco_256x192.cpython-39.pyc
ADDED
Binary file (3.69 kB)

ViTPose/easy_ViTPose/easy_ViTPose/configs/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (158 Bytes)

ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/300w.py
ADDED
@@ -0,0 +1,384 @@
dataset_info = dict(
    dataset_name='300w',
    paper_info=dict(
        author='Sagonas, Christos and Antonakos, Epameinondas '
        'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos '
        'and Pantic, Maja',
        title='300 faces in-the-wild challenge: Database and results',
        container='Image and vision computing',
        year='2016',
        homepage='https://ibug.doc.ic.ac.uk/resources/300-W/',
    ),
    keypoint_info={
        0: dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-16'),
        1: dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-15'),
        2: dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-14'),
        3: dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-13'),
        4: dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-12'),
        5: dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-11'),
        6: dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-10'),
        7: dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-9'),
        8: dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap=''),
        9: dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-7'),
        10: dict(name='kpt-10', id=10, color=[255, 255, 255], type='', swap='kpt-6'),
        11: dict(name='kpt-11', id=11, color=[255, 255, 255], type='', swap='kpt-5'),
        12: dict(name='kpt-12', id=12, color=[255, 255, 255], type='', swap='kpt-4'),
        13: dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap='kpt-3'),
        14: dict(name='kpt-14', id=14, color=[255, 255, 255], type='', swap='kpt-2'),
        15: dict(name='kpt-15', id=15, color=[255, 255, 255], type='', swap='kpt-1'),
        16: dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap='kpt-0'),
        17: dict(name='kpt-17', id=17, color=[255, 255, 255], type='', swap='kpt-26'),
        18: dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='kpt-25'),
        19: dict(name='kpt-19', id=19, color=[255, 255, 255], type='', swap='kpt-24'),
        20: dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap='kpt-23'),
        21: dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap='kpt-22'),
        22: dict(name='kpt-22', id=22, color=[255, 255, 255], type='', swap='kpt-21'),
        23: dict(name='kpt-23', id=23, color=[255, 255, 255], type='', swap='kpt-20'),
        24: dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap='kpt-19'),
        25: dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap='kpt-18'),
        26: dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap='kpt-17'),
        27: dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''),
        28: dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap=''),
        29: dict(name='kpt-29', id=29, color=[255, 255, 255], type='', swap=''),
        30: dict(name='kpt-30', id=30, color=[255, 255, 255], type='', swap=''),
        31: dict(name='kpt-31', id=31, color=[255, 255, 255], type='', swap='kpt-35'),
        32: dict(name='kpt-32', id=32, color=[255, 255, 255], type='', swap='kpt-34'),
        33: dict(name='kpt-33', id=33, color=[255, 255, 255], type='', swap=''),
        34: dict(name='kpt-34', id=34, color=[255, 255, 255], type='', swap='kpt-32'),
        35: dict(name='kpt-35', id=35, color=[255, 255, 255], type='', swap='kpt-31'),
        36: dict(name='kpt-36', id=36, color=[255, 255, 255], type='', swap='kpt-45'),
        37: dict(name='kpt-37', id=37, color=[255, 255, 255], type='', swap='kpt-44'),
        38: dict(name='kpt-38', id=38, color=[255, 255, 255], type='', swap='kpt-43'),
        39: dict(name='kpt-39', id=39, color=[255, 255, 255], type='', swap='kpt-42'),
        40: dict(name='kpt-40', id=40, color=[255, 255, 255], type='', swap='kpt-47'),
        41: dict(name='kpt-41', id=41, color=[255, 255, 255], type='', swap='kpt-46'),
        42: dict(name='kpt-42', id=42, color=[255, 255, 255], type='', swap='kpt-39'),
        43: dict(name='kpt-43', id=43, color=[255, 255, 255], type='', swap='kpt-38'),
        44: dict(name='kpt-44', id=44, color=[255, 255, 255], type='', swap='kpt-37'),
        45: dict(name='kpt-45', id=45, color=[255, 255, 255], type='', swap='kpt-36'),
        46: dict(name='kpt-46', id=46, color=[255, 255, 255], type='', swap='kpt-41'),
        47: dict(name='kpt-47', id=47, color=[255, 255, 255], type='', swap='kpt-40'),
        48: dict(name='kpt-48', id=48, color=[255, 255, 255], type='', swap='kpt-54'),
        49: dict(name='kpt-49', id=49, color=[255, 255, 255], type='', swap='kpt-53'),
        50: dict(name='kpt-50', id=50, color=[255, 255, 255], type='', swap='kpt-52'),
        51: dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''),
        52: dict(name='kpt-52', id=52, color=[255, 255, 255], type='', swap='kpt-50'),
        53: dict(name='kpt-53', id=53, color=[255, 255, 255], type='', swap='kpt-49'),
        54: dict(name='kpt-54', id=54, color=[255, 255, 255], type='', swap='kpt-48'),
        55: dict(name='kpt-55', id=55, color=[255, 255, 255], type='', swap='kpt-59'),
        56: dict(name='kpt-56', id=56, color=[255, 255, 255], type='', swap='kpt-58'),
        57: dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''),
        58: dict(name='kpt-58', id=58, color=[255, 255, 255], type='', swap='kpt-56'),
        59: dict(name='kpt-59', id=59, color=[255, 255, 255], type='', swap='kpt-55'),
        60: dict(name='kpt-60', id=60, color=[255, 255, 255], type='', swap='kpt-64'),
        61: dict(name='kpt-61', id=61, color=[255, 255, 255], type='', swap='kpt-63'),
        62: dict(name='kpt-62', id=62, color=[255, 255, 255], type='', swap=''),
        63: dict(name='kpt-63', id=63, color=[255, 255, 255], type='', swap='kpt-61'),
        64: dict(name='kpt-64', id=64, color=[255, 255, 255], type='', swap='kpt-60'),
        65: dict(name='kpt-65', id=65, color=[255, 255, 255], type='', swap='kpt-67'),
        66: dict(name='kpt-66', id=66, color=[255, 255, 255], type='', swap=''),
        67: dict(name='kpt-67', id=67, color=[255, 255, 255], type='', swap='kpt-65'),
    },
    skeleton_info={},
    joint_weights=[1.] * 68,
    sigmas=[])

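Each 300w.py entry carries its left/right counterpart in the swap field; datasets typically turn this into index pairs for horizontal-flip augmentation. A minimal sketch of that conversion (toy subset of the table above, my own helper, not code from the upload):

# Turning the swap field into flip pairs (illustrative subset).
keypoint_info = {
    0: dict(name='kpt-0', swap='kpt-16'),
    8: dict(name='kpt-8', swap=''),          # mid-line point, no partner
    16: dict(name='kpt-16', swap='kpt-0'),
}
name_to_id = {v['name']: k for k, v in keypoint_info.items()}
flip_pairs = sorted({
    tuple(sorted((k, name_to_id[v['swap']])))
    for k, v in keypoint_info.items() if v['swap']
})
print(flip_pairs)   # [(0, 16)]
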
ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aflw.py
ADDED
@@ -0,0 +1,83 @@
dataset_info = dict(
    dataset_name='aflw',
    paper_info=dict(
        author='Koestinger, Martin and Wohlhart, Paul and '
        'Roth, Peter M and Bischof, Horst',
        title='Annotated facial landmarks in the wild: '
        'A large-scale, real-world database for facial landmark localization',
        container='2011 IEEE international conference on computer '
        'vision workshops (ICCV workshops)',
        year='2011',
        homepage='https://www.tugraz.at/institute/icg/research/'
        'team-bischof/lrs/downloads/aflw/',
    ),
    keypoint_info={
        0: dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-5'),
        1: dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-4'),
        2: dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'),
        3: dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'),
        4: dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-1'),
        5: dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-0'),
        6: dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-11'),
        7: dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-10'),
        8: dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'),
        9: dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'),
        10: dict(name='kpt-10', id=10, color=[255, 255, 255], type='', swap='kpt-7'),
        11: dict(name='kpt-11', id=11, color=[255, 255, 255], type='', swap='kpt-6'),
        12: dict(name='kpt-12', id=12, color=[255, 255, 255], type='', swap='kpt-14'),
        13: dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap=''),
        14: dict(name='kpt-14', id=14, color=[255, 255, 255], type='', swap='kpt-12'),
        15: dict(name='kpt-15', id=15, color=[255, 255, 255], type='', swap='kpt-17'),
        16: dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''),
        17: dict(name='kpt-17', id=17, color=[255, 255, 255], type='', swap='kpt-15'),
        18: dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='')
    },
    skeleton_info={},
    joint_weights=[1.] * 19,
    sigmas=[])

ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aic.py
ADDED
@@ -0,0 +1,140 @@
dataset_info = dict(
    dataset_name='aic',
    paper_info=dict(
        author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
        'Li, Yixin and Yan, Baoming and Liang, Rui and '
        'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
        'Fu, Yanwei and others',
        title='Ai challenger: A large-scale dataset for going '
        'deeper in image understanding',
        container='arXiv',
        year='2017',
        homepage='https://github.com/AIChallenger/AI_Challenger_2017',
    ),
    keypoint_info={
        0: dict(name='right_shoulder', id=0, color=[255, 128, 0], type='upper', swap='left_shoulder'),
        1: dict(name='right_elbow', id=1, color=[255, 128, 0], type='upper', swap='left_elbow'),
        2: dict(name='right_wrist', id=2, color=[255, 128, 0], type='upper', swap='left_wrist'),
        3: dict(name='left_shoulder', id=3, color=[0, 255, 0], type='upper', swap='right_shoulder'),
        4: dict(name='left_elbow', id=4, color=[0, 255, 0], type='upper', swap='right_elbow'),
        5: dict(name='left_wrist', id=5, color=[0, 255, 0], type='upper', swap='right_wrist'),
        6: dict(name='right_hip', id=6, color=[255, 128, 0], type='lower', swap='left_hip'),
        7: dict(name='right_knee', id=7, color=[255, 128, 0], type='lower', swap='left_knee'),
        8: dict(name='right_ankle', id=8, color=[255, 128, 0], type='lower', swap='left_ankle'),
        9: dict(name='left_hip', id=9, color=[0, 255, 0], type='lower', swap='right_hip'),
        10: dict(name='left_knee', id=10, color=[0, 255, 0], type='lower', swap='right_knee'),
        11: dict(name='left_ankle', id=11, color=[0, 255, 0], type='lower', swap='right_ankle'),
        12: dict(name='head_top', id=12, color=[51, 153, 255], type='upper', swap=''),
        13: dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
    },
    skeleton_info={
        0: dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]),
        1: dict(link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]),
        2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
        3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
        4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
        5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
        6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]),
        7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
        8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
        9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
        10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
        11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
        12: dict(link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]),
        13: dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255])
    },
    joint_weights=[
        1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
    ],

    # https://github.com/AIChallenger/AI_Challenger_2017/blob/master/Evaluation/keypoint_eval/keypoint_eval.py#L50
    # delta = 2 x sigma
    sigmas=[
        0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144,
        0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081,
        0.01291456, 0.01236173
    ])

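The per-keypoint sigmas at the end of aic.py (with the "delta = 2 x sigma" note) are the constants that enter a COCO-style OKS similarity. An illustrative computation for a single keypoint, with a hypothetical box area and pixel error (the numbers are mine; only the sigma comes from the list above):

import math

# COCO-style OKS: each keypoint contributes exp(-d^2 / (2 * s^2 * k^2)),
# where d is the localisation error in pixels, s^2 the object area and
# k = 2 * sigma -- the "delta = 2 x sigma" mentioned in the config comment.
sigma_right_shoulder = 0.01388152   # first entry of the sigmas list
object_area = 150 * 300             # hypothetical person box area, pixels^2
error_px = 10.0                     # hypothetical prediction error

k = 2 * sigma_right_shoulder
similarity = math.exp(-error_px ** 2 / (2 * object_area * k ** 2))
print(f'per-keypoint OKS term: {similarity:.3f}')
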
ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/aic_info.py
ADDED
@@ -0,0 +1,140 @@
aic_info = dict(...)

(The body of this file duplicates _base_/datasets/aic.py above line for line;
the only difference is that the top-level variable is named aic_info instead
of dataset_info.)

ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/animalpose.py
ADDED
@@ -0,0 +1,166 @@
dataset_info = dict(
    dataset_name='animalpose',
    paper_info=dict(
        author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and '
        'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing',
        title='Cross-Domain Adaptation for Animal Pose Estimation',
        container='The IEEE International Conference on '
        'Computer Vision (ICCV)',
        year='2019',
        homepage='https://sites.google.com/view/animal-pose/',
    ),
    keypoint_info={
        0: dict(name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
        1: dict(name='R_Eye', id=1, color=[255, 128, 0], type='upper', swap='L_Eye'),
        2: dict(name='L_EarBase', id=2, color=[0, 255, 0], type='upper', swap='R_EarBase'),
        3: dict(name='R_EarBase', id=3, color=[255, 128, 0], type='upper', swap='L_EarBase'),
        4: dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''),
        5: dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''),
        6: dict(name='TailBase', id=6, color=[51, 153, 255], type='lower', swap=''),
        7: dict(name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''),
        8: dict(name='L_F_Elbow', id=8, color=[0, 255, 0], type='upper', swap='R_F_Elbow'),
        9: dict(name='R_F_Elbow', id=9, color=[255, 128, 0], type='upper', swap='L_F_Elbow'),
        10: dict(name='L_B_Elbow', id=10, color=[0, 255, 0], type='lower', swap='R_B_Elbow'),
        11: dict(name='R_B_Elbow', id=11, color=[255, 128, 0], type='lower', swap='L_B_Elbow'),
        12: dict(name='L_F_Knee', id=12, color=[0, 255, 0], type='upper', swap='R_F_Knee'),
        13: dict(name='R_F_Knee', id=13, color=[255, 128, 0], type='upper', swap='L_F_Knee'),
        14: dict(name='L_B_Knee', id=14, color=[0, 255, 0], type='lower', swap='R_B_Knee'),
        15: dict(name='R_B_Knee', id=15, color=[255, 128, 0], type='lower', swap='L_B_Knee'),
        16: dict(name='L_F_Paw', id=16, color=[0, 255, 0], type='upper', swap='R_F_Paw'),
        17: dict(name='R_F_Paw', id=17, color=[255, 128, 0], type='upper', swap='L_F_Paw'),
        18: dict(name='L_B_Paw', id=18, color=[0, 255, 0], type='lower', swap='R_B_Paw'),
        19: dict(name='R_B_Paw', id=19, color=[255, 128, 0], type='lower', swap='L_B_Paw')
    },
    skeleton_info={
        0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]),
        1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]),
        2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]),
        3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]),
        4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]),
        5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]),
        6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]),
        7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]),
        8: dict(link=('Throat', 'L_F_Elbow'), id=8, color=[0, 255, 0]),
        9: dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]),
        10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]),
        11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]),
        12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]),
        13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]),
        14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]),
        15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]),
        16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]),
        17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 128, 0]),
        18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]),
        19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0])
    },
    joint_weights=[
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2,
        1.5, 1.5, 1.5, 1.5
    ],

    # Note: The original paper did not provide enough information about
    # the sigmas. We modified from
    # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523
    sigmas=[
        0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107,
        0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089
    ])

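animalpose.py weights knee and paw keypoints above 1.0; such joint_weights are commonly multiplied into a per-joint heatmap loss so that harder keypoints count more. A toy sketch of that weighting (numpy stand-in and an assumption about how the weights are consumed, not the repository's actual loss code):

import numpy as np

# Weighted per-joint MSE over predicted vs. target heatmaps (illustrative).
joint_weights = np.array([1., 1., 1.2, 1.5])   # illustrative subset of weights
pred = np.random.rand(4, 64, 48)               # 4 predicted heatmaps
target = np.random.rand(4, 64, 48)             # 4 target heatmaps

per_joint_mse = ((pred - target) ** 2).mean(axis=(1, 2))
weighted_loss = float((per_joint_mse * joint_weights).mean())
print(weighted_loss)
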
ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/ap10k.py
ADDED
@@ -0,0 +1,142 @@
dataset_info = dict(
    dataset_name='ap10k',
    paper_info=dict(
        author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
        'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
        title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
        container='35th Conference on Neural Information Processing Systems '
        '(NeurIPS 2021) Track on Datasets and Bench-marks.',
        year='2021',
        homepage='https://github.com/AlexTheBad/AP-10K',
    ),
    keypoint_info={
        0: dict(name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
        1: dict(name='R_Eye', id=1, color=[255, 128, 0], type='upper', swap='L_Eye'),
        2: dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
        3: dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
        4: dict(name='Root of tail', id=4, color=[51, 153, 255], type='lower', swap=''),
        5: dict(name='L_Shoulder', id=5, color=[51, 153, 255], type='upper', swap='R_Shoulder'),
        6: dict(name='L_Elbow', id=6, color=[51, 153, 255], type='upper', swap='R_Elbow'),
        7: dict(name='L_F_Paw', id=7, color=[0, 255, 0], type='upper', swap='R_F_Paw'),
        8: dict(name='R_Shoulder', id=8, color=[0, 255, 0], type='upper', swap='L_Shoulder'),
        9: dict(name='R_Elbow', id=9, color=[255, 128, 0], type='upper', swap='L_Elbow'),
        10: dict(name='R_F_Paw', id=10, color=[0, 255, 0], type='lower', swap='L_F_Paw'),
        11: dict(name='L_Hip', id=11, color=[255, 128, 0], type='lower', swap='R_Hip'),
        12: dict(name='L_Knee', id=12, color=[255, 128, 0], type='lower', swap='R_Knee'),
        13: dict(name='L_B_Paw', id=13, color=[0, 255, 0], type='lower', swap='R_B_Paw'),
        14: dict(name='R_Hip', id=14, color=[0, 255, 0], type='lower', swap='L_Hip'),
        15: dict(name='R_Knee', id=15, color=[0, 255, 0], type='lower', swap='L_Knee'),
        16: dict(name='R_B_Paw', id=16, color=[0, 255, 0], type='lower', swap='L_B_Paw'),
    },
    skeleton_info={
        0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
        1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
        2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
        3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
        4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
        5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
        6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
        7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]),
        8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]),
        9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]),
        10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]),
        11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]),
        12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]),
        13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]),
        14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]),
        15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]),
        16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]),
    },
    joint_weights=[
        1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
        1.5
    ],
    sigmas=[
        0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
        0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
    ])

ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/ap10k_info.py
ADDED
@@ -0,0 +1,142 @@
ap10k_info = dict(...)

(The body of this file duplicates _base_/datasets/ap10k.py above line for
line; the only difference is that the top-level variable is named ap10k_info
instead of dataset_info.)

ViTPose/easy_ViTPose/easy_ViTPose/configs/_base_/datasets/atrw.py
ADDED
@@ -0,0 +1,144 @@
dataset_info = dict(
    dataset_name='atrw',
    paper_info=dict(
        author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin '
        'and Qian, Rui and Lin, Weiyao',
        title='ATRW: A Benchmark for Amur Tiger '
        'Re-identification in the Wild',
        container='Proceedings of the 28th ACM '
        'International Conference on Multimedia',
        year='2020',
        homepage='https://cvwc2019.github.io/challenge.html',
    ),
    keypoint_info={
        0: dict(name='left_ear', id=0, color=[51, 153, 255], type='upper', swap='right_ear'),
        1: dict(name='right_ear', id=1, color=[51, 153, 255], type='upper', swap='left_ear'),
        2: dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''),
        3: dict(name='right_shoulder', id=3, color=[255, 128, 0], type='upper', swap='left_shoulder'),
        4: dict(name='right_front_paw', id=4, color=[255, 128, 0], type='upper', swap='left_front_paw'),
        5: dict(name='left_shoulder', id=5, color=[0, 255, 0], type='upper', swap='right_shoulder'),
        6: dict(name='left_front_paw', id=6, color=[0, 255, 0], type='upper', swap='right_front_paw'),
        7: dict(name='right_hip', id=7, color=[255, 128, 0], type='lower', swap='left_hip'),
        8: dict(name='right_knee', id=8, color=[255, 128, 0], type='lower', swap='left_knee'),
        9: dict(name='right_back_paw', id=9, color=[255, 128, 0], type='lower', swap='left_back_paw'),
        10: dict(name='left_hip', id=10, color=[0, 255, 0], type='lower', swap='right_hip'),
        11: dict(name='left_knee', id=11, color=[0, 255, 0], type='lower', swap='right_knee'),
        12: dict(name='left_back_paw', id=12, color=[0, 255, 0], type='lower', swap='right_back_paw'),
        13: dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''),
        14: dict(name='center', id=14, color=[51, 153, 255], type='lower', swap=''),
    },
    skeleton_info={
        0: dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]),
        1: dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]),
        2: dict(link=('nose', 'center'), id=2, color=[51, 153, 255]),
        3: dict(link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]),
        4: dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]),
        5: dict(link=('right_shoulder', 'right_front_paw'), id=5, color=[255, 128, 0]),
        6: dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]),
        7: dict(link=('tail', 'center'), id=7, color=[51, 153, 255]),
        8: dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]),
        9: dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]),
        10: dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]),
        11: dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]),
        12: dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]),
        13: dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]),
    },
    joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
    sigmas=[
        0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440,
        0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539
    ])

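The atrw.py skeleton_info links are given by keypoint name; a visualiser needs them as index pairs plus a colour. A short sketch of that lookup (toy subset of the tables above, my own helper, not code from the upload):

# Resolving name-based skeleton links into (index, index, colour) edges.
keypoint_info = {
    0: dict(name='left_ear'),
    1: dict(name='right_ear'),
    2: dict(name='nose'),
    14: dict(name='center'),
}
skeleton_info = {
    0: dict(link=('left_ear', 'nose'), color=[51, 153, 255]),
    1: dict(link=('right_ear', 'nose'), color=[51, 153, 255]),
    2: dict(link=('nose', 'center'), color=[51, 153, 255]),
}
name_to_id = {v['name']: k for k, v in keypoint_info.items()}
edges = []
for entry in skeleton_info.values():
    a, b = entry['link']
    edges.append((name_to_id[a], name_to_id[b], tuple(entry['color'])))
print(edges)   # [(0, 2, (51, 153, 255)), (1, 2, ...), (2, 14, ...)]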