Spaces: Runtime error
Amanpreet committed · Commit 1cdc47e
Parent(s): 4276ea6
added 2
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +1 -0
- .gitignore +7 -0
- VideoToNPZ/INFERENCE_EN.md +2 -0
- VideoToNPZ/checkpoint/gastnet/81_frame_model.bin +3 -0
- VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth +3 -0
- VideoToNPZ/checkpoint/yolov3/yolov3.weights +3 -0
- VideoToNPZ/common/arguments.py +86 -0
- VideoToNPZ/common/camera.py +63 -0
- VideoToNPZ/common/generators.py +236 -0
- VideoToNPZ/common/graph_utils.py +45 -0
- VideoToNPZ/common/loss.py +90 -0
- VideoToNPZ/common/quaternion.py +36 -0
- VideoToNPZ/common/skeleton.py +81 -0
- VideoToNPZ/data/data_utils.py +95 -0
- VideoToNPZ/gen_skes.py +116 -0
- VideoToNPZ/lib/detector/__init__.py +6 -0
- VideoToNPZ/lib/detector/yolov3/__init__.py +0 -0
- VideoToNPZ/lib/detector/yolov3/bbox.py +111 -0
- VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg +134 -0
- VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg +258 -0
- VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg +258 -0
- VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg +789 -0
- VideoToNPZ/lib/detector/yolov3/darknet.py +433 -0
- VideoToNPZ/lib/detector/yolov3/data/coco.names +80 -0
- VideoToNPZ/lib/detector/yolov3/data/pallete +0 -0
- VideoToNPZ/lib/detector/yolov3/data/voc.names +20 -0
- VideoToNPZ/lib/detector/yolov3/human_detector.py +155 -0
- VideoToNPZ/lib/detector/yolov3/preprocess.py +63 -0
- VideoToNPZ/lib/detector/yolov3/util.py +225 -0
- VideoToNPZ/lib/pose/__init__.py +10 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml +127 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml +127 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml +127 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml +127 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml +83 -0
- VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml +120 -0
- VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml +120 -0
- VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml +86 -0
- VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml +86 -0
- VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml +86 -0
- VideoToNPZ/lib/pose/hrnet/lib/Makefile +4 -0
- VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py +9 -0
- VideoToNPZ/lib/pose/hrnet/lib/config/default.py +160 -0
- VideoToNPZ/lib/pose/hrnet/lib/config/models.py +58 -0
- VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py +16 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.weights filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,7 @@
venv/
__pycache__/
*.pyc
*.bvh
*.obj
*.npz
*.mp4
VideoToNPZ/INFERENCE_EN.md
ADDED
@@ -0,0 +1,2 @@
python gen_skes.py -v baseball.mp4
VideoToNPZ/checkpoint/gastnet/81_frame_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3150eb3125ca66242a888fd06b4eb7d8a8b755607370225c24f0b9c794d35cc4
size 28333160
VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95e0fec3194826d5e3f806ea89be68bbb84517b114c3a32b3058c56610b5ef61
size 255061287
VideoToNPZ/checkpoint/yolov3/yolov3.weights
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297
size 248007048
VideoToNPZ/common/arguments.py
ADDED
@@ -0,0 +1,86 @@
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='Training script')

    # General arguments
    parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME',
                        help='target dataset')  # h36m or humaneva
    parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME',
                        help='2D detections to use')
    parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST',
                        help='training subjects separated by comma')
    parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST',
                        help='test subjects separated by comma')
    parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST',
                        help='actions to train/test on, separated by comma, or * for all')
    parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
                        help='checkpoint directory')
    parser.add_argument('--checkpoint-frequency', default=10, type=int, metavar='N',
                        help='create a checkpoint every N epochs')
    parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME',
                        help='checkpoint to resume (file name)')
    parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME',
                        help='checkpoint to evaluate (file name)')
    parser.add_argument('--render', action='store_true', help='visualize a particular video')
    parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)')
    parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images')

    # Model arguments
    parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training')
    parser.add_argument('-arc', '--architecture', default='3,3,3', type=str, metavar='LAYERS',
                        help='filter widths separated by comma')
    parser.add_argument('--causal', action='store_true', help='use causal convolutions for real-time processing')
    parser.add_argument('-ch', '--channels', default=128, type=int, metavar='N',
                        help='number of channels in convolution layers')

    # Experimental setting
    parser.add_argument('-e', '--epochs', default=60, type=int, metavar='N', help='number of training epochs')
    parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N',
                        help='batch size in terms of predicted frames')
    parser.add_argument('-drop', '--dropout', default=0.05, type=float, metavar='P', help='dropout probability')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate')
    parser.add_argument('-lrd', '--lr-decay', default=0.95, type=float, metavar='LR',
                        help='learning rate decay per epoch')
    parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false',
                        help='disable train-time flipping')
    parser.add_argument('-no-tta', '--no-test-time-augmentation', dest='test_time_augmentation', action='store_false',
                        help='disable test-time flipping')
    parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction')
    parser.add_argument('--downsample', default=5, type=int, metavar='FACTOR',
                        help='downsample frame rate by factor (semi-supervised)')
    parser.add_argument('--no-eval', action='store_true',
                        help='disable epoch evaluation while training (small speed-up)')
    parser.add_argument('--disable-optimizations', action='store_true',
                        help='disable optimized model for single-frame predictions')

    # Visualization
    parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render')
    parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render')
    parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render')
    parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video')
    parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video')
    parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)')
    parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates')
    parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos')
    parser.add_argument('--viz-no-ground-truth', action='store_true', help='do not show ground-truth poses')
    parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames')
    parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N')
    parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size')

    parser.set_defaults(bone_length_term=True)
    parser.set_defaults(data_augmentation=True)
    parser.set_defaults(test_time_augmentation=True)

    args = parser.parse_args()
    # Check invalid configuration
    if args.resume and args.evaluate:
        print('Invalid flags: --resume and --evaluate cannot be set at the same time')
        exit()

    if args.export_training_curves and args.no_eval:
        print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time')
        exit()

    return args
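A minimal sketch of how this parser is typically driven (assuming VideoToNPZ/ is on sys.path; the simulated argv values are only illustrative):

import sys
from common.arguments import parse_args

# Simulate a command line; every other option falls back to its default
sys.argv = ['train.py', '-e', '10', '-arc', '3,3,3']
args = parse_args()
print(args.epochs, args.architecture, args.keypoints)  # 10 3,3,3 cpn_ft_h36m_dbb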
VideoToNPZ/common/camera.py
ADDED
@@ -0,0 +1,63 @@
import numpy as np
import torch

from tools.utils import wrap
from common.quaternion import qort, qinverse


def normalize_screen_coordinates(X, w, h):
    assert X.shape[-1] == 2

    # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
    return X/w*2 - [1, h/w]


def image_coordinates(X, w, h):
    assert X.shape[-1] == 2

    # Reverse camera frame normalization
    return (X + [1, h/w]) * w / 2


def world_to_camera(X, R, t):
    Rt = wrap(qinverse, R)  # Invert rotation
    return wrap(qort, np.tile(Rt, (*X.shape[:-1], 1)), X - t)  # Rotate and translate


def camera_to_world(X, R, t):
    return wrap(qort, np.tile(R, (*X.shape[:-1], 1)), X) + t


def project_to_2d(X, camera_params):
    """
    Project 3D points to 2D using the Human3.6M camera projection function.
    This is a differentiable and batched reimplementation of the original MATLAB script.

    Arguments:
    X -- 3D points in *camera space* to transform (N, *, 3)
    camera_params -- intrinsic parameters (N, 2+2+3+2=9)
    """
    assert X.shape[-1] == 3
    assert len(camera_params.shape) == 2
    assert camera_params.shape[-1] == 9
    assert X.shape[0] == camera_params.shape[0]

    while len(camera_params.shape) < len(X.shape):
        camera_params = camera_params.unsqueeze(1)

    f = camera_params[..., :2]
    c = camera_params[..., 2:4]
    k = camera_params[..., 4:7]
    p = camera_params[..., 7:]

    # XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
    XX = X[..., :2] / X[..., 2:]
    r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True)

    radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True)
    tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True)

    XXX = XX*(radial + tan) + p*r2

    return f*XXX + c
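A small sanity check of the screen-coordinate helpers (assuming a 1920x1080 frame; the values are only illustrative):

import numpy as np
from common.camera import normalize_screen_coordinates, image_coordinates

kpts = np.array([[960.0, 540.0], [0.0, 0.0]])       # pixel coordinates
norm = normalize_screen_coordinates(kpts, w=1920, h=1080)
print(norm)                                          # centre -> [0, 0], origin -> [-1, -0.5625]
print(image_coordinates(norm, w=1920, h=1080))       # round-trips back to the pixel values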
VideoToNPZ/common/generators.py
ADDED
@@ -0,0 +1,236 @@
from itertools import zip_longest
import numpy as np


class ChunkedGenerator:
    """
    Batched data generator, used for training.
    The sequences are split into equal-length chunks and padded as necessary.

    Arguments:
    batch_size -- the batch size to use for training
    cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
    poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
    poses_2d -- list of input 2D keypoints, one element for each video
    chunk_length -- number of output frames to predict for each training example (usually 1)
    pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
    causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
    shuffle -- randomly shuffle the dataset before each epoch
    random_seed -- initial seed to use for the random generator
    augment -- augment the dataset by flipping poses horizontally
    kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
    joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
    """
    def __init__(self, batch_size, cameras, poses_3d, poses_2d,
                 chunk_length, pad=0, causal_shift=0,
                 shuffle=True, random_seed=1234,
                 augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
                 endless=False):
        assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
        assert cameras is None or len(cameras) == len(poses_2d)

        # Build lineage info
        pairs = []  # (seq_idx, start_frame, end_frame, flip) tuples
        for i in range(len(poses_2d)):
            assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0]
            n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length
            offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2
            bounds = np.arange(n_chunks + 1) * chunk_length - offset
            augment_vector = np.full(len(bounds)-1, False, dtype=bool)
            pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], augment_vector)
            if augment:
                pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], ~augment_vector)

        # Initialize buffers
        if cameras is not None:
            self.batch_cam = np.empty((batch_size, cameras[0].shape[-1]))
        if poses_3d is not None:
            self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1]))
        self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1]))

        self.num_batches = (len(pairs) + batch_size - 1) // batch_size
        self.batch_size = batch_size
        self.random = np.random.RandomState(random_seed)
        self.pairs = pairs
        self.shuffle = shuffle
        self.pad = pad
        self.causal_shift = causal_shift
        self.endless = endless
        self.state = None

        self.cameras = cameras
        self.poses_3d = poses_3d
        self.poses_2d = poses_2d

        self.augment = augment
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right

    def num_frames(self):
        return self.num_batches * self.batch_size

    def random_state(self):
        return self.random

    def set_random_state(self, random):
        self.random = random

    def augment_enabled(self):
        return self.augment

    def next_pairs(self):
        if self.state is None:
            if self.shuffle:
                pairs = self.random.permutation(self.pairs)
            else:
                pairs = self.pairs
            return 0, pairs
        else:
            return self.state

    def next_epoch(self):
        enabled = True
        while enabled:
            start_idx, pairs = self.next_pairs()
            for b_i in range(start_idx, self.num_batches):
                chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size]
                for i, (seq_i, start_3d, end_3d, flip) in enumerate(chunks):
                    start_2d = start_3d - self.pad - self.causal_shift
                    end_2d = end_3d + self.pad - self.causal_shift

                    # 2D poses
                    seq_2d = self.poses_2d[seq_i]
                    low_2d = max(start_2d, 0)
                    high_2d = min(end_2d, seq_2d.shape[0])
                    pad_left_2d = low_2d - start_2d
                    pad_right_2d = end_2d - high_2d
                    if pad_left_2d != 0 or pad_right_2d != 0:
                        self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), "edge")
                    else:
                        self.batch_2d[i] = seq_2d[low_2d:high_2d]

                    if flip:
                        # Flip 2D keypoints
                        self.batch_2d[i, :, :, 0] *= -1
                        self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left]

                    # 3D poses
                    if self.poses_3d is not None:
                        seq_3d = self.poses_3d[seq_i]
                        low_3d = max(start_3d, 0)
                        high_3d = min(end_3d, seq_3d.shape[0])
                        pad_left_3d = low_3d - start_3d
                        pad_right_3d = end_3d - high_3d
                        if pad_left_3d != 0 or pad_right_3d != 0:
                            self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), "edge")
                        else:
                            self.batch_3d[i] = seq_3d[low_3d:high_3d]

                        if flip:
                            # Flip 3D joints
                            self.batch_3d[i, :, :, 0] *= -1
                            self.batch_3d[i, :, self.joints_left + self.joints_right] = \
                                self.batch_3d[i, :, self.joints_right + self.joints_left]

                    # Cameras
                    if self.cameras is not None:
                        self.batch_cam[i] = self.cameras[seq_i]
                        if flip:
                            # Flip horizontal distortion coefficients
                            self.batch_cam[i, 2] *= -1
                            self.batch_cam[i, 7] *= -1

                if self.endless:
                    self.state = (b_i + 1, pairs)
                if self.poses_3d is None and self.cameras is None:
                    yield None, None, self.batch_2d[:len(chunks)]
                elif self.poses_3d is not None and self.cameras is None:
                    yield None, self.batch_3d[:len(chunks)], self.batch_2d[:(len(chunks))]
                elif self.poses_3d is None:
                    yield self.batch_cam, None, self.batch_2d[:len(chunks)]
                else:
                    yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)]

            if self.endless:
                self.state = None
            else:
                enabled = False


class UnchunkedGenerator:
    """
    Non-batched data generator, used for testing.
    Sequences are returned one at a time (i.e. batch size = 1), without chunking.

    If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2),
    the second of which is a mirrored version of the first.

    Arguments:
    cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
    poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
    poses_2d -- list of input 2D keypoints, one element for each video
    pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
    causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
    augment -- augment the dataset by flipping poses horizontally
    kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
    joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
    """

    def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0,
                 augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None):
        assert poses_3d is None or len(poses_3d) == len(poses_2d)
        assert cameras is None or len(cameras) == len(poses_2d)

        self.augment = augment
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right

        self.pad = pad
        self.causal_shift = causal_shift
        self.cameras = [] if cameras is None else cameras
        self.poses_3d = [] if poses_3d is None else poses_3d
        self.poses_2d = poses_2d

    def num_frames(self):
        count = 0
        for p in self.poses_2d:
            count += p.shape[0]
        return count

    def augment_enabled(self):
        return self.augment

    def set_augment(self, augment):
        self.augment = augment

    def next_epoch(self):
        for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d):
            batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0)
            batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0)
            batch_2d = np.expand_dims(np.pad(seq_2d,
                                             ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0),
                                              (0, 0)),
                                             'edge'), axis=0)
            if self.augment:
                # Append flipped version
                if batch_cam is not None:
                    batch_cam = np.concatenate((batch_cam, batch_cam), axis=0)
                    batch_cam[1, 2] *= -1
                    batch_cam[1, 7] *= -1

                if batch_3d is not None:
                    batch_3d = np.concatenate((batch_3d, batch_3d), axis=0)
                    batch_3d[1, :, :, 0] *= -1
                    batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :, self.joints_right + self.joints_left]

                batch_2d = np.concatenate((batch_2d, batch_2d), axis=0)
                batch_2d[1, :, :, 0] *= -1
                batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left]

            yield batch_cam, batch_3d, batch_2d
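A minimal sketch of feeding the test-time generator with dummy 2D keypoints (pad=40 matches the 81-frame receptive field used elsewhere in this commit; the data are placeholders):

import numpy as np
from common.generators import UnchunkedGenerator

poses_2d = [np.zeros((10, 17, 2), dtype=np.float32)]   # one clip: 10 frames, 17 joints
gen = UnchunkedGenerator(None, None, poses_2d, pad=40, causal_shift=0)
for cam, pose_3d, pose_2d in gen.next_epoch():
    print(pose_2d.shape)   # (1, 90, 17, 2): 40 frames of edge padding on each side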
VideoToNPZ/common/graph_utils.py
ADDED
@@ -0,0 +1,45 @@
from __future__ import absolute_import

import torch
import numpy as np
import scipy.sparse as sp


def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


def adj_mx_from_edges(num_pts, edges, sparse=True):
    edges = np.array(edges, dtype=np.int32)
    data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1]
    adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32)

    # build symmetric adjacency matrix
    adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx)
    adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0]))
    if sparse:
        adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx)
    else:
        adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float)
    return adj_mx


def adj_mx_from_skeleton(skeleton):
    num_joints = skeleton.num_joints()
    edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents())))
    return adj_mx_from_edges(num_joints, edges, sparse=False)
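A tiny illustration of the adjacency construction on a 3-joint chain (hypothetical edges, not a skeleton from this repo):

from common.graph_utils import adj_mx_from_edges

edges = [(1, 0), (2, 1)]                       # joint 1 -> parent 0, joint 2 -> parent 1
adj = adj_mx_from_edges(3, edges, sparse=False)
print(adj)                                     # 3x3 row-normalized adjacency with self-loops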
VideoToNPZ/common/loss.py
ADDED
@@ -0,0 +1,90 @@
import torch
import numpy as np


def mpjpe(predicted, target):
    """
    Mean per-joint position error (i.e. mean Euclidean distance),
    often referred to as "Protocol #1" in many papers.
    """
    assert predicted.shape == target.shape
    return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1))


def p_mpjpe(predicted, target):
    """
    Pose error: MPJPE after rigid alignment (scale, rotation, and translation),
    often referred to as "Protocol #2" in many papers.
    """
    assert predicted.shape == target.shape

    muX = np.mean(target, axis=1, keepdims=True)
    muY = np.mean(predicted, axis=1, keepdims=True)

    X0 = target - muX
    Y0 = predicted - muY

    normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True))
    normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True))

    X0 /= normX
    Y0 /= normY

    H = np.matmul(X0.transpose(0, 2, 1), Y0)
    U, s, Vt = np.linalg.svd(H)
    V = Vt.transpose(0, 2, 1)
    R = np.matmul(V, U.transpose(0, 2, 1))

    # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1
    sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1))
    V[:, :, -1] *= sign_detR
    s[:, -1] *= sign_detR.flatten()
    R = np.matmul(V, U.transpose(0, 2, 1))  # Rotation

    tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2)

    a = tr * normX / normY  # Scale
    t = muX - a * np.matmul(muY, R)  # Translation

    # Perform rigid transformation on the input
    predicted_aligned = a * np.matmul(predicted, R) + t

    # Return MPJPE
    return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1))


def euclidean_losses(actual, target):
    """Calculate the average Euclidean loss for multi-point samples.

    Each sample must contain `n` points, each with `d` dimensions. For example,
    in the MPII human pose estimation task n=16 (16 joint locations) and
    d=2 (locations are 2D).

    Args:
        actual (Tensor): Predictions (B x L x D)
        target (Tensor): Ground truth target (B x L x D)
    """

    assert actual.size() == target.size(), 'input tensors must have the same size'

    # Calculate Euclidean distances between actual and target locations
    diff = actual - target
    dist_sq = diff.pow(2).sum(-1, keepdim=False)
    dist = dist_sq.sqrt()
    return dist


def pck(actual, expected, threshold=150):
    dists = euclidean_losses(actual, expected)
    return (dists < threshold).double().mean().item()


def auc(actual, expected):
    # This range of thresholds mimics `mpii_compute_3d_pck.m`, which is provided as part of the
    # MPI-INF-3DHP test data release.
    thresholds = torch.linspace(0, 150, 31).tolist()

    pck_values = torch.DoubleTensor(len(thresholds))
    for i, threshold in enumerate(thresholds):
        pck_values[i] = pck(actual, expected, threshold=threshold)
    return pck_values.mean().item()
VideoToNPZ/common/quaternion.py
ADDED
@@ -0,0 +1,36 @@
import torch


def qort(q, v):
    """
    Rotate vector(s) v about the rotation described by quaternion(s) q.
    Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
    where * denotes any number of dimensions.
    Returns a tensor of shape (*, 3).
    """
    assert q.shape[-1] == 4
    assert v.shape[-1] == 3
    assert q.shape[:-1] == v.shape[:-1]

    qvec = q[..., 1:]
    uv = torch.cross(qvec, v, dim=len(q.shape)-1)
    uuv = torch.cross(qvec, uv, dim=len(q.shape)-1)
    return v + 2 * (q[..., :1] * uv + uuv)


def qinverse(q, inplace=False):
    # We assume the quaternion to be normalized
    """
    The quaternions provided in the code map from the camera coordinate frame to the world
    coordinate frame. The quaternion from the world frame to the camera frame is therefore
    its conjugate, and for a unit quaternion the inverse equals the conjugate.
    """
    if inplace:
        q[..., 1:] *= -1
        return q
    else:
        w = q[..., :1]
        xyz = q[..., 1:]
        return torch.cat((w, -xyz), dim=len(q.shape)-1)
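A sanity check of qort with a 90-degree rotation about the z-axis (w, x, y, z convention; values approximate):

import torch
from common.quaternion import qort

q = torch.tensor([0.7071, 0.0, 0.0, 0.7071])   # ~90 degrees about z
v = torch.tensor([1.0, 0.0, 0.0])
print(qort(q, v))                              # approximately [0, 1, 0]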
VideoToNPZ/common/skeleton.py
ADDED
@@ -0,0 +1,81 @@
import numpy as np


class Skeleton:
    def __init__(self, parents, joints_left, joints_right):
        assert len(joints_left) == len(joints_right)

        self._parents = parents
        self._joints_left = joints_left
        self._joints_right = joints_right

    def num_joints(self):
        return len(self._parents)

    def parents(self):
        return self._parents

    def has_children(self):
        return self._has_children

    def children(self):
        return self._children

    def remove_joints(self, joints_to_remove):
        """
        Remove the joints specified in 'joints_to_remove'.
        """
        valid_joints = []
        for joint in range(len(self._parents)):
            if joint not in joints_to_remove:
                valid_joints.append(joint)

        for i in range(len(self._parents)):
            while self._parents[i] in joints_to_remove:
                self._parents[i] = self._parents[self._parents[i]]

        index_offsets = np.zeros(len(self._parents), dtype=int)
        new_parents = []
        for i, parent in enumerate(self._parents):
            if i not in joints_to_remove:
                new_parents.append(parent - index_offsets[parent])
            else:
                index_offsets[i:] += 1
        self._parents = np.array(new_parents)

        if self._joints_left is not None:
            new_joints_left = []
            for joint in self._joints_left:
                if joint in valid_joints:
                    new_joints_left.append(joint - index_offsets[joint])
            self._joints_left = new_joints_left

        if self._joints_right is not None:
            new_joints_right = []
            for joint in self._joints_right:
                if joint in valid_joints:
                    new_joints_right.append(joint - index_offsets[joint])
            self._joints_right = new_joints_right

        self._compute_metadata()

        return valid_joints

    def joints_left(self):
        return self._joints_left

    def joints_right(self):
        return self._joints_right

    def _compute_metadata(self):
        self._has_children = np.zeros(len(self._parents)).astype(bool)
        for i, parent in enumerate(self._parents):
            if parent != -1:
                self._has_children[parent] = True

        # One (initially empty) child list per joint
        self._children = []
        for _ in self._parents:
            self._children.append([])
        for i, parent in enumerate(self._parents):
            if parent != -1:
                self._children[parent].append(i)
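The same H36M hierarchy that gen_skes.py builds can be instantiated directly; a small sketch:

from common.skeleton import Skeleton

sk = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
              joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16])
print(sk.num_joints())    # 17
print(sk.parents()[0])    # -1: the root joint has no parent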
VideoToNPZ/data/data_utils.py
ADDED
@@ -0,0 +1,95 @@
import numpy as np
import h5py

mpii_metadata = {
    'layout_name': 'mpii',
    'num_joints': 16,
    'keypoints_symmetry': [
        [3, 4, 5, 13, 14, 15],
        [0, 1, 2, 10, 11, 12],
    ]
}

coco_metadata = {
    'layout_name': 'coco',
    'num_joints': 17,
    'keypoints_symmetry': [
        [1, 3, 5, 7, 9, 11, 13, 15],
        [2, 4, 6, 8, 10, 12, 14, 16],
    ]
}

h36m_metadata = {
    'layout_name': 'h36m',
    'num_joints': 17,
    'keypoints_symmetry': [
        [4, 5, 6, 11, 12, 13],
        [1, 2, 3, 14, 15, 16],
    ]
}

humaneva15_metadata = {
    'layout_name': 'humaneva15',
    'num_joints': 15,
    'keypoints_symmetry': [
        [2, 3, 4, 8, 9, 10],
        [5, 6, 7, 11, 12, 13]
    ]
}

humaneva20_metadata = {
    'layout_name': 'humaneva20',
    'num_joints': 20,
    'keypoints_symmetry': [
        [3, 4, 5, 6, 11, 12, 13, 14],
        [7, 8, 9, 10, 15, 16, 17, 18]
    ]
}


def suggest_metadata(name):
    names = []
    for metadata in [mpii_metadata, coco_metadata, h36m_metadata, humaneva15_metadata, humaneva20_metadata]:
        if metadata['layout_name'] in name:
            return metadata
        names.append(metadata['layout_name'])
    raise KeyError('Cannot infer keypoint layout from name "{}". Tried {}.'.format(name, names))


def import_detectron_poses(path):
    # Latin1 encoding because Detectron runs on Python 2.7
    data = np.load(path, encoding='latin1')
    kp = data['keypoints']
    bb = data['boxes']
    results = []
    for i in range(len(bb)):
        if len(bb[i][1]) == 0:
            assert i > 0
            # Use last pose in case of detection failure
            results.append(results[-1])
            continue
        best_match = np.argmax(bb[i][1][:, 4])
        keypoints = kp[i][1][best_match].T.copy()
        results.append(keypoints)
    results = np.array(results)
    return results[:, :, 4:6]  # Soft-argmax
    # return results[:, :, [0, 1, 3]]  # Argmax + score


def import_cpn_poses(path):
    data = np.load(path)
    kp = data['keypoints']
    return kp[:, :, :2]


def import_sh_poses(path):
    with h5py.File(path) as hf:
        positions = hf['poses'].value
    return positions.astype('float32')


def suggest_pose_importer(name):
    if 'detectron' in name:
        return import_detectron_poses
    if 'cpn' in name:
        return import_cpn_poses
    if 'sh' in name:
        return import_sh_poses
    raise KeyError('Cannot infer keypoint format from name "{}". Tried detectron, cpn, sh.'.format(name))
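Usage sketch: the metadata lookup matches on substrings of the keypoints name (assuming VideoToNPZ/ is on sys.path):

from data.data_utils import suggest_metadata

meta = suggest_metadata('cpn_ft_h36m_dbb')
print(meta['layout_name'], meta['num_joints'])   # h36m 17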
VideoToNPZ/gen_skes.py
ADDED
@@ -0,0 +1,116 @@
import torch
import sys
import os.path as osp
import os
import argparse
import cv2
import time
import h5py
from tqdm import tqdm
import numpy as np
import warnings
import signal

warnings.filterwarnings('ignore')

sys.path.insert(0, osp.dirname(osp.realpath(__file__)))
from tools.utils import get_path
from model.gast_net import SpatioTemporalModel, SpatioTemporalModelOptimized1f
from common.skeleton import Skeleton
from common.graph_utils import adj_mx_from_skeleton
from common.generators import *
from tools.preprocess import load_kpts_json, h36m_coco_format, revise_kpts, revise_skes
from tools.inference import gen_pose
from tools.vis_kpts import plot_keypoint

cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__)
model_dir = chk_root + 'gastnet/'
sys.path.insert(1, lib_root)
from lib.pose import gen_video_kpts as hrnet_pose
sys.path.pop(1)
sys.path.pop(0)

skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
                    joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16])
adj = adj_mx_from_skeleton(skeleton)

joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]
kps_left, kps_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]


# Set up signal handler for keyboard interrupt
def signal_handler(sig, frame):
    print("\nInterrupted by user, shutting down...")
    if 'pool' in locals() and pool is not None:
        pool.terminate()
        pool.join()
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)


def load_model_layer():
    chk = model_dir + '81_frame_model.bin'
    filters_width = [3, 3, 3, 3]
    channels = 64

    model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filters_width, channels=channels, dropout=0.05)

    checkpoint = torch.load(chk)
    model_pos.load_state_dict(checkpoint['model_pos'])

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
    model_pos = model_pos.eval()

    return model_pos


def generate_skeletons(video=''):
    cap = cv2.VideoCapture(video)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

    keypoints, scores = hrnet_pose(video, det_dim=416, gen_output=True)
    keypoints, scores, valid_frames = h36m_coco_format(keypoints, scores)
    re_kpts = revise_kpts(keypoints, scores, valid_frames)
    num_person = len(re_kpts)

    model_pos = load_model_layer()

    pad = (81 - 1) // 2
    causal_shift = 0

    prediction = gen_pose(re_kpts, valid_frames, width, height, model_pos, pad, causal_shift)

    print('Recording 3D Pose:')

    # Add a loading bar
    for i in tqdm(range(100)):
        time.sleep(0.01)

    # Create output directory with absolute path
    output_dir = os.path.abspath('../outputs/')
    print(f"Creating output directory: {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    npz_dir = os.path.join(output_dir, 'npz')
    print(f"Creating NPZ directory: {npz_dir}")
    os.makedirs(npz_dir, exist_ok=True)

    output_npz = os.path.join(npz_dir, os.path.basename(video).split('.')[0] + '.npz')
    print(f"Saving NPZ to: {output_npz}")
    np.savez_compressed(output_npz, reconstruction=prediction)
    print(f"NPZ saved successfully: {output_npz}")


def arg_parse():
    parser = argparse.ArgumentParser('Generating skeleton demo.')
    parser.add_argument('-v', '--video', type=str)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = arg_parse()
    # Use the video path as-is if absolute, otherwise prepend data_root
    if os.path.isabs(args.video):
        video_path = args.video
    else:
        video_path = os.path.join(data_root, 'video', args.video)
    generate_skeletons(video=video_path)
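After running `python gen_skes.py -v baseball.mp4` (see INFERENCE_EN.md), the saved poses can be read back from the NPZ written above; a sketch, assuming the script was run from VideoToNPZ/ (the exact array layout depends on gen_pose):

import numpy as np

data = np.load('../outputs/npz/baseball.npz', allow_pickle=True)
prediction = data['reconstruction']   # the 3D poses saved by generate_skeletons()
print(type(prediction))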
VideoToNPZ/lib/detector/__init__.py
ADDED
@@ -0,0 +1,6 @@
import sys
import os.path as osp

sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), 'yolov3'))
from human_detector import yolo_human_det, load_model
sys.path.pop(0)
VideoToNPZ/lib/detector/yolov3/__init__.py
ADDED
File without changes
VideoToNPZ/lib/detector/yolov3/bbox.py
ADDED
@@ -0,0 +1,111 @@
from __future__ import division

import torch
import random
import numpy as np
import cv2


def confidence_filter(result, confidence):
    conf_mask = (result[:, :, 4] > confidence).float().unsqueeze(2)
    result = result*conf_mask

    return result


def confidence_filter_cls(result, confidence):
    max_scores = torch.max(result[:, :, 5:25], 2)[0]
    res = torch.cat((result, max_scores), 2)
    print(res.shape)

    cond_1 = (res[:, :, 4] > confidence).float()
    cond_2 = (res[:, :, 25] > 0.995).float()

    conf = cond_1 + cond_2
    conf = torch.clamp(conf, 0.0, 1.0)
    conf = conf.unsqueeze(2)
    result = result*conf
    return result


def get_abs_coord(box):
    box[2], box[3] = abs(box[2]), abs(box[3])
    x1 = (box[0] - box[2]/2) - 1
    y1 = (box[1] - box[3]/2) - 1
    x2 = (box[0] + box[2]/2) - 1
    y2 = (box[1] + box[3]/2) - 1
    return x1, y1, x2, y2


def sanity_fix(box):
    if (box[0] > box[2]):
        box[0], box[2] = box[2], box[0]

    if (box[1] > box[3]):
        box[1], box[3] = box[3], box[1]

    return box


def bbox_iou(box1, box2):
    """
    Returns the IoU of two bounding boxes
    """
    # Get the coordinates of bounding boxes
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    # Get the coordinates of the intersection rectangle
    inter_rect_x1 = torch.max(b1_x1, b2_x1)
    inter_rect_y1 = torch.max(b1_y1, b2_y1)
    inter_rect_x2 = torch.min(b1_x2, b2_x2)
    inter_rect_y2 = torch.min(b1_y2, b2_y2)

    # Intersection area
    if torch.cuda.is_available():
        inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
    else:
        inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))

    # Union area
    b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)

    iou = inter_area / (b1_area + b2_area - inter_area)

    return iou


def pred_corner_coord(prediction):
    # Get indices of non-zero confidence bboxes
    ind_nz = torch.nonzero(prediction[:, :, 4]).transpose(0, 1).contiguous()

    box = prediction[ind_nz[0], ind_nz[1]]

    box_a = box.new(box.shape)
    box_a[:, 0] = (box[:, 0] - box[:, 2]/2)
    box_a[:, 1] = (box[:, 1] - box[:, 3]/2)
    box_a[:, 2] = (box[:, 0] + box[:, 2]/2)
    box_a[:, 3] = (box[:, 1] + box[:, 3]/2)
    box[:, :4] = box_a[:, :4]

    prediction[ind_nz[0], ind_nz[1]] = box

    return prediction


def write(x, batches, results, colors, classes):
    c1 = tuple(x[1:3].int())
    c2 = tuple(x[3:5].int())
    img = results[int(x[0])]
    cls = int(x[-1])
    label = "{0}".format(classes[cls])
    color = random.choice(colors)
    cv2.rectangle(img, c1, c2, color, 1)
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
    c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
    cv2.rectangle(img, c1, c2, color, -1)
    cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
    return img
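A quick check of bbox_iou on corner-format boxes (with the yolov3 directory on sys.path; note the +1 pixel convention in the area terms):

import torch
from bbox import bbox_iou

box1 = torch.tensor([[0.0, 0.0, 9.0, 9.0]])
box2 = torch.tensor([[4.0, 0.0, 13.0, 9.0]])
print(bbox_iou(box1, box2))   # 60 / 140 = 0.4286...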
VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg
ADDED
@@ -0,0 +1,134 @@
[net]
batch=64
subdivisions=8
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
max_batches = 40200
policy=steps
steps=-1,100,20000,30000
scales=.1,10,.1,.1

[convolutional]
batch_normalize=1
filters=16
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=1

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

###########

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=125
activation=linear

[region]
anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
bias_match=1
classes=20
coords=4
num=5
softmax=1
jitter=.2
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh = .6
random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[net]
# Testing
batch=64
subdivisions=8
# Training
# batch=64
# subdivisions=8
height=416
width=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 80200
policy=steps
steps=-1,500,40000,60000
scales=0.1,10,.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky


#######

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[route]
layers=-9

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
filters=64
activation=leaky

[reorg]
stride=2

[route]
layers=-1,-4

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=125
activation=linear


[region]
anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
bias_match=1
classes=20
coords=4
num=5
softmax=1
jitter=.3
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh = .6
random=1
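Note on the [region] head above: Darknet expects the final 1x1 convolution to emit num * (coords + 1 + classes) channels per cell, which is why this VOC variant pairs classes=20 and num=5 with filters=125 (5 * (4 + 1 + 20) = 125). A minimal sketch of that arithmetic (the helper name is ours, not part of the repo):

# Hypothetical helper, only to illustrate the filters/classes relation in the cfg above.
def region_filters(num_anchors, num_classes, num_coords=4):
    # objectness + box coordinates + per-class scores, once per anchor
    return num_anchors * (num_coords + 1 + num_classes)

assert region_filters(5, 20) == 125   # matches filters=125 / classes=20 / num=5 above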
VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg
ADDED
@@ -0,0 +1,258 @@
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=8
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky


#######

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[route]
layers=-9

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
filters=64
activation=leaky

[reorg]
stride=2

[route]
layers=-1,-4

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=425
activation=linear


[region]
anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
bias_match=1
classes=80
coords=4
num=5
softmax=1
jitter=.3
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh = .6
random=1
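The COCO variant above follows the same rule: with classes=80 and num=5, the detection convolution needs 5 * (4 + 1 + 80) = 425 channels, which matches its filters=425.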
VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg
ADDED
@@ -0,0 +1,789 @@
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=16
width=320
height=320
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

# Downsample

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

######################

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 61

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 36

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
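For the three [yolo] heads above, each scale predicts with the 3 anchors selected by its mask, so the preceding 1x1 convolution needs len(mask) * (5 + classes) = 3 * (5 + 80) = 255 channels, matching filters=255. Those same 85 attributes per box (4 coordinates, objectness, 80 class scores) are what predict_transform() and write_results() in util.py later unpack.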
VideoToNPZ/lib/detector/yolov3/darknet.py
ADDED
@@ -0,0 +1,433 @@
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import cv2
import os
import sys

from util import convert2cpu as cpu
from util import predict_transform


class test_net(nn.Module):
    def __init__(self, num_layers, input_size):
        super(test_net, self).__init__()
        self.num_layers = num_layers
        self.linear_1 = nn.Linear(input_size, 5)
        self.middle = nn.ModuleList([nn.Linear(5, 5) for x in range(num_layers)])
        self.output = nn.Linear(5, 2)

    def forward(self, x):
        x = x.view(-1)
        fwd = nn.Sequential(self.linear_1, *self.middle, self.output)
        return fwd(x)


def get_test_input():
    img = cv2.imread("dog-cycle-car.png")
    img = cv2.resize(img, (416, 416))
    img_ = img[:, :, ::-1].transpose((2, 0, 1))
    img_ = img_[np.newaxis, :, :, :] / 255.0
    img_ = torch.from_numpy(img_).float()
    return img_


def parse_cfg(cfgfile):
    """
    Takes a configuration file

    Returns a list of blocks. Each block describes a block in the neural
    network to be built. A block is represented as a dictionary in the list.
    """
    # cfgfile = os.path.join(sys.path[-1], cfgfile)
    file = open(cfgfile, 'r')
    lines = file.read().split('\n')               # store the lines in a list
    lines = [x for x in lines if len(x) > 0]      # get rid of the empty lines
    lines = [x for x in lines if x[0] != '#']     # get rid of comment lines
    lines = [x.rstrip().lstrip() for x in lines]

    block = {}
    blocks = []

    for line in lines:
        if line[0] == "[":                        # This marks the start of a new block
            if len(block) != 0:
                blocks.append(block)
                block = {}
            block["type"] = line[1:-1].rstrip()
        else:
            key, value = line.split("=")
            block[key.rstrip()] = value.lstrip()
    blocks.append(block)

    return blocks


class MaxPoolStride1(nn.Module):
    def __init__(self, kernel_size):
        super(MaxPoolStride1, self).__init__()
        self.kernel_size = kernel_size
        self.pad = kernel_size - 1

    def forward(self, x):
        padded_x = F.pad(x, (0, self.pad, 0, self.pad), mode="replicate")
        pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x)
        return pooled_x


class EmptyLayer(nn.Module):
    def __init__(self):
        super(EmptyLayer, self).__init__()


class DetectionLayer(nn.Module):
    def __init__(self, anchors):
        super(DetectionLayer, self).__init__()
        self.anchors = anchors

    def forward(self, x, inp_dim, num_classes, confidence):
        x = x.data
        global CUDA
        prediction = x
        prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA)
        return prediction


class Upsample(nn.Module):
    def __init__(self, stride=2):
        super(Upsample, self).__init__()
        self.stride = stride

    def forward(self, x):
        stride = self.stride
        assert (x.data.dim() == 4)
        B = x.data.size(0)
        C = x.data.size(1)
        H = x.data.size(2)
        W = x.data.size(3)
        ws = stride
        hs = stride
        x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride)
        return x


class ReOrgLayer(nn.Module):
    def __init__(self, stride=2):
        super(ReOrgLayer, self).__init__()
        self.stride = stride

    def forward(self, x):
        assert (x.data.dim() == 4)
        B, C, H, W = x.data.shape
        hs = self.stride
        ws = self.stride
        assert (H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H)
        assert (W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of width " + str(W)
        x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(-2, -3).contiguous()
        x = x.view(B, C, H // hs * W // ws, hs, ws)
        x = x.view(B, C, H // hs * W // ws, hs*ws).transpose(-1, -2).contiguous()
        x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1, 2).contiguous()
        x = x.view(B, C*ws*hs, H // ws, W // ws)
        return x


def create_modules(blocks):
    net_info = blocks[0]  # Captures the information about the input and pre-processing

    module_list = nn.ModuleList()

    index = 0  # indexing blocks helps with implementing route layers (skip connections)
    prev_filters = 3
    output_filters = []

    for x in blocks:
        module = nn.Sequential()
        if x["type"] == "net":
            continue

        # If it's a convolutional layer
        if x["type"] == "convolutional":
            # Get the info about the layer
            activation = x["activation"]
            try:
                batch_normalize = int(x["batch_normalize"])
                bias = False
            except:
                batch_normalize = 0
                bias = True

            filters = int(x["filters"])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])

            if padding:
                pad = (kernel_size - 1) // 2
            else:
                pad = 0

            # Add the convolutional layer
            conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=bias)
            module.add_module("conv_{0}".format(index), conv)

            # Add the Batch Norm Layer
            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index), bn)

            # Check the activation.
            # It is either Linear or a Leaky ReLU for YOLO
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace=True)
                module.add_module("leaky_{0}".format(index), activn)

        # If it's an upsampling layer
        # We use Bilinear2dUpsampling
        elif x["type"] == "upsample":
            stride = int(x["stride"])
            # upsample = Upsample(stride)
            upsample = nn.Upsample(scale_factor=2, mode="nearest")
            module.add_module("upsample_{}".format(index), upsample)

        # If it is a route layer
        elif (x["type"] == "route"):
            x["layers"] = x["layers"].split(',')

            # Start of a route
            start = int(x["layers"][0])

            # end, if there exists one.
            try:
                end = int(x["layers"][1])
            except:
                end = 0

            # Positive annotation
            if start > 0:
                start = start - index

            if end > 0:
                end = end - index

            route = EmptyLayer()
            module.add_module("route_{0}".format(index), route)

            if end < 0:
                filters = output_filters[index + start] + output_filters[index + end]
            else:
                filters = output_filters[index + start]

        # shortcut corresponds to skip connection
        elif x["type"] == "shortcut":
            from_ = int(x["from"])
            shortcut = EmptyLayer()
            module.add_module("shortcut_{}".format(index), shortcut)

        elif x["type"] == "maxpool":
            stride = int(x["stride"])
            size = int(x["size"])
            if stride != 1:
                maxpool = nn.MaxPool2d(size, stride)
            else:
                maxpool = MaxPoolStride1(size)

            module.add_module("maxpool_{}".format(index), maxpool)

        # Yolo is the detection layer
        elif x["type"] == "yolo":
            mask = x["mask"].split(",")
            mask = [int(x) for x in mask]

            anchors = x["anchors"].split(",")
            anchors = [int(a) for a in anchors]
            anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in mask]

            detection = DetectionLayer(anchors)
            module.add_module("Detection_{}".format(index), detection)

        else:
            print("Something I dunno")
            assert False

        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)
        index += 1

    return (net_info, module_list)


class Darknet(nn.Module):
    def __init__(self, cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)
        self.header = torch.IntTensor([0, 0, 0, 0])
        self.seen = 0

    def get_blocks(self):
        return self.blocks

    def get_module_list(self):
        return self.module_list

    def forward(self, x, CUDA):
        detections = []
        modules = self.blocks[1:]
        outputs = {}  # We cache the outputs for the route layer

        write = 0
        for i in range(len(modules)):

            module_type = (modules[i]["type"])
            if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool":

                x = self.module_list[i](x)
                outputs[i] = x

            elif module_type == "route":
                layers = modules[i]["layers"]
                layers = [int(a) for a in layers]

                if (layers[0]) > 0:
                    layers[0] = layers[0] - i

                if len(layers) == 1:
                    x = outputs[i + (layers[0])]

                else:
                    if (layers[1]) > 0:
                        layers[1] = layers[1] - i

                    map1 = outputs[i + layers[0]]
                    map2 = outputs[i + layers[1]]

                    x = torch.cat((map1, map2), 1)
                outputs[i] = x

            elif module_type == "shortcut":
                from_ = int(modules[i]["from"])
                x = outputs[i-1] + outputs[i+from_]
                outputs[i] = x

            elif module_type == 'yolo':

                anchors = self.module_list[i][0].anchors
                # Get the input dimensions
                inp_dim = int(self.net_info["height"])

                # Get the number of classes
                num_classes = int(modules[i]["classes"])

                # Output the result
                x = x.data
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)

                if type(x) == int:
                    continue

                if not write:
                    detections = x
                    write = 1
                else:
                    detections = torch.cat((detections, x), 1)

                outputs[i] = outputs[i-1]

        try:
            return detections
        except:
            return 0

    def load_weights(self, weightfile):
        # Introduction: https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/
        # Open the weights file
        # weightfile = os.path.join(sys.path[-1], weightfile)
        fp = open(weightfile, "rb")

        # The first 5 values are header information
        # 1. Major version number
        # 2. Minor version number
        # 3. Subversion number
        # 4,5. Images seen by the network (during training)
        header = np.fromfile(fp, dtype=np.int32, count=5)
        self.header = torch.from_numpy(header)
        self.seen = self.header[3]

        # The rest of the values are the weights
        # Let's load them up
        weights = np.fromfile(fp, dtype=np.float32)

        ptr = 0
        for i in range(len(self.module_list)):
            module_type = self.blocks[i + 1]["type"]

            if module_type == "convolutional":
                model = self.module_list[i]
                try:
                    batch_normalize = int(self.blocks[i+1]["batch_normalize"])
                except:
                    batch_normalize = 0

                conv = model[0]

                if (batch_normalize):
                    bn = model[1]

                    # Get the number of weights of the Batch Norm Layer
                    num_bn_biases = bn.bias.numel()

                    # Load the weights
                    bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    # Cast the loaded weights into the dims of the model weights.
                    bn_biases = bn_biases.view_as(bn.bias.data)
                    bn_weights = bn_weights.view_as(bn.weight.data)
                    bn_running_mean = bn_running_mean.view_as(bn.running_mean)
                    bn_running_var = bn_running_var.view_as(bn.running_var)

                    # Copy the data to the model
                    bn.bias.data.copy_(bn_biases)
                    bn.weight.data.copy_(bn_weights)
                    bn.running_mean.copy_(bn_running_mean)
                    bn.running_var.copy_(bn_running_var)

                else:
                    # Number of biases
                    num_biases = conv.bias.numel()

                    # Load the weights
                    conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
                    ptr = ptr + num_biases

                    # reshape the loaded weights according to the dims of the model weights
                    conv_biases = conv_biases.view_as(conv.bias.data)

                    # Finally copy the data
                    conv.bias.data.copy_(conv_biases)

                # Let us load the weights for the convolutional layers
                num_weights = conv.weight.numel()

                # Do the same as above for weights
                conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
                ptr = ptr + num_weights

                conv_weights = conv_weights.view_as(conv.weight.data)
                conv.weight.data.copy_(conv_weights)
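A minimal usage sketch for the module above, run from inside lib/detector/yolov3 so its local imports resolve; the cfg and weights paths are the ones this commit adds, and the zero tensor merely stands in for a real preprocessed frame:

# Sketch only: build the Darknet graph from the cfg, load the weights, run one dummy frame.
import torch
from darknet import Darknet

model = Darknet('cfg/yolov3.cfg')
model.load_weights('../../../checkpoint/yolov3/yolov3.weights')
model.net_info["height"] = 416        # input resolution; must be a multiple of 32
model.eval()

cuda = torch.cuda.is_available()
if cuda:
    model.cuda()

frame = torch.zeros(1, 3, 416, 416)   # placeholder for a letterboxed, normalised image
if cuda:
    frame = frame.cuda()
with torch.no_grad():
    detections = model(frame, cuda)   # (1, 10647, 85) raw predictions at 416x416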
VideoToNPZ/lib/detector/yolov3/data/coco.names
ADDED
@@ -0,0 +1,80 @@
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
VideoToNPZ/lib/detector/yolov3/data/pallete
ADDED
Binary file (908 Bytes).
VideoToNPZ/lib/detector/yolov3/data/voc.names
ADDED
@@ -0,0 +1,20 @@
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
VideoToNPZ/lib/detector/yolov3/human_detector.py
ADDED
@@ -0,0 +1,155 @@
from __future__ import division
import time
import torch
import numpy as np
import cv2
import os
import sys
import random
import pickle as pkl
import argparse

from util import *
from darknet import Darknet
from preprocess import letterbox_image
import preprocess


cur_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.join(cur_dir, '../../../')
chk_root = os.path.join(project_root, 'checkpoint/')
data_root = os.path.join(project_root, 'data/')


sys.path.insert(0, project_root)
sys.path.pop(0)


def prep_image(img, inp_dim):
    """
    Prepare image for inputting to the neural network.

    Returns a Variable
    """
    ori_img = img
    dim = ori_img.shape[1], ori_img.shape[0]
    img = cv2.resize(ori_img, (inp_dim, inp_dim))
    img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
    img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
    return img_, ori_img, dim


def write(x, img, colors):
    x = [int(i) for i in x]
    c1 = tuple(x[0:2])
    c2 = tuple(x[2:4])

    label = 'People {}'.format(0)
    color = (0, 0, 255)
    cv2.rectangle(img, c1, c2, color, 2)
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
    c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
    cv2.rectangle(img, c1, c2, color, -1)
    cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
    return img


def arg_parse():
    """
    Parse arguments to the detect module
    """
    parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo')
    parser.add_argument('--confidence', dest='confidence', type=float, default=0.70,
                        help='Object Confidence to filter predictions')
    parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold')
    parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. '
                        'Increase to increase accuracy. Decrease to increase speed. (160, 416)')
    parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights',
                        help='The path of the model weight file')
    parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='The model cfg file')
    parser.add_argument('-a', '--animation', action='store_true', help='output animation')
    parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path')
    parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png',
                        help='The input image path')
    parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. [1, 2]')
    return parser.parse_args()


def load_model(args=None, CUDA=None, inp_dim=416):
    if args is None:
        args = arg_parse()

    if CUDA is None:
        CUDA = torch.cuda.is_available()

    # Set up the neural network
    model = Darknet(args.cfg_file)
    model.load_weights(args.weight_file)

    model.net_info["height"] = inp_dim
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    # If there's a GPU available, put the model on GPU
    if CUDA:
        model.cuda()

    # Set the model in evaluation mode
    model.eval()

    return model


def yolo_human_det(img, model=None, reso=416, confidence=0.70):
    args = arg_parse()
    # args.reso = reso
    inp_dim = reso
    num_classes = 80

    CUDA = torch.cuda.is_available()
    if model is None:
        model = load_model(args, CUDA, inp_dim)

    if type(img) == str:
        assert os.path.isfile(img), 'The image path does not exist'
        img = cv2.imread(img)

    img, ori_img, img_dim = preprocess.prep_image(img, inp_dim)
    img_dim = torch.FloatTensor(img_dim).repeat(1, 2)

    with torch.no_grad():
        if CUDA:
            img_dim = img_dim.cuda()
            img = img.cuda()
        output = model(img, CUDA)
        output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True)

    if len(output) == 0:
        return None, None

    img_dim = img_dim.repeat(output.size(0), 1)
    scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1)

    output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor

    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1])

    bboxs = []
    scores = []
    for i in range(len(output)):
        item = output[i]
        bbox = item[1:5].cpu().numpy()
        # convert float32 values to 2-decimal floats
        bbox = [round(i, 2) for i in list(bbox)]
        score = item[5].cpu().numpy()
        bboxs.append(bbox)
        scores.append(score)
    scores = np.expand_dims(np.array(scores), 1)
    bboxs = np.array(bboxs)

    return bboxs, scores
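A short sketch of how the entry points above are typically driven (the frame path is a placeholder; defaults come from arg_parse(), so this is meant to run as a plain script from this directory without extra CLI arguments):

# Sketch: detect people in one frame with load_model() / yolo_human_det() from above.
import cv2
from human_detector import load_model, yolo_human_det

model = load_model(inp_dim=416)                 # Darknet + yolov3.weights, eval mode
frame = cv2.imread('frame_000.png')             # placeholder image path
bboxs, scores = yolo_human_det(frame, model=model, reso=416, confidence=0.70)

if bboxs is None:
    print('no person detected')
else:
    for (x1, y1, x2, y2), s in zip(bboxs, scores[:, 0]):
        print('person box:', (x1, y1, x2, y2), 'score:', float(s))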
VideoToNPZ/lib/detector/yolov3/preprocess.py
ADDED
@@ -0,0 +1,63 @@
from __future__ import division

import torch
import numpy as np
import cv2
from PIL import Image


def letterbox_image(img, inp_dim):
    '''resize image with unchanged aspect ratio using padding'''
    img_w, img_h = img.shape[1], img.shape[0]
    w, h = inp_dim
    new_w = int(img_w * min(w/img_w, h/img_h))
    new_h = int(img_h * min(w/img_w, h/img_h))
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)

    canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image

    return canvas


def prep_image(img, inp_dim):
    """
    Prepare image for inputting to the neural network.

    Returns a Variable
    """
    if type(img) == str:
        orig_im = cv2.imread(img)
    else:
        orig_im = img
    dim = orig_im.shape[1], orig_im.shape[0]
    img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
    img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
    img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
    return img_, orig_im, dim


def prep_image_pil(img, network_dim):
    orig_im = Image.open(img)
    img = orig_im.convert('RGB')
    dim = img.size
    img = img.resize(network_dim)
    img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
    img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous()
    img = img.view(1, 3, *network_dim)
    img = img.float().div(255.0)
    return img, orig_im, dim


def inp_to_image(inp):
    inp = inp.cpu().squeeze()
    inp = inp * 255
    try:
        inp = inp.data.numpy()
    except RuntimeError:
        inp = inp.numpy()
    inp = inp.transpose(1, 2, 0)

    inp = inp[:, :, ::-1]
    return inp
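letterbox_image() above keeps the aspect ratio instead of stretching: for a 1280x720 frame and inp_dim=(416, 416) the scale is min(416/1280, 416/720) = 0.325, so the frame is resized to 416x234 and pasted onto the grey 416x416 canvas with 91-pixel bands above and below. prep_image() then reorders BGR to RGB, moves channels first, and scales values to [0, 1].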
VideoToNPZ/lib/detector/yolov3/util.py
ADDED
@@ -0,0 +1,225 @@
from __future__ import division

import torch
import numpy as np
import cv2
import os.path as osp
from bbox import bbox_iou


def get_path(cur_file):
    cur_dir = osp.dirname(osp.realpath(cur_file))
    project_root = osp.join(cur_dir, '../../../')
    chk_root = osp.join(project_root, 'checkpoint/')
    data_root = osp.join(project_root, 'data/')

    return project_root, chk_root, data_root, cur_dir


def count_parameters(model):
    return sum(p.numel() for p in model.parameters())


def count_learnable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def convert2cpu(matrix):
    if matrix.is_cuda:
        return torch.FloatTensor(matrix.size()).copy_(matrix)
    else:
        return matrix


def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA=True):
    batch_size = prediction.size(0)
    stride = inp_dim // prediction.size(2)
    grid_size = inp_dim // stride
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)

    anchors = [(a[0]/stride, a[1]/stride) for a in anchors]

    prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
    prediction = prediction.transpose(1, 2).contiguous()
    prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)

    # Sigmoid the centre_X, centre_Y and object confidence
    prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
    prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
    prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])

    # Add the center offsets
    grid_len = np.arange(grid_size)
    a, b = np.meshgrid(grid_len, grid_len)

    x_offset = torch.FloatTensor(a).view(-1, 1)
    y_offset = torch.FloatTensor(b).view(-1, 1)

    if CUDA:
        x_offset = x_offset.cuda()
        y_offset = y_offset.cuda()

    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)

    prediction[:, :, :2] += x_y_offset

    # log space transform of the height and the width
    anchors = torch.FloatTensor(anchors)

    if CUDA:
        anchors = anchors.cuda()

    anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
    prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4])*anchors

    # Sigmoid the class scores
    prediction[:, :, 5: 5 + num_classes] = torch.sigmoid((prediction[:, :, 5: 5 + num_classes]))

    prediction[:, :, :4] *= stride

    return prediction


def load_classes(namesfile):
    fp = open(namesfile, "r")
    names = fp.read().split("\n")[:-1]
    return names


def get_im_dim(im):
    im = cv2.imread(im)
    w, h = im.shape[1], im.shape[0]
    return w, h


def unique(tensor):
    tensor_np = tensor.cpu().numpy()
    unique_np = np.unique(tensor_np)
    unique_tensor = torch.from_numpy(unique_np)

    tensor_res = tensor.new(unique_tensor.shape)
    tensor_res.copy_(unique_tensor)
    return tensor_res


# ADD SOFT NMS
def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4, det_hm=False):
    """
    https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-4/
    prediction: (B x 10647 x 85)
    B: the number of images in a batch,
    10647: the number of bounding boxes predicted per image. (52×52 + 26×26 + 13×13) × 3 = 10647
    85: the number of bounding box attributes. (c_x, c_y, w, h, object confidence, and 80 class scores)

    output: Num_obj × [img_index, x_1, y_1, x_2, y_2, object confidence, class_score, label_index]
    """

    conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
    prediction = prediction*conf_mask

    box_a = prediction.new(prediction.shape)
|
122 |
+
box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2)
|
123 |
+
box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2)
|
124 |
+
box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2)
|
125 |
+
box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2)
|
126 |
+
prediction[:, :, :4] = box_a[:, :, :4]
|
127 |
+
|
128 |
+
batch_size = prediction.size(0)
|
129 |
+
|
130 |
+
output = prediction.new(1, prediction.size(2) + 1)
|
131 |
+
write = False
|
132 |
+
|
133 |
+
for ind in range(batch_size):
|
134 |
+
# select the image from the batch
|
135 |
+
image_pred = prediction[ind]
|
136 |
+
|
137 |
+
# Get the class having maximum score, and the index of that class
|
138 |
+
# Get rid of num_classes softmax scores
|
139 |
+
# Add the class index and the class score of class having maximum score
|
140 |
+
max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1)
|
141 |
+
max_conf = max_conf.float().unsqueeze(1)
|
142 |
+
max_conf_index = max_conf_index.float().unsqueeze(1)
|
143 |
+
seq = (image_pred[:, :5], max_conf, max_conf_index)
|
144 |
+
image_pred = torch.cat(seq, 1) # image_pred:(10647, 7) 7:[x1, y1, x2, y2, obj_score, max_conf, max_conf_index]
|
145 |
+
|
146 |
+
# Get rid of the zero entries
|
147 |
+
non_zero_ind = (torch.nonzero(image_pred[:, 4]))
|
148 |
+
image_pred__ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)
|
149 |
+
|
150 |
+
# filters out people id
|
151 |
+
if det_hm:
|
152 |
+
cls_mask = (image_pred__[:, -1] == 0).float()
|
153 |
+
class_mask_ind = torch.nonzero(cls_mask).squeeze()
|
154 |
+
image_pred_ = image_pred__[class_mask_ind].view(-1, 7)
|
155 |
+
|
156 |
+
if torch.sum(cls_mask) == 0:
|
157 |
+
return image_pred_
|
158 |
+
else:
|
159 |
+
image_pred_ = image_pred__
|
160 |
+
|
161 |
+
# Get the various classes detected in the image
|
162 |
+
try:
|
163 |
+
# img_classes = unique(image_pred_[:, -1])
|
164 |
+
img_classes = torch.unique(image_pred_[:, -1], sorted=True).float()
|
165 |
+
except:
|
166 |
+
continue
|
167 |
+
|
168 |
+
# We will do NMS classwise
|
169 |
+
# import ipdb;ipdb.set_trace()
|
170 |
+
for cls in img_classes:
|
171 |
+
# get the detections with one particular class
|
172 |
+
cls_mask = image_pred_*(image_pred_[:, -1] == cls).float().unsqueeze(1)
|
173 |
+
class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
|
174 |
+
image_pred_class = image_pred_[class_mask_ind].view(-1, 7)
|
175 |
+
|
176 |
+
# sort the detections such that the entry with the maximum objectness
|
177 |
+
# confidence is at the top
|
178 |
+
conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
|
179 |
+
image_pred_class = image_pred_class[conf_sort_index]
|
180 |
+
idx = image_pred_class.size(0)
|
181 |
+
|
182 |
+
# from soft_NMS import soft_nms
|
183 |
+
# boxes = image_pred_class[:,:4]
|
184 |
+
# scores = image_pred_class[:, 4]
|
185 |
+
# k, N = soft_nms(boxes, scores, method=2)
|
186 |
+
# image_pred_class = image_pred_class[k]
|
187 |
+
|
188 |
+
# if nms has to be done
|
189 |
+
if nms:
|
190 |
+
# For each detection
|
191 |
+
for i in range(idx):
|
192 |
+
# Get the IOUs of all boxes that come after the one we are looking at
|
193 |
+
# in the loop
|
194 |
+
try:
|
195 |
+
ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
|
196 |
+
except ValueError:
|
197 |
+
break
|
198 |
+
|
199 |
+
except IndexError:
|
200 |
+
break
|
201 |
+
|
202 |
+
# Zero out all the detections that have IoU > threshold
|
203 |
+
iou_mask = (ious < nms_conf).float().unsqueeze(1)
|
204 |
+
image_pred_class[i+1:] *= iou_mask
|
205 |
+
|
206 |
+
# Remove the zero entries
|
207 |
+
non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
|
208 |
+
image_pred_class = image_pred_class[non_zero_ind].view(-1, 7)
|
209 |
+
|
210 |
+
# Concatenate the batch_id of the image to the detection
|
211 |
+
# this helps us identify which image does the detection correspond to
|
212 |
+
# We use a linear structure to hold ALL the detections from the batch
|
213 |
+
# the batch_dim is flattened
|
214 |
+
# batch is identified by extra batch column
|
215 |
+
|
216 |
+
batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
|
217 |
+
seq = batch_ind, image_pred_class
|
218 |
+
if not write:
|
219 |
+
output = torch.cat(seq, 1)
|
220 |
+
write = True
|
221 |
+
else:
|
222 |
+
out = torch.cat(seq, 1)
|
223 |
+
output = torch.cat((output, out))
|
224 |
+
|
225 |
+
return output
|
VideoToNPZ/lib/pose/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os.path as osp
|
3 |
+
|
4 |
+
sys.path.insert(1, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/pose_estimation'))
|
5 |
+
from gen_kpts import gen_img_kpts, gen_video_kpts, load_default_model
|
6 |
+
sys.path.insert(2, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/lib/utils'))
|
7 |
+
from utilitys import plot_keypoint, write, PreProcess, box_to_center_scale, load_json
|
8 |
+
|
9 |
+
sys.path.pop(1)
|
10 |
+
sys.path.pop(2)
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: 'coco'
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: 0.3
|
20 |
+
ROOT: 'data/coco/'
|
21 |
+
ROT_FACTOR: 45
|
22 |
+
SCALE_FACTOR: 0.35
|
23 |
+
TEST_SET: 'val2017'
|
24 |
+
TRAIN_SET: 'train2017'
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 17
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 192
|
33 |
+
- 256
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 48
|
36 |
+
- 64
|
37 |
+
SIGMA: 2
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 32
|
61 |
+
- 64
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 32
|
73 |
+
- 64
|
74 |
+
- 128
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 32
|
87 |
+
- 64
|
88 |
+
- 128
|
89 |
+
- 256
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 32
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 32
|
111 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
112 |
+
BBOX_THRE: 1.0
|
113 |
+
IMAGE_THRE: 0.0
|
114 |
+
IN_VIS_THRE: 0.2
|
115 |
+
MODEL_FILE: ''
|
116 |
+
NMS_THRE: 1.0
|
117 |
+
OKS_THRE: 0.9
|
118 |
+
USE_GT_BBOX: true
|
119 |
+
FLIP_TEST: true
|
120 |
+
POST_PROCESS: true
|
121 |
+
SHIFT_HEATMAP: true
|
122 |
+
DEBUG:
|
123 |
+
DEBUG: true
|
124 |
+
SAVE_BATCH_IMAGES_GT: true
|
125 |
+
SAVE_BATCH_IMAGES_PRED: true
|
126 |
+
SAVE_HEATMAPS_GT: true
|
127 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: 'coco'
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: 0.3
|
20 |
+
ROOT: 'data/coco/'
|
21 |
+
ROT_FACTOR: 45
|
22 |
+
SCALE_FACTOR: 0.35
|
23 |
+
TEST_SET: 'val2017'
|
24 |
+
TRAIN_SET: 'train2017'
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 17
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 288
|
33 |
+
- 384
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 72
|
36 |
+
- 96
|
37 |
+
SIGMA: 3
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 32
|
61 |
+
- 64
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 32
|
73 |
+
- 64
|
74 |
+
- 128
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 32
|
87 |
+
- 64
|
88 |
+
- 128
|
89 |
+
- 256
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 32
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 32
|
111 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
112 |
+
BBOX_THRE: 1.0
|
113 |
+
IMAGE_THRE: 0.0
|
114 |
+
IN_VIS_THRE: 0.2
|
115 |
+
MODEL_FILE: ''
|
116 |
+
NMS_THRE: 1.0
|
117 |
+
OKS_THRE: 0.9
|
118 |
+
USE_GT_BBOX: true
|
119 |
+
FLIP_TEST: true
|
120 |
+
POST_PROCESS: true
|
121 |
+
SHIFT_HEATMAP: true
|
122 |
+
DEBUG:
|
123 |
+
DEBUG: true
|
124 |
+
SAVE_BATCH_IMAGES_GT: true
|
125 |
+
SAVE_BATCH_IMAGES_PRED: true
|
126 |
+
SAVE_HEATMAPS_GT: true
|
127 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: 'coco'
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: 0.3
|
20 |
+
ROOT: 'data/coco/'
|
21 |
+
ROT_FACTOR: 45
|
22 |
+
SCALE_FACTOR: 0.35
|
23 |
+
TEST_SET: 'val2017'
|
24 |
+
TRAIN_SET: 'train2017'
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 17
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 192
|
33 |
+
- 256
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 48
|
36 |
+
- 64
|
37 |
+
SIGMA: 2
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 48
|
61 |
+
- 96
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 48
|
73 |
+
- 96
|
74 |
+
- 192
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 48
|
87 |
+
- 96
|
88 |
+
- 192
|
89 |
+
- 384
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 32
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 32
|
111 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
112 |
+
BBOX_THRE: 1.0
|
113 |
+
IMAGE_THRE: 0.0
|
114 |
+
IN_VIS_THRE: 0.2
|
115 |
+
MODEL_FILE: ''
|
116 |
+
NMS_THRE: 1.0
|
117 |
+
OKS_THRE: 0.9
|
118 |
+
USE_GT_BBOX: true
|
119 |
+
FLIP_TEST: true
|
120 |
+
POST_PROCESS: true
|
121 |
+
SHIFT_HEATMAP: true
|
122 |
+
DEBUG:
|
123 |
+
DEBUG: true
|
124 |
+
SAVE_BATCH_IMAGES_GT: true
|
125 |
+
SAVE_BATCH_IMAGES_PRED: true
|
126 |
+
SAVE_HEATMAPS_GT: true
|
127 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: 'coco'
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: 0.3
|
20 |
+
ROOT: 'data/coco/'
|
21 |
+
ROT_FACTOR: 45
|
22 |
+
SCALE_FACTOR: 0.35
|
23 |
+
TEST_SET: 'val2017'
|
24 |
+
TRAIN_SET: 'train2017'
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 17
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 288
|
33 |
+
- 384
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 72
|
36 |
+
- 96
|
37 |
+
SIGMA: 3
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 48
|
61 |
+
- 96
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 48
|
73 |
+
- 96
|
74 |
+
- 192
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 48
|
87 |
+
- 96
|
88 |
+
- 192
|
89 |
+
- 384
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 24
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 24
|
111 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
112 |
+
BBOX_THRE: 1.0
|
113 |
+
IMAGE_THRE: 0.0
|
114 |
+
IN_VIS_THRE: 0.2
|
115 |
+
MODEL_FILE: ''
|
116 |
+
NMS_THRE: 1.0
|
117 |
+
OKS_THRE: 0.9
|
118 |
+
USE_GT_BBOX: true
|
119 |
+
FLIP_TEST: true
|
120 |
+
POST_PROCESS: true
|
121 |
+
SHIFT_HEATMAP: true
|
122 |
+
DEBUG:
|
123 |
+
DEBUG: true
|
124 |
+
SAVE_BATCH_IMAGES_GT: true
|
125 |
+
SAVE_BATCH_IMAGES_PRED: true
|
126 |
+
SAVE_HEATMAPS_GT: true
|
127 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 192
|
27 |
+
- 256
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 48
|
30 |
+
- 64
|
31 |
+
SIGMA: 2
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 101
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 288
|
27 |
+
- 384
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 72
|
30 |
+
- 96
|
31 |
+
SIGMA: 3
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 101
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 192
|
27 |
+
- 256
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 48
|
30 |
+
- 64
|
31 |
+
SIGMA: 2
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 152
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 288
|
27 |
+
- 384
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 72
|
30 |
+
- 96
|
31 |
+
SIGMA: 3
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 152
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 192
|
27 |
+
- 256
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 48
|
30 |
+
- 64
|
31 |
+
SIGMA: 2
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 50
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: 'coco'
|
16 |
+
ROOT: 'data/coco/'
|
17 |
+
TEST_SET: 'val2017'
|
18 |
+
TRAIN_SET: 'train2017'
|
19 |
+
FLIP: true
|
20 |
+
ROT_FACTOR: 40
|
21 |
+
SCALE_FACTOR: 0.3
|
22 |
+
MODEL:
|
23 |
+
NAME: 'pose_resnet'
|
24 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
|
25 |
+
IMAGE_SIZE:
|
26 |
+
- 288
|
27 |
+
- 384
|
28 |
+
HEATMAP_SIZE:
|
29 |
+
- 72
|
30 |
+
- 96
|
31 |
+
SIGMA: 3
|
32 |
+
NUM_JOINTS: 17
|
33 |
+
TARGET_TYPE: 'gaussian'
|
34 |
+
EXTRA:
|
35 |
+
FINAL_CONV_KERNEL: 1
|
36 |
+
DECONV_WITH_BIAS: false
|
37 |
+
NUM_DECONV_LAYERS: 3
|
38 |
+
NUM_DECONV_FILTERS:
|
39 |
+
- 256
|
40 |
+
- 256
|
41 |
+
- 256
|
42 |
+
NUM_DECONV_KERNELS:
|
43 |
+
- 4
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
NUM_LAYERS: 50
|
47 |
+
LOSS:
|
48 |
+
USE_TARGET_WEIGHT: true
|
49 |
+
TRAIN:
|
50 |
+
BATCH_SIZE_PER_GPU: 32
|
51 |
+
SHUFFLE: true
|
52 |
+
BEGIN_EPOCH: 0
|
53 |
+
END_EPOCH: 140
|
54 |
+
OPTIMIZER: 'adam'
|
55 |
+
LR: 0.001
|
56 |
+
LR_FACTOR: 0.1
|
57 |
+
LR_STEP:
|
58 |
+
- 90
|
59 |
+
- 120
|
60 |
+
WD: 0.0001
|
61 |
+
GAMMA1: 0.99
|
62 |
+
GAMMA2: 0.0
|
63 |
+
MOMENTUM: 0.9
|
64 |
+
NESTEROV: false
|
65 |
+
TEST:
|
66 |
+
BATCH_SIZE_PER_GPU: 32
|
67 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
68 |
+
BBOX_THRE: 1.0
|
69 |
+
IMAGE_THRE: 0.0
|
70 |
+
IN_VIS_THRE: 0.2
|
71 |
+
MODEL_FILE: ''
|
72 |
+
NMS_THRE: 1.0
|
73 |
+
OKS_THRE: 0.9
|
74 |
+
FLIP_TEST: true
|
75 |
+
POST_PROCESS: true
|
76 |
+
SHIFT_HEATMAP: true
|
77 |
+
USE_GT_BBOX: true
|
78 |
+
DEBUG:
|
79 |
+
DEBUG: true
|
80 |
+
SAVE_BATCH_IMAGES_GT: true
|
81 |
+
SAVE_BATCH_IMAGES_PRED: true
|
82 |
+
SAVE_HEATMAPS_GT: true
|
83 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: mpii
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: -1.0
|
20 |
+
ROOT: 'data/mpii/'
|
21 |
+
ROT_FACTOR: 30
|
22 |
+
SCALE_FACTOR: 0.25
|
23 |
+
TEST_SET: valid
|
24 |
+
TRAIN_SET: train
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 16
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 256
|
33 |
+
- 256
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 64
|
36 |
+
- 64
|
37 |
+
SIGMA: 2
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 32
|
61 |
+
- 64
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 32
|
73 |
+
- 64
|
74 |
+
- 128
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 32
|
87 |
+
- 64
|
88 |
+
- 128
|
89 |
+
- 256
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 32
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 32
|
111 |
+
MODEL_FILE: ''
|
112 |
+
FLIP_TEST: true
|
113 |
+
POST_PROCESS: true
|
114 |
+
SHIFT_HEATMAP: true
|
115 |
+
DEBUG:
|
116 |
+
DEBUG: true
|
117 |
+
SAVE_BATCH_IMAGES_GT: true
|
118 |
+
SAVE_BATCH_IMAGES_PRED: true
|
119 |
+
SAVE_HEATMAPS_GT: true
|
120 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: true
|
15 |
+
DATASET: mpii
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: -1.0
|
20 |
+
ROOT: 'data/mpii/'
|
21 |
+
ROT_FACTOR: 30
|
22 |
+
SCALE_FACTOR: 0.25
|
23 |
+
TEST_SET: valid
|
24 |
+
TRAIN_SET: train
|
25 |
+
MODEL:
|
26 |
+
INIT_WEIGHTS: true
|
27 |
+
NAME: pose_hrnet
|
28 |
+
NUM_JOINTS: 16
|
29 |
+
PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
|
30 |
+
TARGET_TYPE: gaussian
|
31 |
+
IMAGE_SIZE:
|
32 |
+
- 256
|
33 |
+
- 256
|
34 |
+
HEATMAP_SIZE:
|
35 |
+
- 64
|
36 |
+
- 64
|
37 |
+
SIGMA: 2
|
38 |
+
EXTRA:
|
39 |
+
PRETRAINED_LAYERS:
|
40 |
+
- 'conv1'
|
41 |
+
- 'bn1'
|
42 |
+
- 'conv2'
|
43 |
+
- 'bn2'
|
44 |
+
- 'layer1'
|
45 |
+
- 'transition1'
|
46 |
+
- 'stage2'
|
47 |
+
- 'transition2'
|
48 |
+
- 'stage3'
|
49 |
+
- 'transition3'
|
50 |
+
- 'stage4'
|
51 |
+
FINAL_CONV_KERNEL: 1
|
52 |
+
STAGE2:
|
53 |
+
NUM_MODULES: 1
|
54 |
+
NUM_BRANCHES: 2
|
55 |
+
BLOCK: BASIC
|
56 |
+
NUM_BLOCKS:
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
NUM_CHANNELS:
|
60 |
+
- 48
|
61 |
+
- 96
|
62 |
+
FUSE_METHOD: SUM
|
63 |
+
STAGE3:
|
64 |
+
NUM_MODULES: 4
|
65 |
+
NUM_BRANCHES: 3
|
66 |
+
BLOCK: BASIC
|
67 |
+
NUM_BLOCKS:
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
NUM_CHANNELS:
|
72 |
+
- 48
|
73 |
+
- 96
|
74 |
+
- 192
|
75 |
+
FUSE_METHOD: SUM
|
76 |
+
STAGE4:
|
77 |
+
NUM_MODULES: 3
|
78 |
+
NUM_BRANCHES: 4
|
79 |
+
BLOCK: BASIC
|
80 |
+
NUM_BLOCKS:
|
81 |
+
- 4
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
NUM_CHANNELS:
|
86 |
+
- 48
|
87 |
+
- 96
|
88 |
+
- 192
|
89 |
+
- 384
|
90 |
+
FUSE_METHOD: SUM
|
91 |
+
LOSS:
|
92 |
+
USE_TARGET_WEIGHT: true
|
93 |
+
TRAIN:
|
94 |
+
BATCH_SIZE_PER_GPU: 32
|
95 |
+
SHUFFLE: true
|
96 |
+
BEGIN_EPOCH: 0
|
97 |
+
END_EPOCH: 210
|
98 |
+
OPTIMIZER: adam
|
99 |
+
LR: 0.001
|
100 |
+
LR_FACTOR: 0.1
|
101 |
+
LR_STEP:
|
102 |
+
- 170
|
103 |
+
- 200
|
104 |
+
WD: 0.0001
|
105 |
+
GAMMA1: 0.99
|
106 |
+
GAMMA2: 0.0
|
107 |
+
MOMENTUM: 0.9
|
108 |
+
NESTEROV: false
|
109 |
+
TEST:
|
110 |
+
BATCH_SIZE_PER_GPU: 32
|
111 |
+
MODEL_FILE: ''
|
112 |
+
FLIP_TEST: true
|
113 |
+
POST_PROCESS: true
|
114 |
+
SHIFT_HEATMAP: true
|
115 |
+
DEBUG:
|
116 |
+
DEBUG: true
|
117 |
+
SAVE_BATCH_IMAGES_GT: true
|
118 |
+
SAVE_BATCH_IMAGES_PRED: true
|
119 |
+
SAVE_HEATMAPS_GT: true
|
120 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: mpii
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: -1.0
|
20 |
+
ROOT: 'data/mpii/'
|
21 |
+
ROT_FACTOR: 30
|
22 |
+
SCALE_FACTOR: 0.25
|
23 |
+
TEST_SET: valid
|
24 |
+
TRAIN_SET: train
|
25 |
+
MODEL:
|
26 |
+
NAME: 'pose_resnet'
|
27 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
|
28 |
+
IMAGE_SIZE:
|
29 |
+
- 256
|
30 |
+
- 256
|
31 |
+
HEATMAP_SIZE:
|
32 |
+
- 64
|
33 |
+
- 64
|
34 |
+
SIGMA: 2
|
35 |
+
NUM_JOINTS: 16
|
36 |
+
TARGET_TYPE: 'gaussian'
|
37 |
+
EXTRA:
|
38 |
+
FINAL_CONV_KERNEL: 1
|
39 |
+
DECONV_WITH_BIAS: false
|
40 |
+
NUM_DECONV_LAYERS: 3
|
41 |
+
NUM_DECONV_FILTERS:
|
42 |
+
- 256
|
43 |
+
- 256
|
44 |
+
- 256
|
45 |
+
NUM_DECONV_KERNELS:
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
NUM_LAYERS: 101
|
50 |
+
LOSS:
|
51 |
+
USE_TARGET_WEIGHT: true
|
52 |
+
TRAIN:
|
53 |
+
BATCH_SIZE_PER_GPU: 32
|
54 |
+
SHUFFLE: true
|
55 |
+
BEGIN_EPOCH: 0
|
56 |
+
END_EPOCH: 140
|
57 |
+
OPTIMIZER: 'adam'
|
58 |
+
LR: 0.001
|
59 |
+
LR_FACTOR: 0.1
|
60 |
+
LR_STEP:
|
61 |
+
- 90
|
62 |
+
- 120
|
63 |
+
WD: 0.0001
|
64 |
+
GAMMA1: 0.99
|
65 |
+
GAMMA2: 0.0
|
66 |
+
MOMENTUM: 0.9
|
67 |
+
NESTEROV: false
|
68 |
+
TEST:
|
69 |
+
BATCH_SIZE_PER_GPU: 32
|
70 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
71 |
+
BBOX_THRE: 1.0
|
72 |
+
IMAGE_THRE: 0.0
|
73 |
+
IN_VIS_THRE: 0.2
|
74 |
+
MODEL_FILE: ''
|
75 |
+
NMS_THRE: 1.0
|
76 |
+
OKS_THRE: 0.9
|
77 |
+
FLIP_TEST: true
|
78 |
+
POST_PROCESS: true
|
79 |
+
SHIFT_HEATMAP: true
|
80 |
+
USE_GT_BBOX: true
|
81 |
+
DEBUG:
|
82 |
+
DEBUG: true
|
83 |
+
SAVE_BATCH_IMAGES_GT: true
|
84 |
+
SAVE_BATCH_IMAGES_PRED: true
|
85 |
+
SAVE_HEATMAPS_GT: true
|
86 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: mpii
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: -1.0
|
20 |
+
ROOT: 'data/mpii/'
|
21 |
+
ROT_FACTOR: 30
|
22 |
+
SCALE_FACTOR: 0.25
|
23 |
+
TEST_SET: valid
|
24 |
+
TRAIN_SET: train
|
25 |
+
MODEL:
|
26 |
+
NAME: 'pose_resnet'
|
27 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
|
28 |
+
IMAGE_SIZE:
|
29 |
+
- 256
|
30 |
+
- 256
|
31 |
+
HEATMAP_SIZE:
|
32 |
+
- 64
|
33 |
+
- 64
|
34 |
+
SIGMA: 2
|
35 |
+
NUM_JOINTS: 16
|
36 |
+
TARGET_TYPE: 'gaussian'
|
37 |
+
EXTRA:
|
38 |
+
FINAL_CONV_KERNEL: 1
|
39 |
+
DECONV_WITH_BIAS: false
|
40 |
+
NUM_DECONV_LAYERS: 3
|
41 |
+
NUM_DECONV_FILTERS:
|
42 |
+
- 256
|
43 |
+
- 256
|
44 |
+
- 256
|
45 |
+
NUM_DECONV_KERNELS:
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
NUM_LAYERS: 152
|
50 |
+
LOSS:
|
51 |
+
USE_TARGET_WEIGHT: true
|
52 |
+
TRAIN:
|
53 |
+
BATCH_SIZE_PER_GPU: 32
|
54 |
+
SHUFFLE: true
|
55 |
+
BEGIN_EPOCH: 0
|
56 |
+
END_EPOCH: 140
|
57 |
+
OPTIMIZER: 'adam'
|
58 |
+
LR: 0.001
|
59 |
+
LR_FACTOR: 0.1
|
60 |
+
LR_STEP:
|
61 |
+
- 90
|
62 |
+
- 120
|
63 |
+
WD: 0.0001
|
64 |
+
GAMMA1: 0.99
|
65 |
+
GAMMA2: 0.0
|
66 |
+
MOMENTUM: 0.9
|
67 |
+
NESTEROV: false
|
68 |
+
TEST:
|
69 |
+
BATCH_SIZE_PER_GPU: 32
|
70 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
71 |
+
BBOX_THRE: 1.0
|
72 |
+
IMAGE_THRE: 0.0
|
73 |
+
IN_VIS_THRE: 0.2
|
74 |
+
MODEL_FILE: ''
|
75 |
+
NMS_THRE: 1.0
|
76 |
+
OKS_THRE: 0.9
|
77 |
+
FLIP_TEST: true
|
78 |
+
POST_PROCESS: true
|
79 |
+
SHIFT_HEATMAP: true
|
80 |
+
USE_GT_BBOX: true
|
81 |
+
DEBUG:
|
82 |
+
DEBUG: true
|
83 |
+
SAVE_BATCH_IMAGES_GT: true
|
84 |
+
SAVE_BATCH_IMAGES_PRED: true
|
85 |
+
SAVE_HEATMAPS_GT: true
|
86 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AUTO_RESUME: true
|
2 |
+
CUDNN:
|
3 |
+
BENCHMARK: true
|
4 |
+
DETERMINISTIC: false
|
5 |
+
ENABLED: true
|
6 |
+
DATA_DIR: ''
|
7 |
+
GPUS: (0,1,2,3)
|
8 |
+
OUTPUT_DIR: 'output'
|
9 |
+
LOG_DIR: 'log'
|
10 |
+
WORKERS: 24
|
11 |
+
PRINT_FREQ: 100
|
12 |
+
|
13 |
+
DATASET:
|
14 |
+
COLOR_RGB: false
|
15 |
+
DATASET: mpii
|
16 |
+
DATA_FORMAT: jpg
|
17 |
+
FLIP: true
|
18 |
+
NUM_JOINTS_HALF_BODY: 8
|
19 |
+
PROB_HALF_BODY: -1.0
|
20 |
+
ROOT: 'data/mpii/'
|
21 |
+
ROT_FACTOR: 30
|
22 |
+
SCALE_FACTOR: 0.25
|
23 |
+
TEST_SET: valid
|
24 |
+
TRAIN_SET: train
|
25 |
+
MODEL:
|
26 |
+
NAME: 'pose_resnet'
|
27 |
+
PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
|
28 |
+
IMAGE_SIZE:
|
29 |
+
- 256
|
30 |
+
- 256
|
31 |
+
HEATMAP_SIZE:
|
32 |
+
- 64
|
33 |
+
- 64
|
34 |
+
SIGMA: 2
|
35 |
+
NUM_JOINTS: 16
|
36 |
+
TARGET_TYPE: 'gaussian'
|
37 |
+
EXTRA:
|
38 |
+
FINAL_CONV_KERNEL: 1
|
39 |
+
DECONV_WITH_BIAS: false
|
40 |
+
NUM_DECONV_LAYERS: 3
|
41 |
+
NUM_DECONV_FILTERS:
|
42 |
+
- 256
|
43 |
+
- 256
|
44 |
+
- 256
|
45 |
+
NUM_DECONV_KERNELS:
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
NUM_LAYERS: 50
|
50 |
+
LOSS:
|
51 |
+
USE_TARGET_WEIGHT: true
|
52 |
+
TRAIN:
|
53 |
+
BATCH_SIZE_PER_GPU: 32
|
54 |
+
SHUFFLE: true
|
55 |
+
BEGIN_EPOCH: 0
|
56 |
+
END_EPOCH: 140
|
57 |
+
OPTIMIZER: 'adam'
|
58 |
+
LR: 0.001
|
59 |
+
LR_FACTOR: 0.1
|
60 |
+
LR_STEP:
|
61 |
+
- 90
|
62 |
+
- 120
|
63 |
+
WD: 0.0001
|
64 |
+
GAMMA1: 0.99
|
65 |
+
GAMMA2: 0.0
|
66 |
+
MOMENTUM: 0.9
|
67 |
+
NESTEROV: false
|
68 |
+
TEST:
|
69 |
+
BATCH_SIZE_PER_GPU: 32
|
70 |
+
COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
|
71 |
+
BBOX_THRE: 1.0
|
72 |
+
IMAGE_THRE: 0.0
|
73 |
+
IN_VIS_THRE: 0.2
|
74 |
+
MODEL_FILE: ''
|
75 |
+
NMS_THRE: 1.0
|
76 |
+
OKS_THRE: 0.9
|
77 |
+
FLIP_TEST: true
|
78 |
+
POST_PROCESS: true
|
79 |
+
SHIFT_HEATMAP: true
|
80 |
+
USE_GT_BBOX: true
|
81 |
+
DEBUG:
|
82 |
+
DEBUG: true
|
83 |
+
SAVE_BATCH_IMAGES_GT: true
|
84 |
+
SAVE_BATCH_IMAGES_PRED: true
|
85 |
+
SAVE_HEATMAPS_GT: true
|
86 |
+
SAVE_HEATMAPS_PRED: true
|
VideoToNPZ/lib/pose/hrnet/lib/Makefile
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
all:
|
2 |
+
cd nms; python setup_linux.py build_ext --inplace; rm -rf build; cd ../../
|
3 |
+
clean:
|
4 |
+
cd nms; rm *.so; cd ../../
|
VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ------------------------------------------------------------------------------
|
2 |
+
# Copyright (c) Microsoft
|
3 |
+
# Licensed under the MIT License.
|
4 |
+
# Written by Bin Xiao ([email protected])
|
5 |
+
# ------------------------------------------------------------------------------
|
6 |
+
|
7 |
+
from .default import _C as cfg
|
8 |
+
from .default import update_config
|
9 |
+
from .models import MODEL_EXTRAS
|
VideoToNPZ/lib/pose/hrnet/lib/config/default.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao ([email protected])
# ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from yacs.config import CfgNode as CN


_C = CN()

_C.OUTPUT_DIR = ''
_C.LOG_DIR = ''
_C.DATA_DIR = ''
_C.GPUS = (0,)
_C.WORKERS = 4
_C.PRINT_FREQ = 20
_C.AUTO_RESUME = False
_C.PIN_MEMORY = True
_C.RANK = 0

# Cudnn related params
_C.CUDNN = CN()
_C.CUDNN.BENCHMARK = True
_C.CUDNN.DETERMINISTIC = False
_C.CUDNN.ENABLED = True

# common params for NETWORK
_C.MODEL = CN()
_C.MODEL.NAME = 'pose_hrnet'
_C.MODEL.INIT_WEIGHTS = True
_C.MODEL.PRETRAINED = ''
_C.MODEL.NUM_JOINTS = 17
_C.MODEL.TAG_PER_JOINT = True
_C.MODEL.TARGET_TYPE = 'gaussian'
_C.MODEL.IMAGE_SIZE = [256, 256]  # width * height, ex: 192 * 256
_C.MODEL.HEATMAP_SIZE = [64, 64]  # width * height, ex: 24 * 32
_C.MODEL.SIGMA = 2
_C.MODEL.EXTRA = CN(new_allowed=True)

_C.LOSS = CN()
_C.LOSS.USE_OHKM = False
_C.LOSS.TOPK = 8
_C.LOSS.USE_TARGET_WEIGHT = True
_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False

# DATASET related params
_C.DATASET = CN()
_C.DATASET.ROOT = ''
_C.DATASET.DATASET = 'mpii'
_C.DATASET.TRAIN_SET = 'train'
_C.DATASET.TEST_SET = 'valid'
_C.DATASET.DATA_FORMAT = 'jpg'
_C.DATASET.HYBRID_JOINTS_TYPE = ''
_C.DATASET.SELECT_DATA = False

# training data augmentation
_C.DATASET.FLIP = True
_C.DATASET.SCALE_FACTOR = 0.25
_C.DATASET.ROT_FACTOR = 30
_C.DATASET.PROB_HALF_BODY = 0.0
_C.DATASET.NUM_JOINTS_HALF_BODY = 8
_C.DATASET.COLOR_RGB = False

# train
_C.TRAIN = CN()

_C.TRAIN.LR_FACTOR = 0.1
_C.TRAIN.LR_STEP = [90, 110]
_C.TRAIN.LR = 0.001

_C.TRAIN.OPTIMIZER = 'adam'
_C.TRAIN.MOMENTUM = 0.9
_C.TRAIN.WD = 0.0001
_C.TRAIN.NESTEROV = False
_C.TRAIN.GAMMA1 = 0.99
_C.TRAIN.GAMMA2 = 0.0

_C.TRAIN.BEGIN_EPOCH = 0
_C.TRAIN.END_EPOCH = 140

_C.TRAIN.RESUME = False
_C.TRAIN.CHECKPOINT = ''

_C.TRAIN.BATCH_SIZE_PER_GPU = 32
_C.TRAIN.SHUFFLE = True

# testing
_C.TEST = CN()

# size of images for each device
_C.TEST.BATCH_SIZE_PER_GPU = 32
# Test Model Epoch
_C.TEST.FLIP_TEST = False
_C.TEST.POST_PROCESS = False
_C.TEST.SHIFT_HEATMAP = False

_C.TEST.USE_GT_BBOX = False

# nms
_C.TEST.IMAGE_THRE = 0.1
_C.TEST.NMS_THRE = 0.6
_C.TEST.SOFT_NMS = False
_C.TEST.OKS_THRE = 0.5
_C.TEST.IN_VIS_THRE = 0.0
_C.TEST.COCO_BBOX_FILE = ''
_C.TEST.BBOX_THRE = 1.0
_C.TEST.MODEL_FILE = ''

# debug
_C.DEBUG = CN()
_C.DEBUG.DEBUG = False
_C.DEBUG.SAVE_BATCH_IMAGES_GT = False
_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False
_C.DEBUG.SAVE_HEATMAPS_GT = False
_C.DEBUG.SAVE_HEATMAPS_PRED = False


def update_config(cfg, args):
    cfg.defrost()
    cfg.merge_from_file(args.cfg)
    cfg.merge_from_list(args.opts)

    if args.modelDir:
        cfg.OUTPUT_DIR = args.modelDir

    # if args.logDir:
    #     cfg.LOG_DIR = args.logDir
    #
    # if args.dataDir:
    #     cfg.DATA_DIR = args.dataDir
    #
    # cfg.DATASET.ROOT = os.path.join(
    #     cfg.DATA_DIR, cfg.DATASET.ROOT
    # )
    #
    # cfg.MODEL.PRETRAINED = os.path.join(
    #     cfg.DATA_DIR, cfg.MODEL.PRETRAINED
    # )
    #
    # if cfg.TEST.MODEL_FILE:
    #     cfg.TEST.MODEL_FILE = os.path.join(
    #         cfg.DATA_DIR, cfg.TEST.MODEL_FILE
    #     )

    cfg.freeze()


if __name__ == '__main__':
    import sys
    with open(sys.argv[1], 'w') as f:
        print(_C, file=f)
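A minimal usage sketch for the configuration above (not part of the commit): it assumes config/__init__.py re-exports _C as cfg alongside update_config, as in upstream HRNet, and the YAML path is only an illustrative experiment file; the argparse attributes (cfg, opts, modelDir) are exactly the ones update_config reads.

# Hypothetical usage sketch, not part of this commit; assumes lib/ is on sys.path.
import argparse
from config import cfg, update_config  # assumes config/__init__.py re-exports _C as cfg

parser = argparse.ArgumentParser()
parser.add_argument('--cfg', required=True, help='path to an experiment YAML')
parser.add_argument('--modelDir', default='', help='optional override for OUTPUT_DIR')
parser.add_argument('opts', nargs=argparse.REMAINDER, help='KEY VALUE pairs for merge_from_list')

args = parser.parse_args(['--cfg', 'experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml'])
update_config(cfg, args)  # merge the YAML, apply CLI overrides, then freeze
print(cfg.MODEL.NAME, cfg.MODEL.IMAGE_SIZE, cfg.MODEL.NUM_JOINTS)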
VideoToNPZ/lib/pose/hrnet/lib/config/models.py
ADDED
@@ -0,0 +1,58 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao ([email protected])
# ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from yacs.config import CfgNode as CN


# pose_resnet related params
POSE_RESNET = CN()
POSE_RESNET.NUM_LAYERS = 50
POSE_RESNET.DECONV_WITH_BIAS = False
POSE_RESNET.NUM_DECONV_LAYERS = 3
POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256]
POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4]
POSE_RESNET.FINAL_CONV_KERNEL = 1
POSE_RESNET.PRETRAINED_LAYERS = ['*']

# pose_multi_resolution_net related params
POSE_HIGH_RESOLUTION_NET = CN()
POSE_HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*']
POSE_HIGH_RESOLUTION_NET.STEM_INPLANES = 64
POSE_HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1

POSE_HIGH_RESOLUTION_NET.STAGE2 = CN()
POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1
POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2
POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4]
POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64]
POSE_HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC'
POSE_HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'SUM'

POSE_HIGH_RESOLUTION_NET.STAGE3 = CN()
POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1
POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3
POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4]
POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128]
POSE_HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC'
POSE_HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'SUM'

POSE_HIGH_RESOLUTION_NET.STAGE4 = CN()
POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1
POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4
POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
POSE_HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC'
POSE_HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'SUM'


MODEL_EXTRAS = {
    'pose_resnet': POSE_RESNET,
    'pose_high_resolution_net': POSE_HIGH_RESOLUTION_NET,
}
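A small lookup sketch for the defaults above (not part of the commit, and assuming lib/ is on sys.path): MODEL_EXTRAS simply maps a model name to its CfgNode of stage settings; the experiment YAMLs fill cfg.MODEL.EXTRA with the same keys, so e.g. the W48 configs use 48/96/192/384 channels instead of the 32/64/128/256 defaults shown here.

# Hypothetical sketch, not part of this commit: inspect the HRNet stage defaults by name.
from config.models import MODEL_EXTRAS  # assumes lib/ is on sys.path

extra = MODEL_EXTRAS['pose_high_resolution_net']
print(extra.STAGE2.NUM_CHANNELS)  # [32, 64] -> two parallel branches after stage 2
print(extra.STAGE4.NUM_BRANCHES)  # 4 resolution branches in the final stage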
VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py
ADDED
@@ -0,0 +1,16 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao ([email protected])
# ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import models.pose_resnet
import models.pose_hrnet