Amanpreet committed
Commit 1cdc47e · 1 Parent(s): 4276ea6
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +7 -0
  3. VideoToNPZ/INFERENCE_EN.md +2 -0
  4. VideoToNPZ/checkpoint/gastnet/81_frame_model.bin +3 -0
  5. VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth +3 -0
  6. VideoToNPZ/checkpoint/yolov3/yolov3.weights +3 -0
  7. VideoToNPZ/common/arguments.py +86 -0
  8. VideoToNPZ/common/camera.py +63 -0
  9. VideoToNPZ/common/generators.py +236 -0
  10. VideoToNPZ/common/graph_utils.py +45 -0
  11. VideoToNPZ/common/loss.py +90 -0
  12. VideoToNPZ/common/quaternion.py +36 -0
  13. VideoToNPZ/common/skeleton.py +81 -0
  14. VideoToNPZ/data/data_utils.py +95 -0
  15. VideoToNPZ/gen_skes.py +116 -0
  16. VideoToNPZ/lib/detector/__init__.py +6 -0
  17. VideoToNPZ/lib/detector/yolov3/__init__.py +0 -0
  18. VideoToNPZ/lib/detector/yolov3/bbox.py +111 -0
  19. VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg +134 -0
  20. VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg +258 -0
  21. VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg +258 -0
  22. VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg +789 -0
  23. VideoToNPZ/lib/detector/yolov3/darknet.py +433 -0
  24. VideoToNPZ/lib/detector/yolov3/data/coco.names +80 -0
  25. VideoToNPZ/lib/detector/yolov3/data/pallete +0 -0
  26. VideoToNPZ/lib/detector/yolov3/data/voc.names +20 -0
  27. VideoToNPZ/lib/detector/yolov3/human_detector.py +155 -0
  28. VideoToNPZ/lib/detector/yolov3/preprocess.py +63 -0
  29. VideoToNPZ/lib/detector/yolov3/util.py +225 -0
  30. VideoToNPZ/lib/pose/__init__.py +10 -0
  31. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml +127 -0
  32. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml +127 -0
  33. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml +127 -0
  34. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml +127 -0
  35. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  36. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  37. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  38. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  39. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  40. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  41. VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml +120 -0
  42. VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml +120 -0
  43. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  44. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  45. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  46. VideoToNPZ/lib/pose/hrnet/lib/Makefile +4 -0
  47. VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py +9 -0
  48. VideoToNPZ/lib/pose/hrnet/lib/config/default.py +160 -0
  49. VideoToNPZ/lib/pose/hrnet/lib/config/models.py +58 -0
  50. VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py +16 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.weights filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ venv/
+ __pycache__/
+ *.pyc
+ *.bvh
+ *.obj
+ *.npz
+ *.mp4
VideoToNPZ/INFERENCE_EN.md ADDED
@@ -0,0 +1,2 @@
+
+ python gen_skes.py -v baseball.mp4
VideoToNPZ/checkpoint/gastnet/81_frame_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3150eb3125ca66242a888fd06b4eb7d8a8b755607370225c24f0b9c794d35cc4
+ size 28333160
VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95e0fec3194826d5e3f806ea89be68bbb84517b114c3a32b3058c56610b5ef61
+ size 255061287
VideoToNPZ/checkpoint/yolov3/yolov3.weights ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297
+ size 248007048
VideoToNPZ/common/arguments.py ADDED
@@ -0,0 +1,86 @@
+ import argparse
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Training script')
+
+     # General arguments
+     parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME',
+         help='target dataset')  # h36m or humaneva
+     parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME',
+         help='2D detections to use')
+     parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST',
+         help='training subjects separated by comma')
+     parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST',
+         help='test subjects separated by comma')
+     parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST',
+         help='actions to train/test on, separated by comma, or * for all')
+     parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
+         help='checkpoint directory')
+     parser.add_argument('--checkpoint-frequency', default=10, type=int, metavar='N',
+         help='create a checkpoint every N epochs')
+     parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME',
+         help='checkpoint to resume (file name)')
+     parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME',
+         help='checkpoint to evaluate (file name)')
+     parser.add_argument('--render', action='store_true', help='visualize a particular video')
+     parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)')
+     parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images')
+
+     # Model arguments
+     parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training')
+     parser.add_argument('-arc', '--architecture', default='3,3,3', type=str, metavar='LAYERS',
+         help='filter widths separated by comma')
+     parser.add_argument('--causal', action='store_true', help='use causal convolutions for real-time processing')
+     parser.add_argument('-ch', '--channels', default=128, type=int, metavar='N',
+         help='number of channels in convolution layers')
+
+     # Experimental setting
+     parser.add_argument('-e', '--epochs', default=60, type=int, metavar='N', help='number of training epochs')
+     parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N',
+         help='batch size in terms of predicted frames')
+     parser.add_argument('-drop', '--dropout', default=0.05, type=float, metavar='P', help='dropout probability')
+     parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate')
+     parser.add_argument('-lrd', '--lr-decay', default=0.95, type=float, metavar='LR',
+         help='learning rate decay per epoch')
+     parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false',
+         help='disable train-time flipping')
+     parser.add_argument('-no-tta', '--no-test-time-augmentation', dest='test_time_augmentation', action='store_false',
+         help='disable test-time flipping')
+     parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction')
+     parser.add_argument('--downsample', default=5, type=int, metavar='FACTOR',
+         help='downsample frame rate by factor (semi-supervised)')
+     parser.add_argument('--no-eval', action='store_true',
+         help='disable epoch evaluation while training (small speed-up)')
+     parser.add_argument('--disable-optimizations', action='store_true',
+         help='disable optimized model for single-frame predictions')
+
+     # Visualization
+     parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render')
+     parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render')
+     parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render')
+     parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video')
+     parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video')
+     parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)')
+     parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates')
+     parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos')
+     parser.add_argument('--viz-no-ground-truth', action='store_true', help='do not show ground-truth poses')
+     parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames')
+     parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N')
+     parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size')
+
+     parser.set_defaults(bone_length_term=True)
+     parser.set_defaults(data_augmentation=True)
+     parser.set_defaults(test_time_augmentation=True)
+
+     args = parser.parse_args()
+     # Check invalid configuration
+     if args.resume and args.evaluate:
+         print('Invalid flags: --resume and --evaluate cannot be set at the same time')
+         exit()
+
+     if args.export_training_curves and args.no_eval:
+         print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time')
+         exit()
+
+     return args
VideoToNPZ/common/camera.py ADDED
@@ -0,0 +1,63 @@
+ import numpy as np
+ import torch
+
+ from tools.utils import wrap
+ from common.quaternion import qort, qinverse
+
+
+ def normalize_screen_coordinates(X, w, h):
+     assert X.shape[-1] == 2
+
+     # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
+     return X/w*2 - [1, h/w]
+
+
+ def image_coordinates(X, w, h):
+     assert X.shape[-1] == 2
+
+     # Reverse camera frame normalization
+     return (X + [1, h/w]) * w / 2
+
+
+ def world_to_camera(X, R, t):
+     Rt = wrap(qinverse, R)  # Invert rotation
+     return wrap(qort, np.tile(Rt, (*X.shape[:-1], 1)), X - t)  # Rotate and translate
+
+
+ def camera_to_world(X, R, t):
+     return wrap(qort, np.tile(R, (*X.shape[:-1], 1)), X) + t
+
+
+ def project_to_2d(X, camera_params):
+     """
+     Project 3D points to 2D using the Human3.6M camera projection function.
+     This is a differentiable and batched reimplementation of the original MATLAB script.
+
+     Arguments:
+     X -- 3D points in *camera space* to transform (N, *, 3)
+     camera_params -- intrinsic parameteres (N, 2+2+3+2=9)
+     """
+     assert X.shape[-1] == 3
+     assert len(camera_params.shape) == 2
+     assert camera_params.shape[-1] == 9
+     assert X.shape[0] == camera_params.shape[0]
+
+     while len(camera_params.shape) < len(X.shape):
+         camera_params = camera_params.unsqueeze(1)
+
+     f = camera_params[..., :2]
+     c = camera_params[..., 2:4]
+     k = camera_params[..., 4:7]
+     p = camera_params[..., 7:]
+
+     # XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
+     XX = X[..., :2] / X[..., 2:]
+     r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True)
+
+     radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True)
+     tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True)
+
+     XXX = XX*(radial + tan) + p*r2
+
+     return f*XXX + c
+
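The two screen-coordinate helpers above are exact inverses; a minimal sketch of the round trip on hypothetical pixel coordinates (frame size and point values are illustrative, not part of the commit):

    import numpy as np

    w, h = 1920, 1080
    pts = np.array([[960.0, 540.0], [100.0, 200.0]])   # hypothetical 2D keypoints in pixels

    # normalize_screen_coordinates: map [0, w] to [-1, 1], preserving the aspect ratio
    norm = pts / w * 2 - [1, h / w]
    # image_coordinates: undo the normalization
    back = (norm + [1, h / w]) * w / 2

    assert np.allclose(back, pts)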
VideoToNPZ/common/generators.py ADDED
@@ -0,0 +1,236 @@
+ from itertools import zip_longest
+ import numpy as np
+
+
+ class ChunkedGenerator:
+     """
+     Batched data generator, used for training.
+     The sequences are split into equal-length chunks and padded as necessary.
+
+     Arguments:
+     batch_size -- the batch size to use for training
+     cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
+     poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
+     poses_2d -- list of input 2D keypoints, one element for each video
+     chunk_length -- number of output frames to predict for each training example (usually 1)
+     pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
+     causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
+     shuffle -- randomly shuffle the dataset before each epoch
+     random_seed -- initial seed to use for the random generator
+     augment -- augment the dataset by flipping poses horizontally
+     kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
+     joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
+     """
+     def __init__(self, batch_size, cameras, poses_3d, poses_2d,
+                  chunk_length, pad=0, causal_shift=0,
+                  shuffle=True, random_seed=1234,
+                  augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
+                  endless=False):
+         assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
+         assert cameras is None or len(cameras) == len(poses_2d)
+
+         # Build lineage info
+         pairs = []  # (seq_idx, start_frame, end_frame, flip) tuples
+         for i in range(len(poses_2d)):
+             assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0]
+             n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length
+             offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2
+             bounds = np.arange(n_chunks + 1) * chunk_length - offset
+             augment_vector = np.full(len(bounds)-1, False, dtype=bool)
+             pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], augment_vector)
+             if augment:
+                 pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], ~augment_vector)
+
+         # Initialize buffers
+         if cameras is not None:
+             self.batch_cam = np.empty((batch_size, cameras[0].shape[-1]))
+         if poses_3d is not None:
+             self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1]))
+         self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1]))
+
+         self.num_batches = (len(pairs) + batch_size - 1) // batch_size
+         self.batch_size = batch_size
+         self.random = np.random.RandomState(random_seed)
+         self.pairs = pairs
+         self.shuffle = shuffle
+         self.pad = pad
+         self.causal_shift = causal_shift
+         self.endless = endless
+         self.state = None
+
+         self.cameras = cameras
+         self.poses_3d = poses_3d
+         self.poses_2d = poses_2d
+
+         self.augment = augment
+         self.kps_left = kps_left
+         self.kps_right = kps_right
+         self.joints_left = joints_left
+         self.joints_right = joints_right
+
+     def num_frames(self):
+         return self.num_batches * self.batch_size
+
+     def random_state(self):
+         return self.random
+
+     def set_random_state(self, random):
+         self.random = random
+
+     def augment_enabled(self):
+         return self.augment
+
+     def next_pairs(self):
+         if self.state is None:
+             if self.shuffle:
+                 pairs = self.random.permutation(self.pairs)
+             else:
+                 pairs = self.pairs
+             return 0, pairs
+         else:
+             return self.state
+
+     def next_epoch(self):
+         enabled = True
+         while enabled:
+             start_idx, pairs = self.next_pairs()
+             for b_i in range(start_idx, self.num_batches):
+                 chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size]
+                 for i, (seq_i, start_3d, end_3d, flip) in enumerate(chunks):
+                     start_2d = start_3d - self.pad - self.causal_shift
+                     end_2d = end_3d + self.pad - self.causal_shift
+
+                     # 2D poses
+                     seq_2d = self.poses_2d[seq_i]
+                     low_2d = max(start_2d, 0)
+                     high_2d = min(end_2d, seq_2d.shape[0])
+                     pad_left_2d = low_2d - start_2d
+                     pad_right_2d = end_2d - high_2d
+                     if pad_left_2d != 0 or pad_right_2d != 0:
+                         self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), "edge")
+                     else:
+                         self.batch_2d[i] = seq_2d[low_2d:high_2d]
+
+                     if flip:
+                         # Flip 2D keypoints
+                         self.batch_2d[i, :, :, 0] *= -1
+                         self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left]
+
+                     # 3D poses
+                     if self.poses_3d is not None:
+                         seq_3d = self.poses_3d[seq_i]
+                         low_3d = max(start_3d, 0)
+                         high_3d = min(end_3d, seq_3d.shape[0])
+                         pad_left_3d = low_3d - start_3d
+                         pad_right_3d = end_3d - high_3d
+                         if pad_left_3d != 0 or pad_right_3d != 0:
+                             self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), "edge")
+                         else:
+                             self.batch_3d[i] = seq_3d[low_3d:high_3d]
+
+                         if flip:
+                             # Flip 3D joints
+                             self.batch_3d[i, :, :, 0] *= -1
+                             self.batch_3d[i, :, self.joints_left + self.joints_right] = \
+                                 self.batch_3d[i, :, self.joints_right + self.joints_left]
+
+                     # Cameras
+                     if self.cameras is not None:
+                         self.batch_cam[i] = self.cameras[seq_i]
+                         if flip:
+                             # Flip horizontal distortion coefficients
+                             self.batch_cam[i, 2] *= -1
+                             self.batch_cam[i, 7] *= -1
+
+                 if self.endless:
+                     self.state = (b_i + 1, pairs)
+                 if self.poses_3d is None and self.cameras is None:
+                     yield None, None, self.batch_2d[:len(chunks)]
+                 elif self.poses_3d is not None and self.cameras is None:
+                     yield None, self.batch_3d[:len(chunks)], self.batch_2d[:(len(chunks))]
+                 elif self.poses_3d is None:
+                     yield self.batch_cam, None, self.batch_2d[:len(chunks)]
+                 else:
+                     yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)]
+
+             if self.endless:
+                 self.state = None
+             else:
+                 enabled = False
+
+
+ class UnchunkedGenerator:
+     """
+     Non-batched data generator, used for testing.
+     Sequences are returned one at a time (i.e. batch size = 1), without chunking.
+
+     If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2),
+     the second of which is a mirrored version of the first.
+
+     Arguments:
+     cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
+     poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
+     poses_2d -- list of input 2D keypoints, one element for each video
+     pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
+     causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
+     augment -- augment the dataset by flipping poses horizontally
+     kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
+     joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
+     """
+
+     def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0,
+                  augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None):
+         assert poses_3d is None or len(poses_3d) == len(poses_2d)
+         assert cameras is None or len(cameras) == len(poses_2d)
+
+         self.augment = augment
+         self.kps_left = kps_left
+         self.kps_right = kps_right
+         self.joints_left = joints_left
+         self.joints_right = joints_right
+
+         self.pad = pad
+         self.causal_shift = causal_shift
+         self.cameras = [] if cameras is None else cameras
+         self.poses_3d = [] if poses_3d is None else poses_3d
+         self.poses_2d = poses_2d
+
+     def num_frames(self):
+         count = 0
+         for p in self.poses_2d:
+             count += p.shape[0]
+         return count
+
+     def augment_enabled(self):
+         return self.augment
+
+     def set_augment(self, augment):
+         self.augment = augment
+
+     def next_epoch(self):
+         for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d):
+             batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0)
+             batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0)
+             batch_2d = np.expand_dims(np.pad(seq_2d,
+                                              ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0),
+                                               (0, 0)),
+                                              'edge'), axis=0)
+             if self.augment:
+                 # Append flipped version
+                 if batch_cam is not None:
+                     batch_cam = np.concatenate((batch_cam, batch_cam), axis=0)
+                     batch_cam[1, 2] *= -1
+                     batch_cam[1, 7] *= -1
+
+                 if batch_3d is not None:
+                     batch_3d = np.concatenate((batch_3d, batch_3d), axis=0)
+                     batch_3d[1, :, :, 0] *= -1
+                     batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :,
+                                                                                     self.joints_right + self.joints_left]
+
+                 batch_2d = np.concatenate((batch_2d, batch_2d), axis=0)
+                 batch_2d[1, :, :, 0] *= -1
+                 batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left]
+
+             yield batch_cam, batch_3d, batch_2d
+
VideoToNPZ/common/graph_utils.py ADDED
@@ -0,0 +1,45 @@
+ from __future__ import absolute_import
+
+ import torch
+ import numpy as np
+ import scipy.sparse as sp
+
+
+ def normalize(mx):
+     """Row-normalize sparse matrix"""
+     rowsum = np.array(mx.sum(1))
+     r_inv = np.power(rowsum, -1).flatten()
+     r_inv[np.isinf(r_inv)] = 0.
+     r_mat_inv = sp.diags(r_inv)
+     mx = r_mat_inv.dot(mx)
+     return mx
+
+
+ def sparse_mx_to_torch_sparse_tensor(sparse_mx):
+     """Convert a scipy sparse matrix to a torch sparse tensor."""
+     sparse_mx = sparse_mx.tocoo().astype(np.float32)
+     indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
+     values = torch.from_numpy(sparse_mx.data)
+     shape = torch.Size(sparse_mx.shape)
+     return torch.sparse.FloatTensor(indices, values, shape)
+
+
+ def adj_mx_from_edges(num_pts, edges, sparse=True):
+     edges = np.array(edges, dtype=np.int32)
+     data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1]
+     adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32)
+
+     # build symmetric adjacency matrix
+     adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx)
+     adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0]))
+     if sparse:
+         adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx)
+     else:
+         adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float)
+     return adj_mx
+
+
+ def adj_mx_from_skeleton(skeleton):
+     num_joints = skeleton.num_joints()
+     edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents())))
+     return adj_mx_from_edges(num_joints, edges, sparse=False)
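As a reference point for the normalization above, a small sketch of `adj_mx_from_edges` on a toy three-joint chain (the joint layout is hypothetical, not taken from the commit):

    from common.graph_utils import adj_mx_from_edges

    # Chain 0-1-2, written as (joint, parent) edges
    adj = adj_mx_from_edges(3, [(1, 0), (2, 1)], sparse=False)
    # After symmetrization, self-loops and row normalization, each row sums to 1;
    # the middle joint gets weight 1/3 for itself and each neighbour.
    print(adj)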
VideoToNPZ/common/loss.py ADDED
@@ -0,0 +1,90 @@
+ import torch
+ import numpy as np
+
+
+ def mpjpe(predicted, target):
+     """
+     Mean per-joint position error (i.e. mean Euclidean distance),
+     often referred to as "Protocol #1" in many papers.
+     """
+     assert predicted.shape == target.shape
+     return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1))
+
+
+ def p_mpjpe(predicted, target):
+     """
+     Pose error: MPJPE after rigid alignment (scale, rotation, and translation),
+     often referred to as "Protocol #2" in many papers.
+     """
+     assert predicted.shape == target.shape
+
+     muX = np.mean(target, axis=1, keepdims=True)
+     muY = np.mean(predicted, axis=1, keepdims=True)
+
+     X0 = target - muX
+     Y0 = predicted - muY
+
+     normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True))
+     normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True))
+
+     X0 /= normX
+     Y0 /= normY
+
+     H = np.matmul(X0.transpose(0, 2, 1), Y0)
+     U, s, Vt = np.linalg.svd(H)
+     V = Vt.transpose(0, 2, 1)
+     R = np.matmul(V, U.transpose(0, 2, 1))
+
+     # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1
+     sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1))
+     V[:, :, -1] *= sign_detR
+     s[:, -1] *= sign_detR.flatten()
+     R = np.matmul(V, U.transpose(0, 2, 1))  # Rotation
+
+     tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2)
+
+     a = tr * normX / normY  # Scale
+     t = muX - a * np.matmul(muY, R)  # Translation
+
+     # Perform rigid transformation on the input
+     predicted_aligned = a * np.matmul(predicted, R) + t
+
+     # Return MPJPE
+     return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1))
+
+
+ def euclidean_losses(actual, target):
+     """Calculate the average Euclidean loss for multi-point samples.
+
+     Each sample must contain `n` points, each with `d` dimensions. For example,
+     in the MPII human pose estimation task n=16 (16 joint locations) and
+     d=2 (locations are 2D).
+
+     Args:
+         actual (Tensor): Predictions (B x L x D)
+         target (Tensor): Ground truth target (B x L x D)
+     """
+
+     assert actual.size() == target.size(), 'input tensors must have the same size'
+
+     # Calculate Euclidean distances between actual and target locations
+     diff = actual - target
+     dist_sq = diff.pow(2).sum(-1, keepdim=False)
+     dist = dist_sq.sqrt()
+     return dist
+
+
+ def pck(actual, expected, threshold=150):
+     dists = euclidean_losses(actual, expected)
+     return (dists < threshold).double().mean().item()
+
+
+ def auc(actual, expected):
+     # This range of thresholds mimics `mpii_compute_3d_pck.m`, which is provided as part of the
+     # MPI-INF-3DHP test data release.
+     thresholds = torch.linspace(0, 150, 31).tolist()
+
+     pck_values = torch.DoubleTensor(len(thresholds))
+     for i, threshold in enumerate(thresholds):
+         pck_values[i] = pck(actual, expected, threshold=threshold)
+     return pck_values.mean().item()
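A minimal, self-contained check of `mpjpe` on dummy tensors (the batch shape and the 0.05 offset are illustrative only):

    import torch
    from common.loss import mpjpe

    pred = torch.zeros(8, 17, 3)       # (batch, joints, xyz)
    target = torch.zeros(8, 17, 3)
    target[..., 0] = 0.05              # shift every joint 0.05 along x

    print(mpjpe(pred, target).item())  # mean Euclidean error -> 0.05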
VideoToNPZ/common/quaternion.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+
+
+ def qort(q, v):
+     """
+     Rotate vector(s) v about the rotation described by quaternion(s) q.
+     Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
+     where * denotes any number of dimensions.
+     Returns a tensor of shape (*, 3).
+     """
+     assert q.shape[-1] == 4
+     assert v.shape[-1] == 3
+     assert q.shape[:-1] == v.shape[:-1]
+
+     qvec = q[..., 1:]
+     uv = torch.cross(qvec, v, dim=len(q.shape)-1)
+     uuv = torch.cross(qvec, uv, dim=len(q.shape)-1)
+     return v + 2 * (q[..., :1] * uv + uuv)
+
+
+ def qinverse(q, inplace=False):
+     # We assume the quaternion to be normalized
+     """
+     The quaternions provided in the code map the camera coordinates to the world coordinates.
+     Therefore, the quaternion from the world coordinates to the camera coordinates is the conjugate of the
+     quaternion from the camera coordinates to the world coordinates. The precondition is that the quaternion
+     is a unit quaternion, so the inverse of the quaternion is equal to its conjugate.
+     """
+     if inplace:
+         q[..., 1:] *= -1
+         return q
+     else:
+         w = q[..., :1]
+         xyz = q[..., 1:]
+         return torch.cat((w, -xyz), dim=len(q.shape)-1)
+
VideoToNPZ/common/skeleton.py ADDED
@@ -0,0 +1,81 @@
+ import numpy as np
+
+
+ class Skeleton:
+     def __init__(self, parents, joints_left, joints_right):
+         assert len(joints_left) == len(joints_right)
+
+         self._parents = parents
+         self._joints_left = joints_left
+         self._joints_right = joints_right
+
+     def num_joints(self):
+         return len(self._parents)
+
+     def parents(self):
+         return self._parents
+
+     def has_children(self):
+         return self._has_children
+
+     def children(self):
+         return self._children
+
+     def remove_joints(self, joints_to_remove):
+         """
+         Remove the joints specified in 'joints_to_remove'.
+         """
+         valid_joints = []
+         for joint in range(len(self._parents)):
+             if joint not in joints_to_remove:
+                 valid_joints.append(joint)
+
+         for i in range(len(self._parents)):
+             while self._parents[i] in joints_to_remove:
+                 self._parents[i] = self._parents[self._parents[i]]
+
+         index_offsets = np.zeros(len(self._parents), dtype=int)
+         new_parents = []
+         for i, parent in enumerate(self._parents):
+             if i not in joints_to_remove:
+                 new_parents.append(parent - index_offsets[parent])
+             else:
+                 index_offsets[i:] += 1
+         self._parents = np.array(new_parents)
+
+         if self._joints_left is not None:
+             new_joints_left = []
+             for joint in self._joints_left:
+                 if joint in valid_joints:
+                     new_joints_left.append(joint - index_offsets[joint])
+             self._joints_left = new_joints_left
+
+         if self._joints_right is not None:
+             new_joints_right = []
+             for joint in self._joints_right:
+                 if joint in valid_joints:
+                     new_joints_right.append(joint - index_offsets[joint])
+             self._joints_right = new_joints_right
+
+         self._compute_metadata()
+
+         return valid_joints
+
+     def joints_left(self):
+         return self._joints_left
+
+     def joints_right(self):
+         return self._joints_right
+
+     def _compute_metadata(self):
+         self._has_children = np.zeros(len(self._parents)).astype(bool)
+         for i, parent in enumerate(self._parents):
+             if parent != -1:
+                 self._has_children[parent] = True
+
+         self._children = []
+         for parents in enumerate(self._parents):
+             self._children.append([])
+         for i, parent in enumerate(self._parents):
+             if parent != -1:
+                 self._children[parent].append(i)
VideoToNPZ/data/data_utils.py ADDED
@@ -0,0 +1,95 @@
+ import numpy as np
+ import h5py
+
+ mpii_metadata = {
+     'layout_name': 'mpii',
+     'num_joints': 16,
+     'keypoints_symmetry': [
+         [3, 4, 5, 13, 14, 15],
+         [0, 1, 2, 10, 11, 12],
+     ]
+ }
+
+ coco_metadata = {
+     'layout_name': 'coco',
+     'num_joints': 17,
+     'keypoints_symmetry': [
+         [1, 3, 5, 7, 9, 11, 13, 15],
+         [2, 4, 6, 8, 10, 12, 14, 16],
+     ]
+ }
+
+ h36m_metadata = {
+     'layout_name': 'h36m',
+     'num_joints': 17,
+     'keypoints_symmetry': [
+         [4, 5, 6, 11, 12, 13],
+         [1, 2, 3, 14, 15, 16],
+     ]
+ }
+
+ humaneva15_metadata = {
+     'layout_name': 'humaneva15',
+     'num_joints': 15,
+     'keypoints_symmetry': [
+         [2, 3, 4, 8, 9, 10],
+         [5, 6, 7, 11, 12, 13]
+     ]
+ }
+
+ humaneva20_metadata = {
+     'layout_name': 'humaneva20',
+     'num_joints': 20,
+     'keypoints_symmetry': [
+         [3, 4, 5, 6, 11, 12, 13, 14],
+         [7, 8, 9, 10, 15, 16, 17, 18]
+     ]
+ }
+
+ def suggest_metadata(name):
+     names = []
+     for metadata in [mpii_metadata, coco_metadata, h36m_metadata, humaneva15_metadata, humaneva20_metadata]:
+         if metadata['layout_name'] in name:
+             return metadata
+         names.append(metadata['layout_name'])
+     raise KeyError('Cannot infer keypoint layout from name "{}". Tried {}.'.format(name, names))
+
+ def import_detectron_poses(path):
+     # Latin1 encoding because Detectron runs on Python 2.7
+     data = np.load(path, encoding='latin1')
+     kp = data['keypoints']
+     bb = data['boxes']
+     results = []
+     for i in range(len(bb)):
+         if len(bb[i][1]) == 0:
+             assert i > 0
+             # Use last pose in case of detection failure
+             results.append(results[-1])
+             continue
+         best_match = np.argmax(bb[i][1][:, 4])
+         keypoints = kp[i][1][best_match].T.copy()
+         results.append(keypoints)
+     results = np.array(results)
+     return results[:, :, 4:6]  # Soft-argmax
+     # return results[:, :, [0, 1, 3]]  # Argmax + score
+
+
+ def import_cpn_poses(path):
+     data = np.load(path)
+     kp = data['keypoints']
+     return kp[:, :, :2]
+
+
+ def import_sh_poses(path):
+     with h5py.File(path) as hf:
+         positions = hf['poses'].value
+     return positions.astype('float32')
+
+ def suggest_pose_importer(name):
+     if 'detectron' in name:
+         return import_detectron_poses
+     if 'cpn' in name:
+         return import_cpn_poses
+     if 'sh' in name:
+         return import_sh_poses
+     raise KeyError('Cannot infer keypoint format from name "{}". Tried detectron, cpn, sh.'.format(name))
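Both `suggest_*` helpers dispatch on substrings of the keypoints name; a short usage sketch, assuming it is run from the VideoToNPZ directory:

    from data.data_utils import suggest_metadata, suggest_pose_importer

    meta = suggest_metadata('cpn_ft_h36m_dbb')            # 'h36m' substring -> 17-joint layout
    importer = suggest_pose_importer('cpn_ft_h36m_dbb')   # 'cpn' substring -> import_cpn_poses
    print(meta['num_joints'], importer.__name__)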
VideoToNPZ/gen_skes.py ADDED
@@ -0,0 +1,116 @@
+ import torch
+ import sys
+ import os.path as osp
+ import os
+ import argparse
+ import cv2
+ import time
+ import h5py
+ from tqdm import tqdm
+ import numpy as np
+ import warnings
+ import signal
+
+ warnings.filterwarnings('ignore')
+
+ sys.path.insert(0, osp.dirname(osp.realpath(__file__)))
+ from tools.utils import get_path
+ from model.gast_net import SpatioTemporalModel, SpatioTemporalModelOptimized1f
+ from common.skeleton import Skeleton
+ from common.graph_utils import adj_mx_from_skeleton
+ from common.generators import *
+ from tools.preprocess import load_kpts_json, h36m_coco_format, revise_kpts, revise_skes
+ from tools.inference import gen_pose
+ from tools.vis_kpts import plot_keypoint
+
+ cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__)
+ model_dir = chk_root + 'gastnet/'
+ sys.path.insert(1, lib_root)
+ from lib.pose import gen_video_kpts as hrnet_pose
+ sys.path.pop(1)
+ sys.path.pop(0)
+
+ skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
+                     joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16])
+ adj = adj_mx_from_skeleton(skeleton)
+
+ joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]
+ kps_left, kps_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]
+
+ # Set up signal handler for keyboard interrupt
+ def signal_handler(sig, frame):
+     print("\nInterrupted by user, shutting down...")
+     if 'pool' in locals() and pool is not None:
+         pool.terminate()
+         pool.join()
+     sys.exit(0)
+
+ signal.signal(signal.SIGINT, signal_handler)
+
+ def load_model_layer():
+     chk = model_dir + '81_frame_model.bin'
+     filters_width = [3, 3, 3, 3]
+     channels = 64
+
+     model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filters_width, channels=channels, dropout=0.05)
+
+     checkpoint = torch.load(chk)
+     model_pos.load_state_dict(checkpoint['model_pos'])
+
+     if torch.cuda.is_available():
+         model_pos = model_pos.cuda()
+     model_pos = model_pos.eval()
+
+     return model_pos
+
+ def generate_skeletons(video=''):
+     cap = cv2.VideoCapture(video)
+     width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
+     height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
+
+     keypoints, scores = hrnet_pose(video, det_dim=416, gen_output=True)
+     keypoints, scores, valid_frames = h36m_coco_format(keypoints, scores)
+     re_kpts = revise_kpts(keypoints, scores, valid_frames)
+     num_person = len(re_kpts)
+
+     model_pos = load_model_layer()
+
+     pad = (81 - 1) // 2
+     causal_shift = 0
+
+     prediction = gen_pose(re_kpts, valid_frames, width, height, model_pos, pad, causal_shift)
+
+     print('Recording 3D Pose:')
+
+     # Add a loading bar
+     for i in tqdm(range(100)):
+         time.sleep(0.01)
+
+     # Create output directory with absolute path
+     output_dir = os.path.abspath('../outputs/')
+     print(f"Creating output directory: {output_dir}")
+     os.makedirs(output_dir, exist_ok=True)
+
+     npz_dir = os.path.join(output_dir, 'npz')
+     print(f"Creating NPZ directory: {npz_dir}")
+     os.makedirs(npz_dir, exist_ok=True)
+
+     output_npz = os.path.join(npz_dir, os.path.basename(video).split('.')[0] + '.npz')
+     print(f"Saving NPZ to: {output_npz}")
+     np.savez_compressed(output_npz, reconstruction=prediction)
+     print(f"NPZ saved successfully: {output_npz}")
+
+ def arg_parse():
+     parser = argparse.ArgumentParser('Generating skeleton demo.')
+     parser.add_argument('-v', '--video', type=str)
+     args = parser.parse_args()
+     return args
+
+ if __name__ == "__main__":
+     args = arg_parse()
+     # Use the video path as-is if absolute, otherwise prepend data_root
+     if os.path.isabs(args.video):
+         video_path = args.video
+     else:
+         video_path = os.path.join(data_root, 'video', args.video)
+     generate_skeletons(video=video_path)
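The script stores the 3D reconstruction under the `reconstruction` key, so the generated file can be read back with a few lines (the path follows the naming in generate_skeletons above; the video name is illustrative):

    import numpy as np

    data = np.load('../outputs/npz/baseball.npz', allow_pickle=True)
    prediction = data['reconstruction']   # per-person 3D poses, typically (frames, 17, 3) each
    print(len(prediction))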
VideoToNPZ/lib/detector/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import sys
+ import os.path as osp
+
+ sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), 'yolov3'))
+ from human_detector import yolo_human_det, load_model
+ sys.path.pop(0)
VideoToNPZ/lib/detector/yolov3/__init__.py ADDED
File without changes
VideoToNPZ/lib/detector/yolov3/bbox.py ADDED
@@ -0,0 +1,111 @@
+ from __future__ import division
+
+ import torch
+ import random
+ import numpy as np
+ import cv2
+
+
+ def confidence_filter(result, confidence):
+     conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2)
+     result = result*conf_mask
+
+     return result
+
+
+ def confidence_filter_cls(result, confidence):
+     max_scores = torch.max(result[:,:,5:25], 2)[0]
+     res = torch.cat((result, max_scores),2)
+     print(res.shape)
+
+
+     cond_1 = (res[:,:,4] > confidence).float()
+     cond_2 = (res[:,:,25] > 0.995).float()
+
+     conf = cond_1 + cond_2
+     conf = torch.clamp(conf, 0.0, 1.0)
+     conf = conf.unsqueeze(2)
+     result = result*conf
+     return result
+
+
+ def get_abs_coord(box):
+     box[2], box[3] = abs(box[2]), abs(box[3])
+     x1 = (box[0] - box[2]/2) - 1
+     y1 = (box[1] - box[3]/2) - 1
+     x2 = (box[0] + box[2]/2) - 1
+     y2 = (box[1] + box[3]/2) - 1
+     return x1, y1, x2, y2
+
+
+ def sanity_fix(box):
+     if (box[0] > box[2]):
+         box[0], box[2] = box[2], box[0]
+
+     if (box[1] > box[3]):
+         box[1], box[3] = box[3], box[1]
+
+     return box
+
+
+ def bbox_iou(box1, box2):
+     """
+     Returns the IoU of two bounding boxes
+
+     """
+     # Get the coordinates of bounding boxes
+     b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+     b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+     # get the coordinates of the intersection rectangle
+     inter_rect_x1 = torch.max(b1_x1, b2_x1)
+     inter_rect_y1 = torch.max(b1_y1, b2_y1)
+     inter_rect_x2 = torch.min(b1_x2, b2_x2)
+     inter_rect_y2 = torch.min(b1_y2, b2_y2)
+
+     # Intersection area
+     if torch.cuda.is_available():
+         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
+     else:
+         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))
+
+     # Union Area
+     b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
+     b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
+
+     iou = inter_area / (b1_area + b2_area - inter_area)
+
+     return iou
+
+
+ def pred_corner_coord(prediction):
+     # Get indices of non-zero confidence bboxes
+     ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
+
+     box = prediction[ind_nz[0], ind_nz[1]]
+
+     box_a = box.new(box.shape)
+     box_a[:,0] = (box[:,0] - box[:,2]/2)
+     box_a[:,1] = (box[:,1] - box[:,3]/2)
+     box_a[:,2] = (box[:,0] + box[:,2]/2)
+     box_a[:,3] = (box[:,1] + box[:,3]/2)
+     box[:,:4] = box_a[:,:4]
+
+     prediction[ind_nz[0], ind_nz[1]] = box
+
+     return prediction
+
+
+ def write(x, batches, results, colors, classes):
+     c1 = tuple(x[1:3].int())
+     c2 = tuple(x[3:5].int())
+     img = results[int(x[0])]
+     cls = int(x[-1])
+     label = "{0}".format(classes[cls])
+     color = random.choice(colors)
+     cv2.rectangle(img, c1, c2, color, 1)
+     t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
+     c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
+     cv2.rectangle(img, c1, c2, color, -1)
+     cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
+     return img
VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg ADDED
@@ -0,0 +1,134 @@
+ [net]
+ batch=64
+ subdivisions=8
+ width=416
+ height=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ max_batches = 40200
+ policy=steps
+ steps=-1,100,20000,30000
+ scales=.1,10,.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=16
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=1
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ ###########
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=125
+ activation=linear
+
+ [region]
+ anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
+ bias_match=1
+ classes=20
+ coords=4
+ num=5
+ softmax=1
+ jitter=.2
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg ADDED
@@ -0,0 +1,258 @@
+ [net]
+ # Testing
+ batch=64
+ subdivisions=8
+ # Training
+ # batch=64
+ # subdivisions=8
+ height=416
+ width=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 80200
+ policy=steps
+ steps=-1,500,40000,60000
+ scales=0.1,10,.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+
+ #######
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [route]
+ layers=-9
+
+ [convolutional]
+ batch_normalize=1
+ size=1
+ stride=1
+ pad=1
+ filters=64
+ activation=leaky
+
+ [reorg]
+ stride=2
+
+ [route]
+ layers=-1,-4
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=125
+ activation=linear
+
+
+ [region]
+ anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
+ bias_match=1
+ classes=20
+ coords=4
+ num=5
+ softmax=1
+ jitter=.3
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg ADDED
@@ -0,0 +1,258 @@
+ [net]
+ # Testing
+ batch=1
+ subdivisions=1
+ # Training
+ # batch=64
+ # subdivisions=8
+ width=416
+ height=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 500200
+ policy=steps
+ steps=400000,450000
+ scales=.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+
+ #######
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [route]
+ layers=-9
+
+ [convolutional]
+ batch_normalize=1
+ size=1
+ stride=1
+ pad=1
+ filters=64
+ activation=leaky
+
+ [reorg]
+ stride=2
+
+ [route]
+ layers=-1,-4
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=425
+ activation=linear
+
+
+ [region]
+ anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
+ bias_match=1
+ classes=80
+ coords=4
+ num=5
+ softmax=1
+ jitter=.3
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg ADDED
@@ -0,0 +1,789 @@
+ [net]
+ # Testing
+ batch=1
+ subdivisions=1
+ # Training
+ # batch=64
+ # subdivisions=16
+ width= 320
+ height = 320
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 500200
+ policy=steps
+ steps=400000,450000
+ scales=.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
203
+
204
+ [convolutional]
205
+ batch_normalize=1
206
+ filters=128
207
+ size=1
208
+ stride=1
209
+ pad=1
210
+ activation=leaky
211
+
212
+ [convolutional]
213
+ batch_normalize=1
214
+ filters=256
215
+ size=3
216
+ stride=1
217
+ pad=1
218
+ activation=leaky
219
+
220
+ [shortcut]
221
+ from=-3
222
+ activation=linear
223
+
224
+ [convolutional]
225
+ batch_normalize=1
226
+ filters=128
227
+ size=1
228
+ stride=1
229
+ pad=1
230
+ activation=leaky
231
+
232
+ [convolutional]
233
+ batch_normalize=1
234
+ filters=256
235
+ size=3
236
+ stride=1
237
+ pad=1
238
+ activation=leaky
239
+
240
+ [shortcut]
241
+ from=-3
242
+ activation=linear
243
+
244
+ [convolutional]
245
+ batch_normalize=1
246
+ filters=128
247
+ size=1
248
+ stride=1
249
+ pad=1
250
+ activation=leaky
251
+
252
+ [convolutional]
253
+ batch_normalize=1
254
+ filters=256
255
+ size=3
256
+ stride=1
257
+ pad=1
258
+ activation=leaky
259
+
260
+ [shortcut]
261
+ from=-3
262
+ activation=linear
263
+
264
+ [convolutional]
265
+ batch_normalize=1
266
+ filters=128
267
+ size=1
268
+ stride=1
269
+ pad=1
270
+ activation=leaky
271
+
272
+ [convolutional]
273
+ batch_normalize=1
274
+ filters=256
275
+ size=3
276
+ stride=1
277
+ pad=1
278
+ activation=leaky
279
+
280
+ [shortcut]
281
+ from=-3
282
+ activation=linear
283
+
284
+ # Downsample
285
+
286
+ [convolutional]
287
+ batch_normalize=1
288
+ filters=512
289
+ size=3
290
+ stride=2
291
+ pad=1
292
+ activation=leaky
293
+
294
+ [convolutional]
295
+ batch_normalize=1
296
+ filters=256
297
+ size=1
298
+ stride=1
299
+ pad=1
300
+ activation=leaky
301
+
302
+ [convolutional]
303
+ batch_normalize=1
304
+ filters=512
305
+ size=3
306
+ stride=1
307
+ pad=1
308
+ activation=leaky
309
+
310
+ [shortcut]
311
+ from=-3
312
+ activation=linear
313
+
314
+
315
+ [convolutional]
316
+ batch_normalize=1
317
+ filters=256
318
+ size=1
319
+ stride=1
320
+ pad=1
321
+ activation=leaky
322
+
323
+ [convolutional]
324
+ batch_normalize=1
325
+ filters=512
326
+ size=3
327
+ stride=1
328
+ pad=1
329
+ activation=leaky
330
+
331
+ [shortcut]
332
+ from=-3
333
+ activation=linear
334
+
335
+
336
+ [convolutional]
337
+ batch_normalize=1
338
+ filters=256
339
+ size=1
340
+ stride=1
341
+ pad=1
342
+ activation=leaky
343
+
344
+ [convolutional]
345
+ batch_normalize=1
346
+ filters=512
347
+ size=3
348
+ stride=1
349
+ pad=1
350
+ activation=leaky
351
+
352
+ [shortcut]
353
+ from=-3
354
+ activation=linear
355
+
356
+
357
+ [convolutional]
358
+ batch_normalize=1
359
+ filters=256
360
+ size=1
361
+ stride=1
362
+ pad=1
363
+ activation=leaky
364
+
365
+ [convolutional]
366
+ batch_normalize=1
367
+ filters=512
368
+ size=3
369
+ stride=1
370
+ pad=1
371
+ activation=leaky
372
+
373
+ [shortcut]
374
+ from=-3
375
+ activation=linear
376
+
377
+ [convolutional]
378
+ batch_normalize=1
379
+ filters=256
380
+ size=1
381
+ stride=1
382
+ pad=1
383
+ activation=leaky
384
+
385
+ [convolutional]
386
+ batch_normalize=1
387
+ filters=512
388
+ size=3
389
+ stride=1
390
+ pad=1
391
+ activation=leaky
392
+
393
+ [shortcut]
394
+ from=-3
395
+ activation=linear
396
+
397
+
398
+ [convolutional]
399
+ batch_normalize=1
400
+ filters=256
401
+ size=1
402
+ stride=1
403
+ pad=1
404
+ activation=leaky
405
+
406
+ [convolutional]
407
+ batch_normalize=1
408
+ filters=512
409
+ size=3
410
+ stride=1
411
+ pad=1
412
+ activation=leaky
413
+
414
+ [shortcut]
415
+ from=-3
416
+ activation=linear
417
+
418
+
419
+ [convolutional]
420
+ batch_normalize=1
421
+ filters=256
422
+ size=1
423
+ stride=1
424
+ pad=1
425
+ activation=leaky
426
+
427
+ [convolutional]
428
+ batch_normalize=1
429
+ filters=512
430
+ size=3
431
+ stride=1
432
+ pad=1
433
+ activation=leaky
434
+
435
+ [shortcut]
436
+ from=-3
437
+ activation=linear
438
+
439
+ [convolutional]
440
+ batch_normalize=1
441
+ filters=256
442
+ size=1
443
+ stride=1
444
+ pad=1
445
+ activation=leaky
446
+
447
+ [convolutional]
448
+ batch_normalize=1
449
+ filters=512
450
+ size=3
451
+ stride=1
452
+ pad=1
453
+ activation=leaky
454
+
455
+ [shortcut]
456
+ from=-3
457
+ activation=linear
458
+
459
+ # Downsample
460
+
461
+ [convolutional]
462
+ batch_normalize=1
463
+ filters=1024
464
+ size=3
465
+ stride=2
466
+ pad=1
467
+ activation=leaky
468
+
469
+ [convolutional]
470
+ batch_normalize=1
471
+ filters=512
472
+ size=1
473
+ stride=1
474
+ pad=1
475
+ activation=leaky
476
+
477
+ [convolutional]
478
+ batch_normalize=1
479
+ filters=1024
480
+ size=3
481
+ stride=1
482
+ pad=1
483
+ activation=leaky
484
+
485
+ [shortcut]
486
+ from=-3
487
+ activation=linear
488
+
489
+ [convolutional]
490
+ batch_normalize=1
491
+ filters=512
492
+ size=1
493
+ stride=1
494
+ pad=1
495
+ activation=leaky
496
+
497
+ [convolutional]
498
+ batch_normalize=1
499
+ filters=1024
500
+ size=3
501
+ stride=1
502
+ pad=1
503
+ activation=leaky
504
+
505
+ [shortcut]
506
+ from=-3
507
+ activation=linear
508
+
509
+ [convolutional]
510
+ batch_normalize=1
511
+ filters=512
512
+ size=1
513
+ stride=1
514
+ pad=1
515
+ activation=leaky
516
+
517
+ [convolutional]
518
+ batch_normalize=1
519
+ filters=1024
520
+ size=3
521
+ stride=1
522
+ pad=1
523
+ activation=leaky
524
+
525
+ [shortcut]
526
+ from=-3
527
+ activation=linear
528
+
529
+ [convolutional]
530
+ batch_normalize=1
531
+ filters=512
532
+ size=1
533
+ stride=1
534
+ pad=1
535
+ activation=leaky
536
+
537
+ [convolutional]
538
+ batch_normalize=1
539
+ filters=1024
540
+ size=3
541
+ stride=1
542
+ pad=1
543
+ activation=leaky
544
+
545
+ [shortcut]
546
+ from=-3
547
+ activation=linear
548
+
549
+ ######################
550
+
551
+ [convolutional]
552
+ batch_normalize=1
553
+ filters=512
554
+ size=1
555
+ stride=1
556
+ pad=1
557
+ activation=leaky
558
+
559
+ [convolutional]
560
+ batch_normalize=1
561
+ size=3
562
+ stride=1
563
+ pad=1
564
+ filters=1024
565
+ activation=leaky
566
+
567
+ [convolutional]
568
+ batch_normalize=1
569
+ filters=512
570
+ size=1
571
+ stride=1
572
+ pad=1
573
+ activation=leaky
574
+
575
+ [convolutional]
576
+ batch_normalize=1
577
+ size=3
578
+ stride=1
579
+ pad=1
580
+ filters=1024
581
+ activation=leaky
582
+
583
+ [convolutional]
584
+ batch_normalize=1
585
+ filters=512
586
+ size=1
587
+ stride=1
588
+ pad=1
589
+ activation=leaky
590
+
591
+ [convolutional]
592
+ batch_normalize=1
593
+ size=3
594
+ stride=1
595
+ pad=1
596
+ filters=1024
597
+ activation=leaky
598
+
599
+ [convolutional]
600
+ size=1
601
+ stride=1
602
+ pad=1
603
+ filters=255
604
+ activation=linear
605
+
606
+
607
+ [yolo]
608
+ mask = 6,7,8
609
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610
+ classes=80
611
+ num=9
612
+ jitter=.3
613
+ ignore_thresh = .5
614
+ truth_thresh = 1
615
+ random=1
616
+
617
+
618
+ [route]
619
+ layers = -4
620
+
621
+ [convolutional]
622
+ batch_normalize=1
623
+ filters=256
624
+ size=1
625
+ stride=1
626
+ pad=1
627
+ activation=leaky
628
+
629
+ [upsample]
630
+ stride=2
631
+
632
+ [route]
633
+ layers = -1, 61
634
+
635
+
636
+
637
+ [convolutional]
638
+ batch_normalize=1
639
+ filters=256
640
+ size=1
641
+ stride=1
642
+ pad=1
643
+ activation=leaky
644
+
645
+ [convolutional]
646
+ batch_normalize=1
647
+ size=3
648
+ stride=1
649
+ pad=1
650
+ filters=512
651
+ activation=leaky
652
+
653
+ [convolutional]
654
+ batch_normalize=1
655
+ filters=256
656
+ size=1
657
+ stride=1
658
+ pad=1
659
+ activation=leaky
660
+
661
+ [convolutional]
662
+ batch_normalize=1
663
+ size=3
664
+ stride=1
665
+ pad=1
666
+ filters=512
667
+ activation=leaky
668
+
669
+ [convolutional]
670
+ batch_normalize=1
671
+ filters=256
672
+ size=1
673
+ stride=1
674
+ pad=1
675
+ activation=leaky
676
+
677
+ [convolutional]
678
+ batch_normalize=1
679
+ size=3
680
+ stride=1
681
+ pad=1
682
+ filters=512
683
+ activation=leaky
684
+
685
+ [convolutional]
686
+ size=1
687
+ stride=1
688
+ pad=1
689
+ filters=255
690
+ activation=linear
691
+
692
+
693
+ [yolo]
694
+ mask = 3,4,5
695
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696
+ classes=80
697
+ num=9
698
+ jitter=.3
699
+ ignore_thresh = .5
700
+ truth_thresh = 1
701
+ random=1
702
+
703
+
704
+
705
+ [route]
706
+ layers = -4
707
+
708
+ [convolutional]
709
+ batch_normalize=1
710
+ filters=128
711
+ size=1
712
+ stride=1
713
+ pad=1
714
+ activation=leaky
715
+
716
+ [upsample]
717
+ stride=2
718
+
719
+ [route]
720
+ layers = -1, 36
721
+
722
+
723
+
724
+ [convolutional]
725
+ batch_normalize=1
726
+ filters=128
727
+ size=1
728
+ stride=1
729
+ pad=1
730
+ activation=leaky
731
+
732
+ [convolutional]
733
+ batch_normalize=1
734
+ size=3
735
+ stride=1
736
+ pad=1
737
+ filters=256
738
+ activation=leaky
739
+
740
+ [convolutional]
741
+ batch_normalize=1
742
+ filters=128
743
+ size=1
744
+ stride=1
745
+ pad=1
746
+ activation=leaky
747
+
748
+ [convolutional]
749
+ batch_normalize=1
750
+ size=3
751
+ stride=1
752
+ pad=1
753
+ filters=256
754
+ activation=leaky
755
+
756
+ [convolutional]
757
+ batch_normalize=1
758
+ filters=128
759
+ size=1
760
+ stride=1
761
+ pad=1
762
+ activation=leaky
763
+
764
+ [convolutional]
765
+ batch_normalize=1
766
+ size=3
767
+ stride=1
768
+ pad=1
769
+ filters=256
770
+ activation=leaky
771
+
772
+ [convolutional]
773
+ size=1
774
+ stride=1
775
+ pad=1
776
+ filters=255
777
+ activation=linear
778
+
779
+
780
+ [yolo]
781
+ mask = 0,1,2
782
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783
+ classes=80
784
+ num=9
785
+ jitter=.3
786
+ ignore_thresh = .5
787
+ truth_thresh = 1
788
+ random=1
789
+
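Note: this cfg is the file parsed by parse_cfg() in darknet.py (added below). A minimal sanity-check sketch, assuming darknet.py is importable and with an illustrative path, that counts the block types after parsing:

# Hedged sanity-check sketch: parse the cfg above and count block types.
# Assumes darknet.py (added below in this commit) is importable; adjust the path to your checkout.
from collections import Counter
from darknet import parse_cfg

blocks = parse_cfg("VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg")
print(Counter(b["type"] for b in blocks))
# For the standard YOLOv3 layout this is roughly 75 convolutional, 23 shortcut,
# 4 route, 2 upsample and 3 yolo blocks, plus the single [net] block.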
VideoToNPZ/lib/detector/yolov3/darknet.py ADDED
@@ -0,0 +1,433 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ import cv2
8
+ import os
9
+ import sys
10
+
11
+ from util import convert2cpu as cpu
12
+ from util import predict_transform
13
+
14
+
15
+ class test_net(nn.Module):
16
+ def __init__(self, num_layers, input_size):
17
+ super(test_net, self).__init__()
18
+ self.num_layers= num_layers
19
+ self.linear_1 = nn.Linear(input_size, 5)
20
+ self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)])
21
+ self.output = nn.Linear(5,2)
22
+
23
+ def forward(self, x):
24
+ x = x.view(-1)
25
+ fwd = nn.Sequential(self.linear_1, *self.middle, self.output)
26
+ return fwd(x)
27
+
28
+
29
+ def get_test_input():
30
+ img = cv2.imread("dog-cycle-car.png")
31
+ img = cv2.resize(img, (416, 416))
32
+ img_ = img[:, :, ::-1].transpose((2, 0, 1))
33
+ img_ = img_[np.newaxis, :, :, :]/255.0
34
+ img_ = torch.from_numpy(img_).float()
35
+ return img_
36
+
37
+
38
+ def parse_cfg(cfgfile):
39
+ """
40
+ Takes a configuration file
41
+
42
+ Returns a list of blocks. Each blocks describes a block in the neural
43
+ network to be built. Block is represented as a dictionary in the list
44
+
45
+ """
46
+ # cfgfile = os.path.join(sys.path[-1], cfgfile)
47
+ file = open(cfgfile, 'r')
48
+ lines = file.read().split('\n') # store the lines in a list
49
+ lines = [x for x in lines if len(x) > 0] # get rid of the empty lines
50
+ lines = [x for x in lines if x[0] != '#']
51
+ lines = [x.rstrip().lstrip() for x in lines]
52
+
53
+ block = {}
54
+ blocks = []
55
+
56
+ for line in lines:
57
+ if line[0] == "[": # This marks the start of a new block
58
+ if len(block) != 0:
59
+ blocks.append(block)
60
+ block = {}
61
+ block["type"] = line[1:-1].rstrip()
62
+ else:
63
+ key,value = line.split("=")
64
+ block[key.rstrip()] = value.lstrip()
65
+ blocks.append(block)
66
+
67
+ return blocks
68
+
69
+
70
+ class MaxPoolStride1(nn.Module):
71
+ def __init__(self, kernel_size):
72
+ super(MaxPoolStride1, self).__init__()
73
+ self.kernel_size = kernel_size
74
+ self.pad = kernel_size - 1
75
+
76
+ def forward(self, x):
77
+ padded_x = F.pad(x, (0, self.pad, 0, self.pad), mode="replicate")
78
+ pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x)
79
+ return pooled_x
80
+
81
+
82
+ class EmptyLayer(nn.Module):
83
+ def __init__(self):
84
+ super(EmptyLayer, self).__init__()
85
+
86
+
87
+ class DetectionLayer(nn.Module):
88
+ def __init__(self, anchors):
89
+ super(DetectionLayer, self).__init__()
90
+ self.anchors = anchors
91
+
92
+ def forward(self, x, inp_dim, num_classes, confidence):
93
+ x = x.data
94
+ global CUDA
95
+ prediction = x
96
+ prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA)
97
+ return prediction
98
+
99
+
100
+ class Upsample(nn.Module):
101
+ def __init__(self, stride=2):
102
+ super(Upsample, self).__init__()
103
+ self.stride = stride
104
+
105
+ def forward(self, x):
106
+ stride = self.stride
107
+ assert(x.data.dim() == 4)
108
+ B = x.data.size(0)
109
+ C = x.data.size(1)
110
+ H = x.data.size(2)
111
+ W = x.data.size(3)
112
+ ws = stride
113
+ hs = stride
114
+ x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride)
115
+ return x
116
+
117
+
118
+ class ReOrgLayer(nn.Module):
119
+ def __init__(self, stride=2):
120
+ super(ReOrgLayer, self).__init__()
121
+ self.stride= stride
122
+
123
+ def forward(self, x):
124
+ assert(x.data.dim() == 4)
125
+ B, C, H, W = x.data.shape
126
+ hs = self.stride
127
+ ws = self.stride
128
+ assert(H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H)
129
+ assert(W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of width " + str(W)
130
+ x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(-2, -3).contiguous()
131
+ x = x.view(B, C, H // hs * W // ws, hs, ws)
132
+ x = x.view(B, C, H // hs * W // ws, hs*ws).transpose(-1, -2).contiguous()
133
+ x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1, 2).contiguous()
134
+ x = x.view(B, C*ws*hs, H // ws, W // ws)
135
+ return x
136
+
137
+
138
+ def create_modules(blocks):
139
+ net_info = blocks[0] # Captures the information about the input and pre-processing
140
+
141
+ module_list = nn.ModuleList()
142
+
143
+ index = 0 # indexing blocks helps with implementing route layers (skip connections)
144
+ prev_filters = 3
145
+ output_filters = []
146
+
147
+ for x in blocks:
148
+ module = nn.Sequential()
149
+ if x["type"] == "net":
150
+ continue
151
+
152
+ # If it's a convolutional layer
153
+ if x["type"] == "convolutional":
154
+ # Get the info about the layer
155
+ activation = x["activation"]
156
+ try:
157
+ batch_normalize = int(x["batch_normalize"])
158
+ bias = False
159
+ except:
160
+ batch_normalize = 0
161
+ bias = True
162
+
163
+ filters= int(x["filters"])
164
+ padding = int(x["pad"])
165
+ kernel_size = int(x["size"])
166
+ stride = int(x["stride"])
167
+
168
+ if padding:
169
+ pad = (kernel_size - 1) // 2
170
+ else:
171
+ pad = 0
172
+
173
+ # Add the convolutional layer
174
+ conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
175
+ module.add_module("conv_{0}".format(index), conv)
176
+
177
+ # Add the Batch Norm Layer
178
+ if batch_normalize:
179
+ bn = nn.BatchNorm2d(filters)
180
+ module.add_module("batch_norm_{0}".format(index), bn)
181
+
182
+ # Check the activation.
183
+ # It is either Linear or a Leaky ReLU for YOLO
184
+ if activation == "leaky":
185
+ activn = nn.LeakyReLU(0.1, inplace = True)
186
+ module.add_module("leaky_{0}".format(index), activn)
187
+
188
+ # If it's an upsampling layer
189
+ # We use nearest-neighbour upsampling (nn.Upsample with mode="nearest")
190
+
191
+ elif x["type"] == "upsample":
192
+ stride = int(x["stride"])
193
+ # upsample = Upsample(stride)
194
+ upsample = nn.Upsample(scale_factor=2, mode="nearest")
195
+ module.add_module("upsample_{}".format(index), upsample)
196
+
197
+ # If it is a route layer
198
+ elif (x["type"] == "route"):
199
+ x["layers"] = x["layers"].split(',')
200
+
201
+ # Start of a route
202
+ start = int(x["layers"][0])
203
+
204
+ # end, if there exists one.
205
+ try:
206
+ end = int(x["layers"][1])
207
+ except:
208
+ end = 0
209
+
210
+ # Positive annotation
211
+ if start > 0:
212
+ start = start - index
213
+
214
+ if end > 0:
215
+ end = end - index
216
+
217
+ route = EmptyLayer()
218
+ module.add_module("route_{0}".format(index), route)
219
+
220
+ if end < 0:
221
+ filters = output_filters[index + start] + output_filters[index + end]
222
+ else:
223
+ filters = output_filters[index + start]
224
+
225
+ # shortcut corresponds to skip connection
226
+ elif x["type"] == "shortcut":
227
+ from_ = int(x["from"])
228
+ shortcut = EmptyLayer()
229
+ module.add_module("shortcut_{}".format(index), shortcut)
230
+
231
+ elif x["type"] == "maxpool":
232
+ stride = int(x["stride"])
233
+ size = int(x["size"])
234
+ if stride != 1:
235
+ maxpool = nn.MaxPool2d(size, stride)
236
+ else:
237
+ maxpool = MaxPoolStride1(size)
238
+
239
+ module.add_module("maxpool_{}".format(index), maxpool)
240
+
241
+ # Yolo is the detection layer
242
+ elif x["type"] == "yolo":
243
+ mask = x["mask"].split(",")
244
+ mask = [int(x) for x in mask]
245
+
246
+ anchors = x["anchors"].split(",")
247
+ anchors = [int(a) for a in anchors]
248
+ anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)]
249
+ anchors = [anchors[i] for i in mask]
250
+
251
+ detection = DetectionLayer(anchors)
252
+ module.add_module("Detection_{}".format(index), detection)
253
+
254
+ else:
255
+ print("Unrecognised block type: {0}".format(x["type"]))
256
+ assert False
257
+
258
+ module_list.append(module)
259
+ prev_filters = filters
260
+ output_filters.append(filters)
261
+ index += 1
262
+
263
+ return (net_info, module_list)
264
+
265
+
266
+ class Darknet(nn.Module):
267
+ def __init__(self, cfgfile):
268
+ super(Darknet, self).__init__()
269
+ self.blocks = parse_cfg(cfgfile)
270
+ self.net_info, self.module_list = create_modules(self.blocks)
271
+ self.header = torch.IntTensor([0, 0, 0, 0])
272
+ self.seen = 0
273
+
274
+ def get_blocks(self):
275
+ return self.blocks
276
+
277
+ def get_module_list(self):
278
+ return self.module_list
279
+
280
+ def forward(self, x, CUDA):
281
+ detections = []
282
+ modules = self.blocks[1:]
283
+ outputs = {} # We cache the outputs for the route layer
284
+
285
+ write = 0
286
+ for i in range(len(modules)):
287
+
288
+ module_type = (modules[i]["type"])
289
+ if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool":
290
+
291
+ x = self.module_list[i](x)
292
+ outputs[i] = x
293
+
294
+ elif module_type == "route":
295
+ layers = modules[i]["layers"]
296
+ layers = [int(a) for a in layers]
297
+
298
+ if (layers[0]) > 0:
299
+ layers[0] = layers[0] - i
300
+
301
+ if len(layers) == 1:
302
+ x = outputs[i + (layers[0])]
303
+
304
+ else:
305
+ if (layers[1]) > 0:
306
+ layers[1] = layers[1] - i
307
+
308
+ map1 = outputs[i + layers[0]]
309
+ map2 = outputs[i + layers[1]]
310
+
311
+ x = torch.cat((map1, map2), 1)
312
+ outputs[i] = x
313
+
314
+ elif module_type == "shortcut":
315
+ from_ = int(modules[i]["from"])
316
+ x = outputs[i-1] + outputs[i+from_]
317
+ outputs[i] = x
318
+
319
+ elif module_type == 'yolo':
320
+
321
+ anchors = self.module_list[i][0].anchors
322
+ # Get the input dimensions
323
+ inp_dim = int(self.net_info["height"])
324
+
325
+ # Get the number of classes
326
+ num_classes = int(modules[i]["classes"])
327
+
328
+ # Output the result
329
+ x = x.data
330
+ x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
331
+
332
+ if type(x) == int:
333
+ continue
334
+
335
+ if not write:
336
+ detections = x
337
+ write = 1
338
+ else:
339
+ detections = torch.cat((detections, x), 1)
340
+
341
+ outputs[i] = outputs[i-1]
342
+
343
+ try:
344
+ return detections
345
+ except:
346
+ return 0
347
+
348
+ def load_weights(self, weightfile):
349
+ # Introduction: https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/
350
+ # Open the weights file
351
+ # weightfile = os.path.join(sys.path[-1], weightfile)
352
+ fp = open(weightfile, "rb")
353
+
354
+ # The first 5 values are header information
355
+ # 1. Major version number
356
+ # 2. Minor Version Number
357
+ # 3. Subversion number
358
+ # 4, 5. Images seen by the network (during training)
359
+ header = np.fromfile(fp, dtype = np.int32, count = 5)
360
+ self.header = torch.from_numpy(header)
361
+ self.seen = self.header[3]
362
+
363
+ # The rest of the values are the weights
364
+ # Let's load them up
365
+ weights = np.fromfile(fp, dtype = np.float32)
366
+
367
+ ptr = 0
368
+ for i in range(len(self.module_list)):
369
+ module_type = self.blocks[i + 1]["type"]
370
+
371
+ if module_type == "convolutional":
372
+ model = self.module_list[i]
373
+ try:
374
+ batch_normalize = int(self.blocks[i+1]["batch_normalize"])
375
+ except:
376
+ batch_normalize = 0
377
+
378
+ conv = model[0]
379
+
380
+ if (batch_normalize):
381
+ bn = model[1]
382
+
383
+ # Get the number of weights of Batch Norm Layer
384
+ num_bn_biases = bn.bias.numel()
385
+
386
+ # Load the weights
387
+ bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
388
+ ptr += num_bn_biases
389
+
390
+ bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
391
+ ptr += num_bn_biases
392
+
393
+ bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
394
+ ptr += num_bn_biases
395
+
396
+ bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
397
+ ptr += num_bn_biases
398
+
399
+ # Cast the loaded weights into dims of model weights.
400
+ bn_biases = bn_biases.view_as(bn.bias.data)
401
+ bn_weights = bn_weights.view_as(bn.weight.data)
402
+ bn_running_mean = bn_running_mean.view_as(bn.running_mean)
403
+ bn_running_var = bn_running_var.view_as(bn.running_var)
404
+
405
+ # Copy the data to model
406
+ bn.bias.data.copy_(bn_biases)
407
+ bn.weight.data.copy_(bn_weights)
408
+ bn.running_mean.copy_(bn_running_mean)
409
+ bn.running_var.copy_(bn_running_var)
410
+
411
+ else:
412
+ # Number of biases
413
+ num_biases = conv.bias.numel()
414
+
415
+ # Load the weights
416
+ conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
417
+ ptr = ptr + num_biases
418
+
419
+ # reshape the loaded weights according to the dims of the model weights
420
+ conv_biases = conv_biases.view_as(conv.bias.data)
421
+
422
+ # Finally copy the data
423
+ conv.bias.data.copy_(conv_biases)
424
+
425
+ # Let us load the weights for the Convolutional layers
426
+ num_weights = conv.weight.numel()
427
+
428
+ # Do the same as above for weights
429
+ conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
430
+ ptr = ptr + num_weights
431
+
432
+ conv_weights = conv_weights.view_as(conv.weight.data)
433
+ conv.weight.data.copy_(conv_weights)
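A minimal usage sketch for the Darknet class above; the cfg/weights paths and the 416 input size are assumptions for illustration, not the project's canonical entry point (load_model() in human_detector.py is what the pipeline actually uses).

# Hedged sketch: build the model, load weights, and run one dummy forward pass on CPU.
import torch
from darknet import Darknet

model = Darknet("cfg/yolov3.cfg")                                 # path is an assumption
model.load_weights("../../../checkpoint/yolov3/yolov3.weights")   # path is an assumption
model.net_info["height"] = 416                                    # must match the input size (multiple of 32)
model.eval()

dummy = torch.zeros(1, 3, 416, 416)                               # BCHW, values in [0, 1]
with torch.no_grad():
    detections = model(dummy, False)                              # CUDA=False -> stay on CPU
print(detections.shape)                                           # (1, 10647, 85) for a 416x416 input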
VideoToNPZ/lib/detector/yolov3/data/coco.names ADDED
@@ -0,0 +1,80 @@
1
+ person
2
+ bicycle
3
+ car
4
+ motorbike
5
+ aeroplane
6
+ bus
7
+ train
8
+ truck
9
+ boat
10
+ traffic light
11
+ fire hydrant
12
+ stop sign
13
+ parking meter
14
+ bench
15
+ bird
16
+ cat
17
+ dog
18
+ horse
19
+ sheep
20
+ cow
21
+ elephant
22
+ bear
23
+ zebra
24
+ giraffe
25
+ backpack
26
+ umbrella
27
+ handbag
28
+ tie
29
+ suitcase
30
+ frisbee
31
+ skis
32
+ snowboard
33
+ sports ball
34
+ kite
35
+ baseball bat
36
+ baseball glove
37
+ skateboard
38
+ surfboard
39
+ tennis racket
40
+ bottle
41
+ wine glass
42
+ cup
43
+ fork
44
+ knife
45
+ spoon
46
+ bowl
47
+ banana
48
+ apple
49
+ sandwich
50
+ orange
51
+ broccoli
52
+ carrot
53
+ hot dog
54
+ pizza
55
+ donut
56
+ cake
57
+ chair
58
+ sofa
59
+ pottedplant
60
+ bed
61
+ diningtable
62
+ toilet
63
+ tvmonitor
64
+ laptop
65
+ mouse
66
+ remote
67
+ keyboard
68
+ cell phone
69
+ microwave
70
+ oven
71
+ toaster
72
+ sink
73
+ refrigerator
74
+ book
75
+ clock
76
+ vase
77
+ scissors
78
+ teddy bear
79
+ hair drier
80
+ toothbrush
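The detector in this commit keeps only class 0 ('person') from this list. A small sketch, with an illustrative path, of loading the names via load_classes() from util.py (added later in this commit):

# Hedged sketch: read the class list above with load_classes from util.py.
from util import load_classes

classes = load_classes("data/coco.names")   # path is an assumption
print(len(classes), classes[0])             # 80 person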
VideoToNPZ/lib/detector/yolov3/data/pallete ADDED
Binary file (908 Bytes).
VideoToNPZ/lib/detector/yolov3/data/voc.names ADDED
@@ -0,0 +1,20 @@
1
+ aeroplane
2
+ bicycle
3
+ bird
4
+ boat
5
+ bottle
6
+ bus
7
+ car
8
+ cat
9
+ chair
10
+ cow
11
+ diningtable
12
+ dog
13
+ horse
14
+ motorbike
15
+ person
16
+ pottedplant
17
+ sheep
18
+ sofa
19
+ train
20
+ tvmonitor
VideoToNPZ/lib/detector/yolov3/human_detector.py ADDED
@@ -0,0 +1,155 @@
1
+ from __future__ import division
2
+ import time
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ import os
7
+ import sys
8
+ import random
9
+ import pickle as pkl
10
+ import argparse
11
+
12
+ from util import *
13
+ from darknet import Darknet
14
+ from preprocess import letterbox_image
15
+ import preprocess
16
+
17
+
18
+ cur_dir = os.path.dirname(os.path.realpath(__file__))
19
+ project_root = os.path.join(cur_dir, '../../../')
20
+ chk_root = os.path.join(project_root, 'checkpoint/')
21
+ data_root = os.path.join(project_root, 'data/')
22
+
23
+
24
+ sys.path.insert(0, project_root)
25
+ sys.path.pop(0)
26
+
27
+
28
+ def prep_image(img, inp_dim):
29
+ """
30
+ Prepare image for inputting to the neural network.
31
+
32
+ Returns a Variable
33
+ """
34
+ ori_img = img
35
+ dim = ori_img.shape[1], ori_img.shape[0]
36
+ img = cv2.resize(ori_img, (inp_dim, inp_dim))
37
+ img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
38
+ img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
39
+ return img_, ori_img, dim
40
+
41
+
42
+ def write(x, img, colors):
43
+ x = [int(i) for i in x]
44
+ c1 = tuple(x[0:2])
45
+ c2 = tuple(x[2:4])
46
+
47
+ label = 'People {}'.format(0)
48
+ color = (0, 0, 255)
49
+ cv2.rectangle(img, c1, c2, color, 2)
50
+ t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
51
+ c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
52
+ cv2.rectangle(img, c1, c2, color, -1)
53
+ cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
54
+ return img
55
+
56
+
57
+ def arg_parse():
58
+ """
59
+ Parse arguments to the detect module
60
+
61
+ """
62
+ parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo')
63
+ parser.add_argument('--confidence', dest='confidence', type=float, default=0.70,
64
+ help='Object Confidence to filter predictions')
65
+ parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold')
66
+ parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. '
67
+ 'Increase to increase accuracy. Decrease to increase speed. (160, 416)')
68
+ parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights', help='The path'
69
+ ' of the model weight file')
70
+ parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='The path of the model cfg file')
71
+ parser.add_argument('-a', '--animation', action='store_true', help='output animation')
72
+ parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path')
73
+ parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png',
74
+ help='The input image path')
75
+ parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. [1, 2]')
76
+ return parser.parse_args()
77
+
78
+
79
+ def load_model(args=None, CUDA=None, inp_dim=416):
80
+ if args is None:
81
+ args = arg_parse()
82
+
83
+ if CUDA is None:
84
+ CUDA = torch.cuda.is_available()
85
+
86
+ # Set up the neural network
87
+ model = Darknet(args.cfg_file)
88
+ model.load_weights(args.weight_file)
89
+
90
+ model.net_info["height"] = inp_dim
91
+ assert inp_dim % 32 == 0
92
+ assert inp_dim > 32
93
+
94
+ # If there's a GPU available, put the model on GPU
95
+ if CUDA:
96
+ model.cuda()
97
+
98
+ # Set the model in evaluation mode
99
+ model.eval()
100
+
101
+ return model
102
+
103
+
104
+ def yolo_human_det(img, model=None, reso=416, confidence=0.70):
105
+ args = arg_parse()
106
+ # args.reso = reso
107
+ inp_dim = reso
108
+ num_classes = 80
109
+
110
+ CUDA = torch.cuda.is_available()
111
+ if model is None:
112
+ model = load_model(args, CUDA, inp_dim)
113
+
114
+ if type(img) == str:
115
+ assert os.path.isfile(img), 'The image path does not exist'
116
+ img = cv2.imread(img)
117
+
118
+ img, ori_img, img_dim = preprocess.prep_image(img, inp_dim)
119
+ img_dim = torch.FloatTensor(img_dim).repeat(1, 2)
120
+
121
+ with torch.no_grad():
122
+ if CUDA:
123
+ img_dim = img_dim.cuda()
124
+ img = img.cuda()
125
+ output = model(img, CUDA)
126
+ output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True)
127
+
128
+ if len(output) == 0:
129
+ return None, None
130
+
131
+ img_dim = img_dim.repeat(output.size(0), 1)
132
+ scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1)
133
+
134
+ output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2
135
+ output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2
136
+ output[:, 1:5] /= scaling_factor
137
+
138
+ for i in range(output.shape[0]):
139
+ output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0])
140
+ output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1])
141
+
142
+ bboxs = []
143
+ scores = []
144
+ for i in range(len(output)):
145
+ item = output[i]
146
+ bbox = item[1:5].cpu().numpy()
147
+ # convert float32 coordinates to floats rounded to two decimals
148
+ bbox = [round(i, 2) for i in list(bbox)]
149
+ score = item[5].cpu().numpy()
150
+ bboxs.append(bbox)
151
+ scores.append(score)
152
+ scores = np.expand_dims(np.array(scores), 1)
153
+ bboxs = np.array(bboxs)
154
+
155
+ return bboxs, scores
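A rough usage sketch for the detector above; the frame path is a placeholder, and the 416 resolution and 0.70 confidence simply mirror the arg_parse() defaults.

# Hedged sketch: detect people in one frame with the helpers above (paths are placeholders).
import cv2
from human_detector import load_model, yolo_human_det

model = load_model()                                   # Darknet built from the default cfg/weights paths
frame = cv2.imread("frame_0001.jpg")                   # placeholder image path
bboxs, scores = yolo_human_det(frame, model=model, reso=416, confidence=0.70)
if bboxs is None:
    print("no person found")
else:
    for (x1, y1, x2, y2), s in zip(bboxs, scores[:, 0]):
        print("person:", x1, y1, x2, y2, "score", float(s))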
VideoToNPZ/lib/detector/yolov3/preprocess.py ADDED
@@ -0,0 +1,63 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ from PIL import Image
7
+
8
+
9
+ def letterbox_image(img, inp_dim):
10
+ '''resize image with unchanged aspect ratio using padding'''
11
+ img_w, img_h = img.shape[1], img.shape[0]
12
+ w, h = inp_dim
13
+ new_w = int(img_w * min(w/img_w, h/img_h))
14
+ new_h = int(img_h * min(w/img_w, h/img_h))
15
+ resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
16
+
17
+ canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
18
+
19
+ canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image
20
+
21
+ return canvas
22
+
23
+
24
+ def prep_image(img, inp_dim):
25
+ """
26
+ Prepare image for inputting to the neural network.
27
+
28
+ Returns a Variable
29
+ """
30
+ if type(img) == str:
31
+ orig_im = cv2.imread(img)
32
+ else:
33
+ orig_im = img
34
+ dim = orig_im.shape[1], orig_im.shape[0]
35
+ img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
36
+ img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
37
+ img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
38
+ return img_, orig_im, dim
39
+
40
+
41
+ def prep_image_pil(img, network_dim):
42
+ orig_im = Image.open(img)
43
+ img = orig_im.convert('RGB')
44
+ dim = img.size
45
+ img = img.resize(network_dim)
46
+ img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
47
+ img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous()
48
+ img = img.view(1, 3, *network_dim)
49
+ img = img.float().div(255.0)
50
+ return img, orig_im, dim
51
+
52
+
53
+ def inp_to_image(inp):
54
+ inp = inp.cpu().squeeze()
55
+ inp = inp * 255
56
+ try:
57
+ inp = inp.data.numpy()
58
+ except RuntimeError:
59
+ inp = inp.numpy()
60
+ inp = inp.transpose(1, 2, 0)
61
+
62
+ inp = inp[:, :, ::-1]
63
+ return inp
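For reference, a tiny sketch of what letterbox_image() above does to a 16:9 frame; the numbers are illustrative.

# Hedged sketch: letterbox a 1920x1080 frame to 416x416 without changing its aspect ratio.
import numpy as np
from preprocess import letterbox_image

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)   # H x W x C
canvas = letterbox_image(frame, (416, 416))
print(canvas.shape)                                  # (416, 416, 3)
# The content is resized to 416x234 and centred on a grey (value 128) canvas.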
VideoToNPZ/lib/detector/yolov3/util.py ADDED
@@ -0,0 +1,225 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ import os.path as osp
7
+ from bbox import bbox_iou
8
+
9
+
10
+ def get_path(cur_file):
11
+ cur_dir = osp.dirname(osp.realpath(cur_file))
12
+ project_root = osp.join(cur_dir, '../../../')
13
+ chk_root = osp.join(project_root, 'checkpoint/')
14
+ data_root = osp.join(project_root, 'data/')
15
+
16
+ return project_root, chk_root, data_root, cur_dir
17
+
18
+
19
+ def count_parameters(model):
20
+ return sum(p.numel() for p in model.parameters())
21
+
22
+
23
+ def count_learnable_parameters(model):
24
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
25
+
26
+
27
+ def convert2cpu(matrix):
28
+ if matrix.is_cuda:
29
+ return torch.FloatTensor(matrix.size()).copy_(matrix)
30
+ else:
31
+ return matrix
32
+
33
+
34
+ def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True):
35
+ batch_size = prediction.size(0)
36
+ stride = inp_dim // prediction.size(2)
37
+ grid_size = inp_dim // stride
38
+ bbox_attrs = 5 + num_classes
39
+ num_anchors = len(anchors)
40
+
41
+ anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
42
+
43
+ prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
44
+ prediction = prediction.transpose(1, 2).contiguous()
45
+ prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
46
+
47
+ # Sigmoid the centre_X, centre_Y, and object confidence
48
+ prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
49
+ prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
50
+ prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])
51
+
52
+ # Add the center offsets
53
+ grid_len = np.arange(grid_size)
54
+ a, b = np.meshgrid(grid_len, grid_len)
55
+
56
+ x_offset = torch.FloatTensor(a).view(-1, 1)
57
+ y_offset = torch.FloatTensor(b).view(-1, 1)
58
+
59
+ if CUDA:
60
+ x_offset = x_offset.cuda()
61
+ y_offset = y_offset.cuda()
62
+
63
+ x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
64
+
65
+ prediction[:, :, :2] += x_y_offset
66
+
67
+ # log space transform height and the width
68
+ anchors = torch.FloatTensor(anchors)
69
+
70
+ if CUDA:
71
+ anchors = anchors.cuda()
72
+
73
+ anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
74
+ prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4])*anchors
75
+
76
+ # Sigmoid the class scores (YOLOv3 uses independent logistic classifiers, not softmax)
77
+ prediction[:, :, 5: 5 + num_classes] = torch.sigmoid((prediction[:, :, 5: 5 + num_classes]))
78
+
79
+ prediction[:, :, :4] *= stride
80
+
81
+ return prediction
82
+
83
+
84
+ def load_classes(namesfile):
85
+ fp = open(namesfile, "r")
86
+ names = fp.read().split("\n")[:-1]
87
+ return names
88
+
89
+
90
+ def get_im_dim(im):
91
+ im = cv2.imread(im)
92
+ w, h = im.shape[1], im.shape[0]
93
+ return w, h
94
+
95
+
96
+ def unique(tensor):
97
+ tensor_np = tensor.cpu().numpy()
98
+ unique_np = np.unique(tensor_np)
99
+ unique_tensor = torch.from_numpy(unique_np)
100
+
101
+ tensor_res = tensor.new(unique_tensor.shape)
102
+ tensor_res.copy_(unique_tensor)
103
+ return tensor_res
104
+
105
+
106
+ # ADD SOFT NMS
107
+ def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4, det_hm=False):
108
+ """
109
+ https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-4/
110
+ prediction: (B x 10647 x 85)
111
+ B: the number of images in a batch,
112
+ 10647: the number of bounding boxes predicted per image. (52×52+26×26+13×13)×3=10647
113
+ 85: the number of bounding box attributes. (c_x, c_y, w, h, object confidence, and 80 class scores)
114
+
115
+ output: Num_obj × [img_index, x_1, y_1, x_2, y_2, object confidence, class_score, label_index]
116
+ """
117
+
118
+ conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
119
+ prediction = prediction*conf_mask
120
+
121
+ box_a = prediction.new(prediction.shape)
122
+ box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2)
123
+ box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2)
124
+ box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2)
125
+ box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2)
126
+ prediction[:, :, :4] = box_a[:, :, :4]
127
+
128
+ batch_size = prediction.size(0)
129
+
130
+ output = prediction.new(1, prediction.size(2) + 1)
131
+ write = False
132
+
133
+ for ind in range(batch_size):
134
+ # select the image from the batch
135
+ image_pred = prediction[ind]
136
+
137
+ # Get the class having maximum score, and the index of that class
138
+ # Get rid of num_classes softmax scores
139
+ # Add the class index and the class score of class having maximum score
140
+ max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1)
141
+ max_conf = max_conf.float().unsqueeze(1)
142
+ max_conf_index = max_conf_index.float().unsqueeze(1)
143
+ seq = (image_pred[:, :5], max_conf, max_conf_index)
144
+ image_pred = torch.cat(seq, 1) # image_pred:(10647, 7) 7:[x1, y1, x2, y2, obj_score, max_conf, max_conf_index]
145
+
146
+ # Get rid of the zero entries
147
+ non_zero_ind = (torch.nonzero(image_pred[:, 4]))
148
+ image_pred__ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)
149
+
150
+ # keep only 'person' detections (class index 0)
151
+ if det_hm:
152
+ cls_mask = (image_pred__[:, -1] == 0).float()
153
+ class_mask_ind = torch.nonzero(cls_mask).squeeze()
154
+ image_pred_ = image_pred__[class_mask_ind].view(-1, 7)
155
+
156
+ if torch.sum(cls_mask) == 0:
157
+ return image_pred_
158
+ else:
159
+ image_pred_ = image_pred__
160
+
161
+ # Get the various classes detected in the image
162
+ try:
163
+ # img_classes = unique(image_pred_[:, -1])
164
+ img_classes = torch.unique(image_pred_[:, -1], sorted=True).float()
165
+ except:
166
+ continue
167
+
168
+ # We will do NMS classwise
169
+ # import ipdb;ipdb.set_trace()
170
+ for cls in img_classes:
171
+ # get the detections with one particular class
172
+ cls_mask = image_pred_*(image_pred_[:, -1] == cls).float().unsqueeze(1)
173
+ class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
174
+ image_pred_class = image_pred_[class_mask_ind].view(-1, 7)
175
+
176
+ # sort the detections such that the entry with the maximum objectness
177
+ # confidence is at the top
178
+ conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
179
+ image_pred_class = image_pred_class[conf_sort_index]
180
+ idx = image_pred_class.size(0)
181
+
182
+ # from soft_NMS import soft_nms
183
+ # boxes = image_pred_class[:,:4]
184
+ # scores = image_pred_class[:, 4]
185
+ # k, N = soft_nms(boxes, scores, method=2)
186
+ # image_pred_class = image_pred_class[k]
187
+
188
+ # if nms has to be done
189
+ if nms:
190
+ # For each detection
191
+ for i in range(idx):
192
+ # Get the IOUs of all boxes that come after the one we are looking at
193
+ # in the loop
194
+ try:
195
+ ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
196
+ except ValueError:
197
+ break
198
+
199
+ except IndexError:
200
+ break
201
+
202
+ # Zero out all the detections that have IoU > threshold
203
+ iou_mask = (ious < nms_conf).float().unsqueeze(1)
204
+ image_pred_class[i+1:] *= iou_mask
205
+
206
+ # Remove the zero entries
207
+ non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
208
+ image_pred_class = image_pred_class[non_zero_ind].view(-1, 7)
209
+
210
+ # Concatenate the batch_id of the image to the detection
211
+ # this helps us identify which image does the detection correspond to
212
+ # We use a linear structure to hold ALL the detections from the batch
213
+ # the batch_dim is flattened
214
+ # batch is identified by extra batch column
215
+
216
+ batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
217
+ seq = batch_ind, image_pred_class
218
+ if not write:
219
+ output = torch.cat(seq, 1)
220
+ write = True
221
+ else:
222
+ out = torch.cat(seq, 1)
223
+ output = torch.cat((output, out))
224
+
225
+ return output
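A short, self-contained sketch of how a write_results() row is typically unpacked; the tensor here is fabricated purely to show the column layout documented in the docstring above.

# Hedged sketch: one fake detection row in the write_results() layout
# [batch_index, x1, y1, x2, y2, objectness, class_score, class_index].
import torch

output = torch.tensor([[0.0, 10.0, 20.0, 110.0, 220.0, 0.95, 0.88, 0.0]])
for det in output:
    batch_idx, cls_id = int(det[0]), int(det[7])
    x1, y1, x2, y2 = det[1:5].tolist()
    print(batch_idx, cls_id, (x1, y1, x2, y2), float(det[5]), float(det[6]))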
VideoToNPZ/lib/pose/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ import sys
2
+ import os.path as osp
3
+
4
+ sys.path.insert(1, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/pose_estimation'))
5
+ from gen_kpts import gen_img_kpts, gen_video_kpts, load_default_model
6
+ sys.path.insert(2, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/lib/utils'))
7
+ from utilitys import plot_keypoint, write, PreProcess, box_to_center_scale, load_json
8
+
9
+ sys.path.pop(1)
10
+ sys.path.pop(2)
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 192
33
+ - 256
34
+ HEATMAP_SIZE:
35
+ - 48
36
+ - 64
37
+ SIGMA: 2
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 32
61
+ - 64
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 32
73
+ - 64
74
+ - 128
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 32
87
+ - 64
88
+ - 128
89
+ - 256
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
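As a sanity check, one way to inspect an experiment config like the one above is with plain PyYAML; the repo's own config loader may apply defaults on top of this, so the sketch below is illustrative only.

# Hedged sketch: load the YAML above directly and print a couple of fields.
import yaml

with open("w32_256x192_adam_lr1e-3.yaml") as f:   # path is illustrative
    cfg = yaml.safe_load(f)

print(cfg["MODEL"]["IMAGE_SIZE"])    # [192, 256]  (width, height)
print(cfg["MODEL"]["HEATMAP_SIZE"])  # [48, 64]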
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 288
33
+ - 384
34
+ HEATMAP_SIZE:
35
+ - 72
36
+ - 96
37
+ SIGMA: 3
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 32
61
+ - 64
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 32
73
+ - 64
74
+ - 128
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 32
87
+ - 64
88
+ - 128
89
+ - 256
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 192
33
+ - 256
34
+ HEATMAP_SIZE:
35
+ - 48
36
+ - 64
37
+ SIGMA: 2
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 48
61
+ - 96
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 48
73
+ - 96
74
+ - 192
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 48
87
+ - 96
88
+ - 192
89
+ - 384
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 288
33
+ - 384
34
+ HEATMAP_SIZE:
35
+ - 72
36
+ - 96
37
+ SIGMA: 3
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 48
61
+ - 96
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 48
73
+ - 96
74
+ - 192
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 48
87
+ - 96
88
+ - 192
89
+ - 384
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 24
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 24
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: false
15
+ DATASET: 'coco'
16
+ ROOT: 'data/coco/'
17
+ TEST_SET: 'val2017'
18
+ TRAIN_SET: 'train2017'
19
+ FLIP: true
20
+ ROT_FACTOR: 40
21
+ SCALE_FACTOR: 0.3
22
+ MODEL:
23
+ NAME: 'pose_resnet'
24
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
25
+ IMAGE_SIZE:
26
+ - 192
27
+ - 256
28
+ HEATMAP_SIZE:
29
+ - 48
30
+ - 64
31
+ SIGMA: 2
32
+ NUM_JOINTS: 17
33
+ TARGET_TYPE: 'gaussian'
34
+ EXTRA:
35
+ FINAL_CONV_KERNEL: 1
36
+ DECONV_WITH_BIAS: false
37
+ NUM_DECONV_LAYERS: 3
38
+ NUM_DECONV_FILTERS:
39
+ - 256
40
+ - 256
41
+ - 256
42
+ NUM_DECONV_KERNELS:
43
+ - 4
44
+ - 4
45
+ - 4
46
+ NUM_LAYERS: 101
47
+ LOSS:
48
+ USE_TARGET_WEIGHT: true
49
+ TRAIN:
50
+ BATCH_SIZE_PER_GPU: 32
51
+ SHUFFLE: true
52
+ BEGIN_EPOCH: 0
53
+ END_EPOCH: 140
54
+ OPTIMIZER: 'adam'
55
+ LR: 0.001
56
+ LR_FACTOR: 0.1
57
+ LR_STEP:
58
+ - 90
59
+ - 120
60
+ WD: 0.0001
61
+ GAMMA1: 0.99
62
+ GAMMA2: 0.0
63
+ MOMENTUM: 0.9
64
+ NESTEROV: false
65
+ TEST:
66
+ BATCH_SIZE_PER_GPU: 32
67
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
68
+ BBOX_THRE: 1.0
69
+ IMAGE_THRE: 0.0
70
+ IN_VIS_THRE: 0.2
71
+ MODEL_FILE: ''
72
+ NMS_THRE: 1.0
73
+ OKS_THRE: 0.9
74
+ FLIP_TEST: true
75
+ POST_PROCESS: true
76
+ SHIFT_HEATMAP: true
77
+ USE_GT_BBOX: true
78
+ DEBUG:
79
+ DEBUG: true
80
+ SAVE_BATCH_IMAGES_GT: true
81
+ SAVE_BATCH_IMAGES_PRED: true
82
+ SAVE_HEATMAPS_GT: true
83
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 101
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 192
+ - 256
+ HEATMAP_SIZE:
+ - 48
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 192
+ - 256
+ HEATMAP_SIZE:
+ - 48
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,120 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: true
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ INIT_WEIGHTS: true
+ NAME: pose_hrnet
+ NUM_JOINTS: 16
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
+ TARGET_TYPE: gaussian
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ EXTRA:
+ PRETRAINED_LAYERS:
+ - 'conv1'
+ - 'bn1'
+ - 'conv2'
+ - 'bn2'
+ - 'layer1'
+ - 'transition1'
+ - 'stage2'
+ - 'transition2'
+ - 'stage3'
+ - 'transition3'
+ - 'stage4'
+ FINAL_CONV_KERNEL: 1
+ STAGE2:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 2
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ FUSE_METHOD: SUM
+ STAGE3:
+ NUM_MODULES: 4
+ NUM_BRANCHES: 3
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ - 128
+ FUSE_METHOD: SUM
+ STAGE4:
+ NUM_MODULES: 3
+ NUM_BRANCHES: 4
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ - 128
+ - 256
+ FUSE_METHOD: SUM
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 210
+ OPTIMIZER: adam
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 170
+ - 200
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ MODEL_FILE: ''
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,120 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: true
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ INIT_WEIGHTS: true
+ NAME: pose_hrnet
+ NUM_JOINTS: 16
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
+ TARGET_TYPE: gaussian
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ EXTRA:
+ PRETRAINED_LAYERS:
+ - 'conv1'
+ - 'bn1'
+ - 'conv2'
+ - 'bn2'
+ - 'layer1'
+ - 'transition1'
+ - 'stage2'
+ - 'transition2'
+ - 'stage3'
+ - 'transition3'
+ - 'stage4'
+ FINAL_CONV_KERNEL: 1
+ STAGE2:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 2
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ FUSE_METHOD: SUM
+ STAGE3:
+ NUM_MODULES: 4
+ NUM_BRANCHES: 3
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ - 192
+ FUSE_METHOD: SUM
+ STAGE4:
+ NUM_MODULES: 3
+ NUM_BRANCHES: 4
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ - 192
+ - 384
+ FUSE_METHOD: SUM
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 210
+ OPTIMIZER: adam
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 170
+ - 200
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ MODEL_FILE: ''
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 101
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/lib/Makefile ADDED
@@ -0,0 +1,4 @@
+ all:
+ cd nms; python setup_linux.py build_ext --inplace; rm -rf build; cd ../../
+ clean:
+ cd nms; rm *.so; cd ../../
VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from .default import _C as cfg
+ from .default import update_config
+ from .models import MODEL_EXTRAS
VideoToNPZ/lib/pose/hrnet/lib/config/default.py ADDED
@@ -0,0 +1,160 @@
+
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import os
+
+ from yacs.config import CfgNode as CN
+
+
+ _C = CN()
+
+ _C.OUTPUT_DIR = ''
+ _C.LOG_DIR = ''
+ _C.DATA_DIR = ''
+ _C.GPUS = (0,)
+ _C.WORKERS = 4
+ _C.PRINT_FREQ = 20
+ _C.AUTO_RESUME = False
+ _C.PIN_MEMORY = True
+ _C.RANK = 0
+
+ # Cudnn related params
+ _C.CUDNN = CN()
+ _C.CUDNN.BENCHMARK = True
+ _C.CUDNN.DETERMINISTIC = False
+ _C.CUDNN.ENABLED = True
+
+ # common params for NETWORK
+ _C.MODEL = CN()
+ _C.MODEL.NAME = 'pose_hrnet'
+ _C.MODEL.INIT_WEIGHTS = True
+ _C.MODEL.PRETRAINED = ''
+ _C.MODEL.NUM_JOINTS = 17
+ _C.MODEL.TAG_PER_JOINT = True
+ _C.MODEL.TARGET_TYPE = 'gaussian'
+ _C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256
+ _C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32
+ _C.MODEL.SIGMA = 2
+ _C.MODEL.EXTRA = CN(new_allowed=True)
+
+ _C.LOSS = CN()
+ _C.LOSS.USE_OHKM = False
+ _C.LOSS.TOPK = 8
+ _C.LOSS.USE_TARGET_WEIGHT = True
+ _C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False
+
+ # DATASET related params
+ _C.DATASET = CN()
+ _C.DATASET.ROOT = ''
+ _C.DATASET.DATASET = 'mpii'
+ _C.DATASET.TRAIN_SET = 'train'
+ _C.DATASET.TEST_SET = 'valid'
+ _C.DATASET.DATA_FORMAT = 'jpg'
+ _C.DATASET.HYBRID_JOINTS_TYPE = ''
+ _C.DATASET.SELECT_DATA = False
+
+ # training data augmentation
+ _C.DATASET.FLIP = True
+ _C.DATASET.SCALE_FACTOR = 0.25
+ _C.DATASET.ROT_FACTOR = 30
+ _C.DATASET.PROB_HALF_BODY = 0.0
+ _C.DATASET.NUM_JOINTS_HALF_BODY = 8
+ _C.DATASET.COLOR_RGB = False
+
+ # train
+ _C.TRAIN = CN()
+
+ _C.TRAIN.LR_FACTOR = 0.1
+ _C.TRAIN.LR_STEP = [90, 110]
+ _C.TRAIN.LR = 0.001
+
+ _C.TRAIN.OPTIMIZER = 'adam'
+ _C.TRAIN.MOMENTUM = 0.9
+ _C.TRAIN.WD = 0.0001
+ _C.TRAIN.NESTEROV = False
+ _C.TRAIN.GAMMA1 = 0.99
+ _C.TRAIN.GAMMA2 = 0.0
+
+ _C.TRAIN.BEGIN_EPOCH = 0
+ _C.TRAIN.END_EPOCH = 140
+
+ _C.TRAIN.RESUME = False
+ _C.TRAIN.CHECKPOINT = ''
+
+ _C.TRAIN.BATCH_SIZE_PER_GPU = 32
+ _C.TRAIN.SHUFFLE = True
+
+ # testing
+ _C.TEST = CN()
+
+ # size of images for each device
+ _C.TEST.BATCH_SIZE_PER_GPU = 32
+ # Test Model Epoch
+ _C.TEST.FLIP_TEST = False
+ _C.TEST.POST_PROCESS = False
+ _C.TEST.SHIFT_HEATMAP = False
+
+ _C.TEST.USE_GT_BBOX = False
+
+ # nms
+ _C.TEST.IMAGE_THRE = 0.1
+ _C.TEST.NMS_THRE = 0.6
+ _C.TEST.SOFT_NMS = False
+ _C.TEST.OKS_THRE = 0.5
+ _C.TEST.IN_VIS_THRE = 0.0
+ _C.TEST.COCO_BBOX_FILE = ''
+ _C.TEST.BBOX_THRE = 1.0
+ _C.TEST.MODEL_FILE = ''
+
+ # debug
+ _C.DEBUG = CN()
+ _C.DEBUG.DEBUG = False
+ _C.DEBUG.SAVE_BATCH_IMAGES_GT = False
+ _C.DEBUG.SAVE_BATCH_IMAGES_PRED = False
+ _C.DEBUG.SAVE_HEATMAPS_GT = False
+ _C.DEBUG.SAVE_HEATMAPS_PRED = False
+
+
+ def update_config(cfg, args):
+ cfg.defrost()
+ cfg.merge_from_file(args.cfg)
+ cfg.merge_from_list(args.opts)
+
+ if args.modelDir:
+ cfg.OUTPUT_DIR = args.modelDir
+
+ # if args.logDir:
+ # cfg.LOG_DIR = args.logDir
+ #
+ # if args.dataDir:
+ # cfg.DATA_DIR = args.dataDir
+ #
+ # cfg.DATASET.ROOT = os.path.join(
+ # cfg.DATA_DIR, cfg.DATASET.ROOT
+ # )
+ #
+ # cfg.MODEL.PRETRAINED = os.path.join(
+ # cfg.DATA_DIR, cfg.MODEL.PRETRAINED
+ # )
+ #
+ # if cfg.TEST.MODEL_FILE:
+ # cfg.TEST.MODEL_FILE = os.path.join(
+ # cfg.DATA_DIR, cfg.TEST.MODEL_FILE
+ # )
+
+ cfg.freeze()
+
+
+ if __name__ == '__main__':
+ import sys
+ with open(sys.argv[1], 'w') as f:
+ print(_C, file=f)
+
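update_config is the single entry point that combines these yacs defaults with an experiment YAML and command-line overrides: it defrosts the node, merges the file, merges KEY VALUE pairs, optionally redirects OUTPUT_DIR, and freezes again. A minimal sketch of a call follows; it assumes the working directory is VideoToNPZ/lib/pose/hrnet/lib (so the config package and the ../experiments path resolve), and the modelDir value is a placeholder, not a directory shipped in this commit.

from types import SimpleNamespace

from config import cfg, update_config  # exported by config/__init__.py above

# stand-in for the argparse namespace the real training/inference scripts build
args = SimpleNamespace(
    cfg='../experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml',
    opts=['TEST.BATCH_SIZE_PER_GPU', '16'],  # KEY VALUE pairs for merge_from_list
    modelDir='output/demo',                  # placeholder output directory
)
update_config(cfg, args)
print(cfg.MODEL.NAME, cfg.TEST.BATCH_SIZE_PER_GPU)  # pose_hrnet 16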
VideoToNPZ/lib/pose/hrnet/lib/config/models.py ADDED
@@ -0,0 +1,58 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ from yacs.config import CfgNode as CN
+
+
+ # pose_resnet related params
+ POSE_RESNET = CN()
+ POSE_RESNET.NUM_LAYERS = 50
+ POSE_RESNET.DECONV_WITH_BIAS = False
+ POSE_RESNET.NUM_DECONV_LAYERS = 3
+ POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256]
+ POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4]
+ POSE_RESNET.FINAL_CONV_KERNEL = 1
+ POSE_RESNET.PRETRAINED_LAYERS = ['*']
+
+ # pose_multi_resoluton_net related params
+ POSE_HIGH_RESOLUTION_NET = CN()
+ POSE_HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*']
+ POSE_HIGH_RESOLUTION_NET.STEM_INPLANES = 64
+ POSE_HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1
+
+ POSE_HIGH_RESOLUTION_NET.STAGE2 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64]
+ POSE_HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'SUM'
+
+ POSE_HIGH_RESOLUTION_NET.STAGE3 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128]
+ POSE_HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'SUM'
+
+ POSE_HIGH_RESOLUTION_NET.STAGE4 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
+ POSE_HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'SUM'
+
+
+ MODEL_EXTRAS = {
+ 'pose_resnet': POSE_RESNET,
+ 'pose_high_resolution_net': POSE_HIGH_RESOLUTION_NET,
+ }
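MODEL_EXTRAS is simply a name-to-defaults lookup table for the EXTRA sub-config of the two supported backbones. A minimal query sketch, under the same import-path assumption as above (working directory VideoToNPZ/lib/pose/hrnet/lib):

from config.models import MODEL_EXTRAS

# the two keys defined in this module
extra = MODEL_EXTRAS['pose_resnet']   # or 'pose_high_resolution_net'
print(extra.NUM_LAYERS)               # 50
print(extra.NUM_DECONV_FILTERS)       # [256, 256, 256]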
VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py ADDED
@@ -0,0 +1,16 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import models.pose_resnet
+ import models.pose_hrnet
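The duplicated __future__ imports are redundant but harmless; the two submodule imports make pose_resnet and pose_hrnet available as attributes of the models package. In the upstream HRNet code the network is then built by looking the submodule up by cfg.MODEL.NAME; the sketch below assumes that pattern and that each submodule exposes a get_pose_net(cfg, is_train) factory, neither of which is shown in this diff.

from config import cfg   # the yacs node from lib/config, assumed already merged with a YAML

import models  # the package defined by this __init__.py (assumes lib/ is on the import path)

# hypothetical builder lookup; get_pose_net is an assumption based on the upstream repo
pose_module = getattr(models, cfg.MODEL.NAME)        # e.g. models.pose_hrnet
net = pose_module.get_pose_net(cfg, is_train=False)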