Amanpreet committed
Commit 1cdc47e · 1 Parent(s): 4276ea6
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +7 -0
  3. VideoToNPZ/INFERENCE_EN.md +2 -0
  4. VideoToNPZ/checkpoint/gastnet/81_frame_model.bin +3 -0
  5. VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth +3 -0
  6. VideoToNPZ/checkpoint/yolov3/yolov3.weights +3 -0
  7. VideoToNPZ/common/arguments.py +86 -0
  8. VideoToNPZ/common/camera.py +63 -0
  9. VideoToNPZ/common/generators.py +236 -0
  10. VideoToNPZ/common/graph_utils.py +45 -0
  11. VideoToNPZ/common/loss.py +90 -0
  12. VideoToNPZ/common/quaternion.py +36 -0
  13. VideoToNPZ/common/skeleton.py +81 -0
  14. VideoToNPZ/data/data_utils.py +95 -0
  15. VideoToNPZ/gen_skes.py +116 -0
  16. VideoToNPZ/lib/detector/__init__.py +6 -0
  17. VideoToNPZ/lib/detector/yolov3/__init__.py +0 -0
  18. VideoToNPZ/lib/detector/yolov3/bbox.py +111 -0
  19. VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg +134 -0
  20. VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg +258 -0
  21. VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg +258 -0
  22. VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg +789 -0
  23. VideoToNPZ/lib/detector/yolov3/darknet.py +433 -0
  24. VideoToNPZ/lib/detector/yolov3/data/coco.names +80 -0
  25. VideoToNPZ/lib/detector/yolov3/data/pallete +0 -0
  26. VideoToNPZ/lib/detector/yolov3/data/voc.names +20 -0
  27. VideoToNPZ/lib/detector/yolov3/human_detector.py +155 -0
  28. VideoToNPZ/lib/detector/yolov3/preprocess.py +63 -0
  29. VideoToNPZ/lib/detector/yolov3/util.py +225 -0
  30. VideoToNPZ/lib/pose/__init__.py +10 -0
  31. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml +127 -0
  32. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml +127 -0
  33. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml +127 -0
  34. VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml +127 -0
  35. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  36. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  37. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  38. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  39. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml +83 -0
  40. VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml +83 -0
  41. VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml +120 -0
  42. VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml +120 -0
  43. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  44. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  45. VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml +86 -0
  46. VideoToNPZ/lib/pose/hrnet/lib/Makefile +4 -0
  47. VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py +9 -0
  48. VideoToNPZ/lib/pose/hrnet/lib/config/default.py +160 -0
  49. VideoToNPZ/lib/pose/hrnet/lib/config/models.py +58 -0
  50. VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py +16 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.weights filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ venv/
+ __pycache__/
+ *.pyc
+ *.bvh
+ *.obj
+ *.npz
+ *.mp4
VideoToNPZ/INFERENCE_EN.md ADDED
@@ -0,0 +1,2 @@
+
+ python gen_skes.py -v baseball.mp4
VideoToNPZ/checkpoint/gastnet/81_frame_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3150eb3125ca66242a888fd06b4eb7d8a8b755607370225c24f0b9c794d35cc4
+ size 28333160
VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95e0fec3194826d5e3f806ea89be68bbb84517b114c3a32b3058c56610b5ef61
+ size 255061287
VideoToNPZ/checkpoint/yolov3/yolov3.weights ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297
+ size 248007048
VideoToNPZ/common/arguments.py ADDED
@@ -0,0 +1,86 @@
+ import argparse
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Training script')
+
+     # General arguments
+     parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME',
+         help='target dataset')  # h36m or humaneva
+     parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME',
+         help='2D detections to use')
+     parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST',
+         help='training subjects separated by comma')
+     parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST',
+         help='test subjects separated by comma')
+     parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST',
+         help='actions to train/test on, separated by comma, or * for all')
+     parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
+         help='checkpoint directory')
+     parser.add_argument('--checkpoint-frequency', default=10, type=int, metavar='N',
+         help='create a checkpoint every N epochs')
+     parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME',
+         help='checkpoint to resume (file name)')
+     parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME',
+         help='checkpoint to evaluate (file name)')
+     parser.add_argument('--render', action='store_true', help='visualize a particular video')
+     parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)')
+     parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images')
+
+     # Model arguments
+     parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training')
+     parser.add_argument('-arc', '--architecture', default='3,3,3', type=str, metavar='LAYERS',
+         help='filter widths separated by comma')
+     parser.add_argument('--causal', action='store_true', help='use causal convolutions for real-time processing')
+     parser.add_argument('-ch', '--channels', default=128, type=int, metavar='N',
+         help='number of channels in convolution layers')
+
+     # Experimental setting
+     parser.add_argument('-e', '--epochs', default=60, type=int, metavar='N', help='number of training epochs')
+     parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N',
+         help='batch size in terms of predicted frames')
+     parser.add_argument('-drop', '--dropout', default=0.05, type=float, metavar='P', help='dropout probability')
+     parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate')
+     parser.add_argument('-lrd', '--lr-decay', default=0.95, type=float, metavar='LR',
+         help='learning rate decay per epoch')
+     parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false',
+         help='disable train-time flipping')
+     parser.add_argument('-no-tta', '--no-test-time-augmentation', dest='test_time_augmentation', action='store_false',
+         help='disable test-time flipping')
+     parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction')
+     parser.add_argument('--downsample', default=5, type=int, metavar='FACTOR',
+         help='downsample frame rate by factor (semi-supervised)')
+     parser.add_argument('--no-eval', action='store_true',
+         help='disable epoch evaluation while training (small speed-up)')
+     parser.add_argument('--disable-optimizations', action='store_true',
+         help='disable optimized model for single-frame predictions')
+
+     # Visualization
+     parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render')
+     parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render')
+     parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render')
+     parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video')
+     parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video')
+     parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)')
+     parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates')
+     parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos')
+     parser.add_argument('--viz-no-ground-truth', action='store_true', help='do not show ground-truth poses')
+     parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames')
+     parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N')
+     parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size')
+
+     parser.set_defaults(bone_length_term=True)
+     parser.set_defaults(data_augmentation=True)
+     parser.set_defaults(test_time_augmentation=True)
+
+     args = parser.parse_args()
+     # Check invalid configuration
+     if args.resume and args.evaluate:
+         print('Invalid flags: --resume and --evaluate cannot be set at the same time')
+         exit()
+
+     if args.export_training_curves and args.no_eval:
+         print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time')
+         exit()
+
+     return args
VideoToNPZ/common/camera.py ADDED
@@ -0,0 +1,63 @@
+ import numpy as np
+ import torch
+
+ from tools.utils import wrap
+ from common.quaternion import qort, qinverse
+
+
+ def normalize_screen_coordinates(X, w, h):
+     assert X.shape[-1] == 2
+
+     # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
+     return X/w*2 - [1, h/w]
+
+
+ def image_coordinates(X, w, h):
+     assert X.shape[-1] == 2
+
+     # Reverse camera frame normalization
+     return (X + [1, h/w]) * w / 2
+
+
+ def world_to_camera(X, R, t):
+     Rt = wrap(qinverse, R)  # Invert rotation
+     return wrap(qort, np.tile(Rt, (*X.shape[:-1], 1)), X - t)  # Rotate and translate
+
+
+ def camera_to_world(X, R, t):
+     return wrap(qort, np.tile(R, (*X.shape[:-1], 1)), X) + t
+
+
+ def project_to_2d(X, camera_params):
+     """
+     Project 3D points to 2D using the Human3.6M camera projection function.
+     This is a differentiable and batched reimplementation of the original MATLAB script.
+
+     Arguments:
+     X -- 3D points in *camera space* to transform (N, *, 3)
+     camera_params -- intrinsic parameteres (N, 2+2+3+2=9)
+     """
+     assert X.shape[-1] == 3
+     assert len(camera_params.shape) == 2
+     assert camera_params.shape[-1] == 9
+     assert X.shape[0] == camera_params.shape[0]
+
+     while len(camera_params.shape) < len(X.shape):
+         camera_params = camera_params.unsqueeze(1)
+
+     f = camera_params[..., :2]
+     c = camera_params[..., 2:4]
+     k = camera_params[..., 4:7]
+     p = camera_params[..., 7:]
+
+     # XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
+     XX = X[..., :2] / X[..., 2:]
+     r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True)
+
+     radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True)
+     tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True)
+
+     XXX = XX*(radial + tan) + p*r2
+
+     return f*XXX + c
+
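The two screen-coordinate helpers above are exact inverses; a minimal sketch of the round trip on hypothetical pixel coordinates (frame size and point values are illustrative, not part of the commit):

    import numpy as np

    w, h = 1920, 1080
    pts = np.array([[960.0, 540.0], [100.0, 200.0]])   # hypothetical 2D keypoints in pixels

    # normalize_screen_coordinates: map [0, w] to [-1, 1], preserving the aspect ratio
    norm = pts / w * 2 - [1, h / w]
    # image_coordinates: undo the normalization
    back = (norm + [1, h / w]) * w / 2

    assert np.allclose(back, pts)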
VideoToNPZ/common/generators.py ADDED
@@ -0,0 +1,236 @@
+ from itertools import zip_longest
+ import numpy as np
+
+
+ class ChunkedGenerator:
+     """
+     Batched data generator, used for training.
+     The sequences are split into equal-length chunks and padded as necessary.
+
+     Arguments:
+     batch_size -- the batch size to use for training
+     cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
+     poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
+     poses_2d -- list of input 2D keypoints, one element for each video
+     chunk_length -- number of output frames to predict for each training example (usually 1)
+     pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
+     causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
+     shuffle -- randomly shuffle the dataset before each epoch
+     random_seed -- initial seed to use for the random generator
+     augment -- augment the dataset by flipping poses horizontally
+     kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
+     joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
+     """
+     def __init__(self, batch_size, cameras, poses_3d, poses_2d,
+                  chunk_length, pad=0, causal_shift=0,
+                  shuffle=True, random_seed=1234,
+                  augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
+                  endless=False):
+         assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
+         assert cameras is None or len(cameras) == len(poses_2d)
+
+         # Build lineage info
+         pairs = []  # (seq_idx, start_frame, end_frame, flip) tuples
+         for i in range(len(poses_2d)):
+             assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0]
+             n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length
+             offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2
+             bounds = np.arange(n_chunks + 1) * chunk_length - offset
+             augment_vector = np.full(len(bounds)-1, False, dtype=bool)
+             pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], augment_vector)
+             if augment:
+                 pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], ~augment_vector)
+
+         # Initialize buffers
+         if cameras is not None:
+             self.batch_cam = np.empty((batch_size, cameras[0].shape[-1]))
+         if poses_3d is not None:
+             self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1]))
+         self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1]))
+
+         self.num_batches = (len(pairs) + batch_size - 1) // batch_size
+         self.batch_size = batch_size
+         self.random = np.random.RandomState(random_seed)
+         self.pairs = pairs
+         self.shuffle = shuffle
+         self.pad = pad
+         self.causal_shift = causal_shift
+         self.endless = endless
+         self.state = None
+
+         self.cameras = cameras
+         self.poses_3d = poses_3d
+         self.poses_2d = poses_2d
+
+         self.augment = augment
+         self.kps_left = kps_left
+         self.kps_right = kps_right
+         self.joints_left = joints_left
+         self.joints_right = joints_right
+
+     def num_frames(self):
+         return self.num_batches * self.batch_size
+
+     def random_state(self):
+         return self.random
+
+     def set_random_state(self, random):
+         self.random = random
+
+     def augment_enabled(self):
+         return self.augment
+
+     def next_pairs(self):
+         if self.state is None:
+             if self.shuffle:
+                 pairs = self.random.permutation(self.pairs)
+             else:
+                 pairs = self.pairs
+             return 0, pairs
+         else:
+             return self.state
+
+     def next_epoch(self):
+         enabled = True
+         while enabled:
+             start_idx, pairs = self.next_pairs()
+             for b_i in range(start_idx, self.num_batches):
+                 chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size]
+                 for i, (seq_i, start_3d, end_3d, flip) in enumerate(chunks):
+                     start_2d = start_3d - self.pad - self.causal_shift
+                     end_2d = end_3d + self.pad - self.causal_shift
+
+                     # 2D poses
+                     seq_2d = self.poses_2d[seq_i]
+                     low_2d = max(start_2d, 0)
+                     high_2d = min(end_2d, seq_2d.shape[0])
+                     pad_left_2d = low_2d - start_2d
+                     pad_right_2d = end_2d - high_2d
+                     if pad_left_2d != 0 or pad_right_2d != 0:
+                         self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), "edge")
+                     else:
+                         self.batch_2d[i] = seq_2d[low_2d:high_2d]
+
+                     if flip:
+                         # Flip 2D keypoints
+                         self.batch_2d[i, :, :, 0] *= -1
+                         self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left]
+
+                     # 3D poses
+                     if self.poses_3d is not None:
+                         seq_3d = self.poses_3d[seq_i]
+                         low_3d = max(start_3d, 0)
+                         high_3d = min(end_3d, seq_3d.shape[0])
+                         pad_left_3d = low_3d - start_3d
+                         pad_right_3d = end_3d - high_3d
+                         if pad_left_3d != 0 or pad_right_3d != 0:
+                             self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), "edge")
+                         else:
+                             self.batch_3d[i] = seq_3d[low_3d:high_3d]
+
+                         if flip:
+                             # Flip 3D joints
+                             self.batch_3d[i, :, :, 0] *= -1
+                             self.batch_3d[i, :, self.joints_left + self.joints_right] = \
+                                 self.batch_3d[i, :, self.joints_right + self.joints_left]
+
+                     # Cameras
+                     if self.cameras is not None:
+                         self.batch_cam[i] = self.cameras[seq_i]
+                         if flip:
+                             # Flip horizontal distortion coefficients
+                             self.batch_cam[i, 2] *= -1
+                             self.batch_cam[i, 7] *= -1
+
+                 if self.endless:
+                     self.state = (b_i + 1, pairs)
+                 if self.poses_3d is None and self.cameras is None:
+                     yield None, None, self.batch_2d[:len(chunks)]
+                 elif self.poses_3d is not None and self.cameras is None:
+                     yield None, self.batch_3d[:len(chunks)], self.batch_2d[:(len(chunks))]
+                 elif self.poses_3d is None:
+                     yield self.batch_cam, None, self.batch_2d[:len(chunks)]
+                 else:
+                     yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)]
+
+             if self.endless:
+                 self.state = None
+             else:
+                 enabled = False
+
+
+ class UnchunkedGenerator:
+     """
+     Non-batched data generator, used for testing.
+     Sequences are returned one at a time (i.e. batch size = 1), without chunking.
+
+     If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2),
+     the second of which is a mirrored version of the first.
+
+     Arguments:
+     cameras -- list of cameras, one element for each video (optional, used for semi-supervised training)
+     poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training)
+     poses_2d -- list of input 2D keypoints, one element for each video
+     pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field)
+     causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad")
+     augment -- augment the dataset by flipping poses horizontally
+     kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled
+     joints_left and joints_right -- list of left/right 3D joints if flipping is enabled
+     """
+
+     def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0,
+                  augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None):
+         assert poses_3d is None or len(poses_3d) == len(poses_2d)
+         assert cameras is None or len(cameras) == len(poses_2d)
+
+         self.augment = augment
+         self.kps_left = kps_left
+         self.kps_right = kps_right
+         self.joints_left = joints_left
+         self.joints_right = joints_right
+
+         self.pad = pad
+         self.causal_shift = causal_shift
+         self.cameras = [] if cameras is None else cameras
+         self.poses_3d = [] if poses_3d is None else poses_3d
+         self.poses_2d = poses_2d
+
+     def num_frames(self):
+         count = 0
+         for p in self.poses_2d:
+             count += p.shape[0]
+         return count
+
+     def augment_enabled(self):
+         return self.augment
+
+     def set_augment(self, augment):
+         self.augment = augment
+
+     def next_epoch(self):
+         for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d):
+             batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0)
+             batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0)
+             batch_2d = np.expand_dims(np.pad(seq_2d,
+                                              ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0),
+                                               (0, 0)),
+                                              'edge'), axis=0)
+             if self.augment:
+                 # Append flipped version
+                 if batch_cam is not None:
+                     batch_cam = np.concatenate((batch_cam, batch_cam), axis=0)
+                     batch_cam[1, 2] *= -1
+                     batch_cam[1, 7] *= -1
+
+                 if batch_3d is not None:
+                     batch_3d = np.concatenate((batch_3d, batch_3d), axis=0)
+                     batch_3d[1, :, :, 0] *= -1
+                     batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :,
+                                                                                     self.joints_right + self.joints_left]
+
+                 batch_2d = np.concatenate((batch_2d, batch_2d), axis=0)
+                 batch_2d[1, :, :, 0] *= -1
+                 batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left]
+
+             yield batch_cam, batch_3d, batch_2d
+
VideoToNPZ/common/graph_utils.py ADDED
@@ -0,0 +1,45 @@
+ from __future__ import absolute_import
+
+ import torch
+ import numpy as np
+ import scipy.sparse as sp
+
+
+ def normalize(mx):
+     """Row-normalize sparse matrix"""
+     rowsum = np.array(mx.sum(1))
+     r_inv = np.power(rowsum, -1).flatten()
+     r_inv[np.isinf(r_inv)] = 0.
+     r_mat_inv = sp.diags(r_inv)
+     mx = r_mat_inv.dot(mx)
+     return mx
+
+
+ def sparse_mx_to_torch_sparse_tensor(sparse_mx):
+     """Convert a scipy sparse matrix to a torch sparse tensor."""
+     sparse_mx = sparse_mx.tocoo().astype(np.float32)
+     indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
+     values = torch.from_numpy(sparse_mx.data)
+     shape = torch.Size(sparse_mx.shape)
+     return torch.sparse.FloatTensor(indices, values, shape)
+
+
+ def adj_mx_from_edges(num_pts, edges, sparse=True):
+     edges = np.array(edges, dtype=np.int32)
+     data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1]
+     adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32)
+
+     # build symmetric adjacency matrix
+     adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx)
+     adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0]))
+     if sparse:
+         adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx)
+     else:
+         adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float)
+     return adj_mx
+
+
+ def adj_mx_from_skeleton(skeleton):
+     num_joints = skeleton.num_joints()
+     edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents())))
+     return adj_mx_from_edges(num_joints, edges, sparse=False)
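As a reference point for the normalization above, a small sketch of `adj_mx_from_edges` on a toy three-joint chain (the joint layout is hypothetical, not taken from the commit):

    from common.graph_utils import adj_mx_from_edges

    # Chain 0-1-2, written as (joint, parent) edges
    adj = adj_mx_from_edges(3, [(1, 0), (2, 1)], sparse=False)
    # After symmetrization, self-loops and row normalization, each row sums to 1;
    # the middle joint gets weight 1/3 for itself and each neighbour.
    print(adj)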
VideoToNPZ/common/loss.py ADDED
@@ -0,0 +1,90 @@
+ import torch
+ import numpy as np
+
+
+ def mpjpe(predicted, target):
+     """
+     Mean per-joint position error (i.e. mean Euclidean distance),
+     often referred to as "Protocol #1" in many papers.
+     """
+     assert predicted.shape == target.shape
+     return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1))
+
+
+ def p_mpjpe(predicted, target):
+     """
+     Pose error: MPJPE after rigid alignment (scale, rotation, and translation),
+     often referred to as "Protocol #2" in many papers.
+     """
+     assert predicted.shape == target.shape
+
+     muX = np.mean(target, axis=1, keepdims=True)
+     muY = np.mean(predicted, axis=1, keepdims=True)
+
+     X0 = target - muX
+     Y0 = predicted - muY
+
+     normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True))
+     normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True))
+
+     X0 /= normX
+     Y0 /= normY
+
+     H = np.matmul(X0.transpose(0, 2, 1), Y0)
+     U, s, Vt = np.linalg.svd(H)
+     V = Vt.transpose(0, 2, 1)
+     R = np.matmul(V, U.transpose(0, 2, 1))
+
+     # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1
+     sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1))
+     V[:, :, -1] *= sign_detR
+     s[:, -1] *= sign_detR.flatten()
+     R = np.matmul(V, U.transpose(0, 2, 1))  # Rotation
+
+     tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2)
+
+     a = tr * normX / normY  # Scale
+     t = muX - a * np.matmul(muY, R)  # Translation
+
+     # Perform rigid transformation on the input
+     predicted_aligned = a * np.matmul(predicted, R) + t
+
+     # Return MPJPE
+     return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1))
+
+
+ def euclidean_losses(actual, target):
+     """Calculate the average Euclidean loss for multi-point samples.
+
+     Each sample must contain `n` points, each with `d` dimensions. For example,
+     in the MPII human pose estimation task n=16 (16 joint locations) and
+     d=2 (locations are 2D).
+
+     Args:
+         actual (Tensor): Predictions (B x L x D)
+         target (Tensor): Ground truth target (B x L x D)
+     """
+
+     assert actual.size() == target.size(), 'input tensors must have the same size'
+
+     # Calculate Euclidean distances between actual and target locations
+     diff = actual - target
+     dist_sq = diff.pow(2).sum(-1, keepdim=False)
+     dist = dist_sq.sqrt()
+     return dist
+
+
+ def pck(actual, expected, threshold=150):
+     dists = euclidean_losses(actual, expected)
+     return (dists < threshold).double().mean().item()
+
+
+ def auc(actual, expected):
+     # This range of thresholds mimics `mpii_compute_3d_pck.m`, which is provided as part of the
+     # MPI-INF-3DHP test data release.
+     thresholds = torch.linspace(0, 150, 31).tolist()
+
+     pck_values = torch.DoubleTensor(len(thresholds))
+     for i, threshold in enumerate(thresholds):
+         pck_values[i] = pck(actual, expected, threshold=threshold)
+     return pck_values.mean().item()
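A minimal, self-contained check of `mpjpe` on dummy tensors (the batch shape and the 0.05 offset are illustrative only):

    import torch
    from common.loss import mpjpe

    pred = torch.zeros(8, 17, 3)       # (batch, joints, xyz)
    target = torch.zeros(8, 17, 3)
    target[..., 0] = 0.05              # shift every joint 0.05 along x

    print(mpjpe(pred, target).item())  # mean Euclidean error -> 0.05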
VideoToNPZ/common/quaternion.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+
+
+ def qort(q, v):
+     """
+     Rotate vector(s) v about the rotation described by quaternion(s) q.
+     Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
+     where * denotes any number of dimensions.
+     Returns a tensor of shape (*, 3).
+     """
+     assert q.shape[-1] == 4
+     assert v.shape[-1] == 3
+     assert q.shape[:-1] == v.shape[:-1]
+
+     qvec = q[..., 1:]
+     uv = torch.cross(qvec, v, dim=len(q.shape)-1)
+     uuv = torch.cross(qvec, uv, dim=len(q.shape)-1)
+     return v + 2 * (q[..., :1] * uv + uuv)
+
+
+ def qinverse(q, inplace=False):
+     # We assume the quaternion to be normalized
+     """
+     The quaternions provided in the code map the camera coordinates to the world coordinates.
+     Therefore, the quaternion from the world coordinates to the camera coordinates is the conjugate of the
+     quaternion from the camera coordinates to the world coordinates. The precondition is that the quaternion
+     is a unit quaternion, so the inverse of the quaternion is equal to its conjugate.
+     """
+     if inplace:
+         q[..., 1:] *= -1
+         return q
+     else:
+         w = q[..., :1]
+         xyz = q[..., 1:]
+         return torch.cat((w, -xyz), dim=len(q.shape)-1)
+
VideoToNPZ/common/skeleton.py ADDED
@@ -0,0 +1,81 @@
+ import numpy as np
+
+
+ class Skeleton:
+     def __init__(self, parents, joints_left, joints_right):
+         assert len(joints_left) == len(joints_right)
+
+         self._parents = parents
+         self._joints_left = joints_left
+         self._joints_right = joints_right
+
+     def num_joints(self):
+         return len(self._parents)
+
+     def parents(self):
+         return self._parents
+
+     def has_children(self):
+         return self._has_children
+
+     def children(self):
+         return self._children
+
+     def remove_joints(self, joints_to_remove):
+         """
+         Remove the joints specified in 'joints_to_remove'.
+         """
+         valid_joints = []
+         for joint in range(len(self._parents)):
+             if joint not in joints_to_remove:
+                 valid_joints.append(joint)
+
+         for i in range(len(self._parents)):
+             while self._parents[i] in joints_to_remove:
+                 self._parents[i] = self._parents[self._parents[i]]
+
+         index_offsets = np.zeros(len(self._parents), dtype=int)
+         new_parents = []
+         for i, parent in enumerate(self._parents):
+             if i not in joints_to_remove:
+                 new_parents.append(parent - index_offsets[parent])
+             else:
+                 index_offsets[i:] += 1
+         self._parents = np.array(new_parents)
+
+         if self._joints_left is not None:
+             new_joints_left = []
+             for joint in self._joints_left:
+                 if joint in valid_joints:
+                     new_joints_left.append(joint - index_offsets[joint])
+             self._joints_left = new_joints_left
+
+         if self._joints_right is not None:
+             new_joints_right = []
+             for joint in self._joints_right:
+                 if joint in valid_joints:
+                     new_joints_right.append(joint - index_offsets[joint])
+             self._joints_right = new_joints_right
+
+         self._compute_metadata()
+
+         return valid_joints
+
+     def joints_left(self):
+         return self._joints_left
+
+     def joints_right(self):
+         return self._joints_right
+
+     def _compute_metadata(self):
+         self._has_children = np.zeros(len(self._parents)).astype(bool)
+         for i, parent in enumerate(self._parents):
+             if parent != -1:
+                 self._has_children[parent] = True
+
+         self._children = []
+         for parents in enumerate(self._parents):
+             self._children.append([])
+         for i, parent in enumerate(self._parents):
+             if parent != -1:
+                 self._children[parent].append(i)
VideoToNPZ/data/data_utils.py ADDED
@@ -0,0 +1,95 @@
+ import numpy as np
+ import h5py
+
+ mpii_metadata = {
+     'layout_name': 'mpii',
+     'num_joints': 16,
+     'keypoints_symmetry': [
+         [3, 4, 5, 13, 14, 15],
+         [0, 1, 2, 10, 11, 12],
+     ]
+ }
+
+ coco_metadata = {
+     'layout_name': 'coco',
+     'num_joints': 17,
+     'keypoints_symmetry': [
+         [1, 3, 5, 7, 9, 11, 13, 15],
+         [2, 4, 6, 8, 10, 12, 14, 16],
+     ]
+ }
+
+ h36m_metadata = {
+     'layout_name': 'h36m',
+     'num_joints': 17,
+     'keypoints_symmetry': [
+         [4, 5, 6, 11, 12, 13],
+         [1, 2, 3, 14, 15, 16],
+     ]
+ }
+
+ humaneva15_metadata = {
+     'layout_name': 'humaneva15',
+     'num_joints': 15,
+     'keypoints_symmetry': [
+         [2, 3, 4, 8, 9, 10],
+         [5, 6, 7, 11, 12, 13]
+     ]
+ }
+
+ humaneva20_metadata = {
+     'layout_name': 'humaneva20',
+     'num_joints': 20,
+     'keypoints_symmetry': [
+         [3, 4, 5, 6, 11, 12, 13, 14],
+         [7, 8, 9, 10, 15, 16, 17, 18]
+     ]
+ }
+
+ def suggest_metadata(name):
+     names = []
+     for metadata in [mpii_metadata, coco_metadata, h36m_metadata, humaneva15_metadata, humaneva20_metadata]:
+         if metadata['layout_name'] in name:
+             return metadata
+         names.append(metadata['layout_name'])
+     raise KeyError('Cannot infer keypoint layout from name "{}". Tried {}.'.format(name, names))
+
+ def import_detectron_poses(path):
+     # Latin1 encoding because Detectron runs on Python 2.7
+     data = np.load(path, encoding='latin1')
+     kp = data['keypoints']
+     bb = data['boxes']
+     results = []
+     for i in range(len(bb)):
+         if len(bb[i][1]) == 0:
+             assert i > 0
+             # Use last pose in case of detection failure
+             results.append(results[-1])
+             continue
+         best_match = np.argmax(bb[i][1][:, 4])
+         keypoints = kp[i][1][best_match].T.copy()
+         results.append(keypoints)
+     results = np.array(results)
+     return results[:, :, 4:6]  # Soft-argmax
+     # return results[:, :, [0, 1, 3]]  # Argmax + score
+
+
+ def import_cpn_poses(path):
+     data = np.load(path)
+     kp = data['keypoints']
+     return kp[:, :, :2]
+
+
+ def import_sh_poses(path):
+     with h5py.File(path) as hf:
+         positions = hf['poses'].value
+     return positions.astype('float32')
+
+ def suggest_pose_importer(name):
+     if 'detectron' in name:
+         return import_detectron_poses
+     if 'cpn' in name:
+         return import_cpn_poses
+     if 'sh' in name:
+         return import_sh_poses
+     raise KeyError('Cannot infer keypoint format from name "{}". Tried detectron, cpn, sh.'.format(name))
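Both `suggest_*` helpers dispatch on substrings of the keypoints name; a short usage sketch, assuming it is run from the VideoToNPZ directory:

    from data.data_utils import suggest_metadata, suggest_pose_importer

    meta = suggest_metadata('cpn_ft_h36m_dbb')            # 'h36m' substring -> 17-joint layout
    importer = suggest_pose_importer('cpn_ft_h36m_dbb')   # 'cpn' substring -> import_cpn_poses
    print(meta['num_joints'], importer.__name__)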
VideoToNPZ/gen_skes.py ADDED
@@ -0,0 +1,116 @@
+ import torch
+ import sys
+ import os.path as osp
+ import os
+ import argparse
+ import cv2
+ import time
+ import h5py
+ from tqdm import tqdm
+ import numpy as np
+ import warnings
+ import signal
+
+ warnings.filterwarnings('ignore')
+
+ sys.path.insert(0, osp.dirname(osp.realpath(__file__)))
+ from tools.utils import get_path
+ from model.gast_net import SpatioTemporalModel, SpatioTemporalModelOptimized1f
+ from common.skeleton import Skeleton
+ from common.graph_utils import adj_mx_from_skeleton
+ from common.generators import *
+ from tools.preprocess import load_kpts_json, h36m_coco_format, revise_kpts, revise_skes
+ from tools.inference import gen_pose
+ from tools.vis_kpts import plot_keypoint
+
+ cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__)
+ model_dir = chk_root + 'gastnet/'
+ sys.path.insert(1, lib_root)
+ from lib.pose import gen_video_kpts as hrnet_pose
+ sys.path.pop(1)
+ sys.path.pop(0)
+
+ skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
+                     joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16])
+ adj = adj_mx_from_skeleton(skeleton)
+
+ joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]
+ kps_left, kps_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]
+
+ # Set up signal handler for keyboard interrupt
+ def signal_handler(sig, frame):
+     print("\nInterrupted by user, shutting down...")
+     if 'pool' in locals() and pool is not None:
+         pool.terminate()
+         pool.join()
+     sys.exit(0)
+
+ signal.signal(signal.SIGINT, signal_handler)
+
+ def load_model_layer():
+     chk = model_dir + '81_frame_model.bin'
+     filters_width = [3, 3, 3, 3]
+     channels = 64
+
+     model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filters_width, channels=channels, dropout=0.05)
+
+     checkpoint = torch.load(chk)
+     model_pos.load_state_dict(checkpoint['model_pos'])
+
+     if torch.cuda.is_available():
+         model_pos = model_pos.cuda()
+     model_pos = model_pos.eval()
+
+     return model_pos
+
+ def generate_skeletons(video=''):
+     cap = cv2.VideoCapture(video)
+     width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
+     height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
+
+     keypoints, scores = hrnet_pose(video, det_dim=416, gen_output=True)
+     keypoints, scores, valid_frames = h36m_coco_format(keypoints, scores)
+     re_kpts = revise_kpts(keypoints, scores, valid_frames)
+     num_person = len(re_kpts)
+
+     model_pos = load_model_layer()
+
+     pad = (81 - 1) // 2
+     causal_shift = 0
+
+     prediction = gen_pose(re_kpts, valid_frames, width, height, model_pos, pad, causal_shift)
+
+     print('Recording 3D Pose:')
+
+     # Add a loading bar
+     for i in tqdm(range(100)):
+         time.sleep(0.01)
+
+     # Create output directory with absolute path
+     output_dir = os.path.abspath('../outputs/')
+     print(f"Creating output directory: {output_dir}")
+     os.makedirs(output_dir, exist_ok=True)
+
+     npz_dir = os.path.join(output_dir, 'npz')
+     print(f"Creating NPZ directory: {npz_dir}")
+     os.makedirs(npz_dir, exist_ok=True)
+
+     output_npz = os.path.join(npz_dir, os.path.basename(video).split('.')[0] + '.npz')
+     print(f"Saving NPZ to: {output_npz}")
+     np.savez_compressed(output_npz, reconstruction=prediction)
+     print(f"NPZ saved successfully: {output_npz}")
+
+ def arg_parse():
+     parser = argparse.ArgumentParser('Generating skeleton demo.')
+     parser.add_argument('-v', '--video', type=str)
+     args = parser.parse_args()
+     return args
+
+ if __name__ == "__main__":
+     args = arg_parse()
+     # Use the video path as-is if absolute, otherwise prepend data_root
+     if os.path.isabs(args.video):
+         video_path = args.video
+     else:
+         video_path = os.path.join(data_root, 'video', args.video)
+     generate_skeletons(video=video_path)
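The script stores the 3D reconstruction under the `reconstruction` key, so the generated file can be read back with a few lines (the path follows the naming in generate_skeletons above; the video name is illustrative):

    import numpy as np

    data = np.load('../outputs/npz/baseball.npz', allow_pickle=True)
    prediction = data['reconstruction']   # per-person 3D poses, typically (frames, 17, 3) each
    print(len(prediction))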
VideoToNPZ/lib/detector/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import sys
+ import os.path as osp
+
+ sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), 'yolov3'))
+ from human_detector import yolo_human_det, load_model
+ sys.path.pop(0)
VideoToNPZ/lib/detector/yolov3/__init__.py ADDED
File without changes
VideoToNPZ/lib/detector/yolov3/bbox.py ADDED
@@ -0,0 +1,111 @@
+ from __future__ import division
+
+ import torch
+ import random
+ import numpy as np
+ import cv2
+
+
+ def confidence_filter(result, confidence):
+     conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2)
+     result = result*conf_mask
+
+     return result
+
+
+ def confidence_filter_cls(result, confidence):
+     max_scores = torch.max(result[:,:,5:25], 2)[0]
+     res = torch.cat((result, max_scores),2)
+     print(res.shape)
+
+
+     cond_1 = (res[:,:,4] > confidence).float()
+     cond_2 = (res[:,:,25] > 0.995).float()
+
+     conf = cond_1 + cond_2
+     conf = torch.clamp(conf, 0.0, 1.0)
+     conf = conf.unsqueeze(2)
+     result = result*conf
+     return result
+
+
+ def get_abs_coord(box):
+     box[2], box[3] = abs(box[2]), abs(box[3])
+     x1 = (box[0] - box[2]/2) - 1
+     y1 = (box[1] - box[3]/2) - 1
+     x2 = (box[0] + box[2]/2) - 1
+     y2 = (box[1] + box[3]/2) - 1
+     return x1, y1, x2, y2
+
+
+ def sanity_fix(box):
+     if (box[0] > box[2]):
+         box[0], box[2] = box[2], box[0]
+
+     if (box[1] > box[3]):
+         box[1], box[3] = box[3], box[1]
+
+     return box
+
+
+ def bbox_iou(box1, box2):
+     """
+     Returns the IoU of two bounding boxes
+
+     """
+     # Get the coordinates of bounding boxes
+     b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+     b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+     # get the coordinates of the intersection rectangle
+     inter_rect_x1 = torch.max(b1_x1, b2_x1)
+     inter_rect_y1 = torch.max(b1_y1, b2_y1)
+     inter_rect_x2 = torch.min(b1_x2, b2_x2)
+     inter_rect_y2 = torch.min(b1_y2, b2_y2)
+
+     # Intersection area
+     if torch.cuda.is_available():
+         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda())
+     else:
+         inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape))
+
+     # Union Area
+     b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
+     b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
+
+     iou = inter_area / (b1_area + b2_area - inter_area)
+
+     return iou
+
+
+ def pred_corner_coord(prediction):
+     # Get indices of non-zero confidence bboxes
+     ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous()
+
+     box = prediction[ind_nz[0], ind_nz[1]]
+
+     box_a = box.new(box.shape)
+     box_a[:,0] = (box[:,0] - box[:,2]/2)
+     box_a[:,1] = (box[:,1] - box[:,3]/2)
+     box_a[:,2] = (box[:,0] + box[:,2]/2)
+     box_a[:,3] = (box[:,1] + box[:,3]/2)
+     box[:,:4] = box_a[:,:4]
+
+     prediction[ind_nz[0], ind_nz[1]] = box
+
+     return prediction
+
+
+ def write(x, batches, results, colors, classes):
+     c1 = tuple(x[1:3].int())
+     c2 = tuple(x[3:5].int())
+     img = results[int(x[0])]
+     cls = int(x[-1])
+     label = "{0}".format(classes[cls])
+     color = random.choice(colors)
+     cv2.rectangle(img, c1, c2, color, 1)
+     t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
+     c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
+     cv2.rectangle(img, c1, c2, color, -1)
+     cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
+     return img
VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg ADDED
@@ -0,0 +1,134 @@
+ [net]
+ batch=64
+ subdivisions=8
+ width=416
+ height=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ max_batches = 40200
+ policy=steps
+ steps=-1,100,20000,30000
+ scales=.1,10,.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=16
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=1
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ ###########
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=125
+ activation=linear
+
+ [region]
+ anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
+ bias_match=1
+ classes=20
+ coords=4
+ num=5
+ softmax=1
+ jitter=.2
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg ADDED
@@ -0,0 +1,258 @@
+ [net]
+ # Testing
+ batch=64
+ subdivisions=8
+ # Training
+ # batch=64
+ # subdivisions=8
+ height=416
+ width=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 80200
+ policy=steps
+ steps=-1,500,40000,60000
+ scales=0.1,10,.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+
+ #######
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [route]
+ layers=-9
+
+ [convolutional]
+ batch_normalize=1
+ size=1
+ stride=1
+ pad=1
+ filters=64
+ activation=leaky
+
+ [reorg]
+ stride=2
+
+ [route]
+ layers=-1,-4
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=125
+ activation=linear
+
+
+ [region]
+ anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
+ bias_match=1
+ classes=20
+ coords=4
+ num=5
+ softmax=1
+ jitter=.3
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg ADDED
@@ -0,0 +1,258 @@
+ [net]
+ # Testing
+ batch=1
+ subdivisions=1
+ # Training
+ # batch=64
+ # subdivisions=8
+ width=416
+ height=416
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 500200
+ policy=steps
+ steps=400000,450000
+ scales=.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [maxpool]
+ size=2
+ stride=2
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=512
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=1024
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+
+ #######
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [route]
+ layers=-9
+
+ [convolutional]
+ batch_normalize=1
+ size=1
+ stride=1
+ pad=1
+ filters=64
+ activation=leaky
+
+ [reorg]
+ stride=2
+
+ [route]
+ layers=-1,-4
+
+ [convolutional]
+ batch_normalize=1
+ size=3
+ stride=1
+ pad=1
+ filters=1024
+ activation=leaky
+
+ [convolutional]
+ size=1
+ stride=1
+ pad=1
+ filters=425
+ activation=linear
+
+
+ [region]
+ anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
+ bias_match=1
+ classes=80
+ coords=4
+ num=5
+ softmax=1
+ jitter=.3
+ rescore=1
+
+ object_scale=5
+ noobject_scale=1
+ class_scale=1
+ coord_scale=1
+
+ absolute=1
+ thresh = .6
+ random=1
VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg ADDED
@@ -0,0 +1,789 @@
+ [net]
+ # Testing
+ batch=1
+ subdivisions=1
+ # Training
+ # batch=64
+ # subdivisions=16
+ width= 320
+ height = 320
+ channels=3
+ momentum=0.9
+ decay=0.0005
+ angle=0
+ saturation = 1.5
+ exposure = 1.5
+ hue=.1
+
+ learning_rate=0.001
+ burn_in=1000
+ max_batches = 500200
+ policy=steps
+ steps=400000,450000
+ scales=.1,.1
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=32
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=64
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ # Downsample
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=2
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
+ [convolutional]
+ batch_normalize=1
+ filters=128
+ size=1
+ stride=1
+ pad=1
+ activation=leaky
+
+ [convolutional]
+ batch_normalize=1
+ filters=256
+ size=3
+ stride=1
+ pad=1
+ activation=leaky
+
+ [shortcut]
+ from=-3
+ activation=linear
+
203
+
204
+ [convolutional]
205
+ batch_normalize=1
206
+ filters=128
207
+ size=1
208
+ stride=1
209
+ pad=1
210
+ activation=leaky
211
+
212
+ [convolutional]
213
+ batch_normalize=1
214
+ filters=256
215
+ size=3
216
+ stride=1
217
+ pad=1
218
+ activation=leaky
219
+
220
+ [shortcut]
221
+ from=-3
222
+ activation=linear
223
+
224
+ [convolutional]
225
+ batch_normalize=1
226
+ filters=128
227
+ size=1
228
+ stride=1
229
+ pad=1
230
+ activation=leaky
231
+
232
+ [convolutional]
233
+ batch_normalize=1
234
+ filters=256
235
+ size=3
236
+ stride=1
237
+ pad=1
238
+ activation=leaky
239
+
240
+ [shortcut]
241
+ from=-3
242
+ activation=linear
243
+
244
+ [convolutional]
245
+ batch_normalize=1
246
+ filters=128
247
+ size=1
248
+ stride=1
249
+ pad=1
250
+ activation=leaky
251
+
252
+ [convolutional]
253
+ batch_normalize=1
254
+ filters=256
255
+ size=3
256
+ stride=1
257
+ pad=1
258
+ activation=leaky
259
+
260
+ [shortcut]
261
+ from=-3
262
+ activation=linear
263
+
264
+ [convolutional]
265
+ batch_normalize=1
266
+ filters=128
267
+ size=1
268
+ stride=1
269
+ pad=1
270
+ activation=leaky
271
+
272
+ [convolutional]
273
+ batch_normalize=1
274
+ filters=256
275
+ size=3
276
+ stride=1
277
+ pad=1
278
+ activation=leaky
279
+
280
+ [shortcut]
281
+ from=-3
282
+ activation=linear
283
+
284
+ # Downsample
285
+
286
+ [convolutional]
287
+ batch_normalize=1
288
+ filters=512
289
+ size=3
290
+ stride=2
291
+ pad=1
292
+ activation=leaky
293
+
294
+ [convolutional]
295
+ batch_normalize=1
296
+ filters=256
297
+ size=1
298
+ stride=1
299
+ pad=1
300
+ activation=leaky
301
+
302
+ [convolutional]
303
+ batch_normalize=1
304
+ filters=512
305
+ size=3
306
+ stride=1
307
+ pad=1
308
+ activation=leaky
309
+
310
+ [shortcut]
311
+ from=-3
312
+ activation=linear
313
+
314
+
315
+ [convolutional]
316
+ batch_normalize=1
317
+ filters=256
318
+ size=1
319
+ stride=1
320
+ pad=1
321
+ activation=leaky
322
+
323
+ [convolutional]
324
+ batch_normalize=1
325
+ filters=512
326
+ size=3
327
+ stride=1
328
+ pad=1
329
+ activation=leaky
330
+
331
+ [shortcut]
332
+ from=-3
333
+ activation=linear
334
+
335
+
336
+ [convolutional]
337
+ batch_normalize=1
338
+ filters=256
339
+ size=1
340
+ stride=1
341
+ pad=1
342
+ activation=leaky
343
+
344
+ [convolutional]
345
+ batch_normalize=1
346
+ filters=512
347
+ size=3
348
+ stride=1
349
+ pad=1
350
+ activation=leaky
351
+
352
+ [shortcut]
353
+ from=-3
354
+ activation=linear
355
+
356
+
357
+ [convolutional]
358
+ batch_normalize=1
359
+ filters=256
360
+ size=1
361
+ stride=1
362
+ pad=1
363
+ activation=leaky
364
+
365
+ [convolutional]
366
+ batch_normalize=1
367
+ filters=512
368
+ size=3
369
+ stride=1
370
+ pad=1
371
+ activation=leaky
372
+
373
+ [shortcut]
374
+ from=-3
375
+ activation=linear
376
+
377
+ [convolutional]
378
+ batch_normalize=1
379
+ filters=256
380
+ size=1
381
+ stride=1
382
+ pad=1
383
+ activation=leaky
384
+
385
+ [convolutional]
386
+ batch_normalize=1
387
+ filters=512
388
+ size=3
389
+ stride=1
390
+ pad=1
391
+ activation=leaky
392
+
393
+ [shortcut]
394
+ from=-3
395
+ activation=linear
396
+
397
+
398
+ [convolutional]
399
+ batch_normalize=1
400
+ filters=256
401
+ size=1
402
+ stride=1
403
+ pad=1
404
+ activation=leaky
405
+
406
+ [convolutional]
407
+ batch_normalize=1
408
+ filters=512
409
+ size=3
410
+ stride=1
411
+ pad=1
412
+ activation=leaky
413
+
414
+ [shortcut]
415
+ from=-3
416
+ activation=linear
417
+
418
+
419
+ [convolutional]
420
+ batch_normalize=1
421
+ filters=256
422
+ size=1
423
+ stride=1
424
+ pad=1
425
+ activation=leaky
426
+
427
+ [convolutional]
428
+ batch_normalize=1
429
+ filters=512
430
+ size=3
431
+ stride=1
432
+ pad=1
433
+ activation=leaky
434
+
435
+ [shortcut]
436
+ from=-3
437
+ activation=linear
438
+
439
+ [convolutional]
440
+ batch_normalize=1
441
+ filters=256
442
+ size=1
443
+ stride=1
444
+ pad=1
445
+ activation=leaky
446
+
447
+ [convolutional]
448
+ batch_normalize=1
449
+ filters=512
450
+ size=3
451
+ stride=1
452
+ pad=1
453
+ activation=leaky
454
+
455
+ [shortcut]
456
+ from=-3
457
+ activation=linear
458
+
459
+ # Downsample
460
+
461
+ [convolutional]
462
+ batch_normalize=1
463
+ filters=1024
464
+ size=3
465
+ stride=2
466
+ pad=1
467
+ activation=leaky
468
+
469
+ [convolutional]
470
+ batch_normalize=1
471
+ filters=512
472
+ size=1
473
+ stride=1
474
+ pad=1
475
+ activation=leaky
476
+
477
+ [convolutional]
478
+ batch_normalize=1
479
+ filters=1024
480
+ size=3
481
+ stride=1
482
+ pad=1
483
+ activation=leaky
484
+
485
+ [shortcut]
486
+ from=-3
487
+ activation=linear
488
+
489
+ [convolutional]
490
+ batch_normalize=1
491
+ filters=512
492
+ size=1
493
+ stride=1
494
+ pad=1
495
+ activation=leaky
496
+
497
+ [convolutional]
498
+ batch_normalize=1
499
+ filters=1024
500
+ size=3
501
+ stride=1
502
+ pad=1
503
+ activation=leaky
504
+
505
+ [shortcut]
506
+ from=-3
507
+ activation=linear
508
+
509
+ [convolutional]
510
+ batch_normalize=1
511
+ filters=512
512
+ size=1
513
+ stride=1
514
+ pad=1
515
+ activation=leaky
516
+
517
+ [convolutional]
518
+ batch_normalize=1
519
+ filters=1024
520
+ size=3
521
+ stride=1
522
+ pad=1
523
+ activation=leaky
524
+
525
+ [shortcut]
526
+ from=-3
527
+ activation=linear
528
+
529
+ [convolutional]
530
+ batch_normalize=1
531
+ filters=512
532
+ size=1
533
+ stride=1
534
+ pad=1
535
+ activation=leaky
536
+
537
+ [convolutional]
538
+ batch_normalize=1
539
+ filters=1024
540
+ size=3
541
+ stride=1
542
+ pad=1
543
+ activation=leaky
544
+
545
+ [shortcut]
546
+ from=-3
547
+ activation=linear
548
+
549
+ ######################
550
+
551
+ [convolutional]
552
+ batch_normalize=1
553
+ filters=512
554
+ size=1
555
+ stride=1
556
+ pad=1
557
+ activation=leaky
558
+
559
+ [convolutional]
560
+ batch_normalize=1
561
+ size=3
562
+ stride=1
563
+ pad=1
564
+ filters=1024
565
+ activation=leaky
566
+
567
+ [convolutional]
568
+ batch_normalize=1
569
+ filters=512
570
+ size=1
571
+ stride=1
572
+ pad=1
573
+ activation=leaky
574
+
575
+ [convolutional]
576
+ batch_normalize=1
577
+ size=3
578
+ stride=1
579
+ pad=1
580
+ filters=1024
581
+ activation=leaky
582
+
583
+ [convolutional]
584
+ batch_normalize=1
585
+ filters=512
586
+ size=1
587
+ stride=1
588
+ pad=1
589
+ activation=leaky
590
+
591
+ [convolutional]
592
+ batch_normalize=1
593
+ size=3
594
+ stride=1
595
+ pad=1
596
+ filters=1024
597
+ activation=leaky
598
+
599
+ [convolutional]
600
+ size=1
601
+ stride=1
602
+ pad=1
603
+ filters=255
604
+ activation=linear
605
+
606
+
607
+ [yolo]
608
+ mask = 6,7,8
609
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610
+ classes=80
611
+ num=9
612
+ jitter=.3
613
+ ignore_thresh = .5
614
+ truth_thresh = 1
615
+ random=1
616
+
617
+
618
+ [route]
619
+ layers = -4
620
+
621
+ [convolutional]
622
+ batch_normalize=1
623
+ filters=256
624
+ size=1
625
+ stride=1
626
+ pad=1
627
+ activation=leaky
628
+
629
+ [upsample]
630
+ stride=2
631
+
632
+ [route]
633
+ layers = -1, 61
634
+
635
+
636
+
637
+ [convolutional]
638
+ batch_normalize=1
639
+ filters=256
640
+ size=1
641
+ stride=1
642
+ pad=1
643
+ activation=leaky
644
+
645
+ [convolutional]
646
+ batch_normalize=1
647
+ size=3
648
+ stride=1
649
+ pad=1
650
+ filters=512
651
+ activation=leaky
652
+
653
+ [convolutional]
654
+ batch_normalize=1
655
+ filters=256
656
+ size=1
657
+ stride=1
658
+ pad=1
659
+ activation=leaky
660
+
661
+ [convolutional]
662
+ batch_normalize=1
663
+ size=3
664
+ stride=1
665
+ pad=1
666
+ filters=512
667
+ activation=leaky
668
+
669
+ [convolutional]
670
+ batch_normalize=1
671
+ filters=256
672
+ size=1
673
+ stride=1
674
+ pad=1
675
+ activation=leaky
676
+
677
+ [convolutional]
678
+ batch_normalize=1
679
+ size=3
680
+ stride=1
681
+ pad=1
682
+ filters=512
683
+ activation=leaky
684
+
685
+ [convolutional]
686
+ size=1
687
+ stride=1
688
+ pad=1
689
+ filters=255
690
+ activation=linear
691
+
692
+
693
+ [yolo]
694
+ mask = 3,4,5
695
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696
+ classes=80
697
+ num=9
698
+ jitter=.3
699
+ ignore_thresh = .5
700
+ truth_thresh = 1
701
+ random=1
702
+
703
+
704
+
705
+ [route]
706
+ layers = -4
707
+
708
+ [convolutional]
709
+ batch_normalize=1
710
+ filters=128
711
+ size=1
712
+ stride=1
713
+ pad=1
714
+ activation=leaky
715
+
716
+ [upsample]
717
+ stride=2
718
+
719
+ [route]
720
+ layers = -1, 36
721
+
722
+
723
+
724
+ [convolutional]
725
+ batch_normalize=1
726
+ filters=128
727
+ size=1
728
+ stride=1
729
+ pad=1
730
+ activation=leaky
731
+
732
+ [convolutional]
733
+ batch_normalize=1
734
+ size=3
735
+ stride=1
736
+ pad=1
737
+ filters=256
738
+ activation=leaky
739
+
740
+ [convolutional]
741
+ batch_normalize=1
742
+ filters=128
743
+ size=1
744
+ stride=1
745
+ pad=1
746
+ activation=leaky
747
+
748
+ [convolutional]
749
+ batch_normalize=1
750
+ size=3
751
+ stride=1
752
+ pad=1
753
+ filters=256
754
+ activation=leaky
755
+
756
+ [convolutional]
757
+ batch_normalize=1
758
+ filters=128
759
+ size=1
760
+ stride=1
761
+ pad=1
762
+ activation=leaky
763
+
764
+ [convolutional]
765
+ batch_normalize=1
766
+ size=3
767
+ stride=1
768
+ pad=1
769
+ filters=256
770
+ activation=leaky
771
+
772
+ [convolutional]
773
+ size=1
774
+ stride=1
775
+ pad=1
776
+ filters=255
777
+ activation=linear
778
+
779
+
780
+ [yolo]
781
+ mask = 0,1,2
782
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783
+ classes=80
784
+ num=9
785
+ jitter=.3
786
+ ignore_thresh = .5
787
+ truth_thresh = 1
788
+ random=1
789
+
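Note: this cfg is the file parsed by parse_cfg() in darknet.py (added below). A minimal sanity-check sketch, assuming darknet.py is importable and with an illustrative path, that counts the block types after parsing:

# Hedged sanity-check sketch: parse the cfg above and count block types.
# Assumes darknet.py (added below in this commit) is importable; adjust the path to your checkout.
from collections import Counter
from darknet import parse_cfg

blocks = parse_cfg("VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg")
print(Counter(b["type"] for b in blocks))
# For the standard YOLOv3 layout this is roughly 75 convolutional, 23 shortcut,
# 4 route, 2 upsample and 3 yolo blocks, plus the single [net] block.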
VideoToNPZ/lib/detector/yolov3/darknet.py ADDED
@@ -0,0 +1,433 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ import cv2
8
+ import os
9
+ import sys
10
+
11
+ from util import convert2cpu as cpu
12
+ from util import predict_transform
13
+
14
+
15
+ class test_net(nn.Module):
16
+ def __init__(self, num_layers, input_size):
17
+ super(test_net, self).__init__()
18
+ self.num_layers= num_layers
19
+ self.linear_1 = nn.Linear(input_size, 5)
20
+ self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)])
21
+ self.output = nn.Linear(5,2)
22
+
23
+ def forward(self, x):
24
+ x = x.view(-1)
25
+ fwd = nn.Sequential(self.linear_1, *self.middle, self.output)
26
+ return fwd(x)
27
+
28
+
29
+ def get_test_input():
30
+ img = cv2.imread("dog-cycle-car.png")
31
+ img = cv2.resize(img, (416, 416))
32
+ img_ = img[:, :, ::-1].transpose((2, 0, 1))
33
+ img_ = img_[np.newaxis, :, :, :]/255.0
34
+ img_ = torch.from_numpy(img_).float()
35
+ return img_
36
+
37
+
38
+ def parse_cfg(cfgfile):
39
+ """
40
+ Takes a configuration file
41
+
42
+ Returns a list of blocks. Each blocks describes a block in the neural
43
+ network to be built. Block is represented as a dictionary in the list
44
+
45
+ """
46
+ # cfgfile = os.path.join(sys.path[-1], cfgfile)
47
+ file = open(cfgfile, 'r')
48
+ lines = file.read().split('\n') # store the lines in a list
49
+ lines = [x for x in lines if len(x) > 0] # get rid of the empty lines
50
+ lines = [x for x in lines if x[0] != '#']
51
+ lines = [x.rstrip().lstrip() for x in lines]
52
+
53
+ block = {}
54
+ blocks = []
55
+
56
+ for line in lines:
57
+ if line[0] == "[": # This marks the start of a new block
58
+ if len(block) != 0:
59
+ blocks.append(block)
60
+ block = {}
61
+ block["type"] = line[1:-1].rstrip()
62
+ else:
63
+ key,value = line.split("=")
64
+ block[key.rstrip()] = value.lstrip()
65
+ blocks.append(block)
66
+
67
+ return blocks
68
+
69
+
70
+ class MaxPoolStride1(nn.Module):
71
+ def __init__(self, kernel_size):
72
+ super(MaxPoolStride1, self).__init__()
73
+ self.kernel_size = kernel_size
74
+ self.pad = kernel_size - 1
75
+
76
+ def forward(self, x):
77
+ padded_x = F.pad(x, (0, self.pad, 0, self.pad), mode="replicate")
78
+ pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x)
79
+ return pooled_x
80
+
81
+
82
+ class EmptyLayer(nn.Module):
83
+ def __init__(self):
84
+ super(EmptyLayer, self).__init__()
85
+
86
+
87
+ class DetectionLayer(nn.Module):
88
+ def __init__(self, anchors):
89
+ super(DetectionLayer, self).__init__()
90
+ self.anchors = anchors
91
+
92
+ def forward(self, x, inp_dim, num_classes, confidence):
93
+ x = x.data
94
+ global CUDA
95
+ prediction = x
96
+ prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA)
97
+ return prediction
98
+
99
+
100
+ class Upsample(nn.Module):
101
+ def __init__(self, stride=2):
102
+ super(Upsample, self).__init__()
103
+ self.stride = stride
104
+
105
+ def forward(self, x):
106
+ stride = self.stride
107
+ assert(x.data.dim() == 4)
108
+ B = x.data.size(0)
109
+ C = x.data.size(1)
110
+ H = x.data.size(2)
111
+ W = x.data.size(3)
112
+ ws = stride
113
+ hs = stride
114
+ x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride)
115
+ return x
116
+
117
+
118
+ class ReOrgLayer(nn.Module):
119
+ def __init__(self, stride=2):
120
+ super(ReOrgLayer, self).__init__()
121
+ self.stride= stride
122
+
123
+ def forward(self, x):
124
+ assert(x.data.dim() == 4)
125
+ B, C, H, W = x.data.shape
126
+ hs = self.stride
127
+ ws = self.stride
128
+ assert(H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H)
129
+ assert(W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of width " + str(W)
130
+ x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(-2, -3).contiguous()
131
+ x = x.view(B, C, H // hs * W // ws, hs, ws)
132
+ x = x.view(B, C, H // hs * W // ws, hs*ws).transpose(-1, -2).contiguous()
133
+ x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1, 2).contiguous()
134
+ x = x.view(B, C*ws*hs, H // ws, W // ws)
135
+ return x
136
+
137
+
138
+ def create_modules(blocks):
139
+ net_info = blocks[0] # Captures the information about the input and pre-processing
140
+
141
+ module_list = nn.ModuleList()
142
+
143
+ index = 0 # indexing blocks helps with implementing route layers (skip connections)
144
+ prev_filters = 3
145
+ output_filters = []
146
+
147
+ for x in blocks:
148
+ module = nn.Sequential()
149
+ if x["type"] == "net":
150
+ continue
151
+
152
+ # If it's a convolutional layer
153
+ if x["type"] == "convolutional":
154
+ # Get the info about the layer
155
+ activation = x["activation"]
156
+ try:
157
+ batch_normalize = int(x["batch_normalize"])
158
+ bias = False
159
+ except:
160
+ batch_normalize = 0
161
+ bias = True
162
+
163
+ filters= int(x["filters"])
164
+ padding = int(x["pad"])
165
+ kernel_size = int(x["size"])
166
+ stride = int(x["stride"])
167
+
168
+ if padding:
169
+ pad = (kernel_size - 1) // 2
170
+ else:
171
+ pad = 0
172
+
173
+ # Add the convolutional layer
174
+ conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
175
+ module.add_module("conv_{0}".format(index), conv)
176
+
177
+ # Add the Batch Norm Layer
178
+ if batch_normalize:
179
+ bn = nn.BatchNorm2d(filters)
180
+ module.add_module("batch_norm_{0}".format(index), bn)
181
+
182
+ # Check the activation.
183
+ # It is either Linear or a Leaky ReLU for YOLO
184
+ if activation == "leaky":
185
+ activn = nn.LeakyReLU(0.1, inplace = True)
186
+ module.add_module("leaky_{0}".format(index), activn)
187
+
188
+ # If it's an upsampling layer
189
+ # We use nearest-neighbour upsampling (nn.Upsample with mode="nearest")
190
+
191
+ elif x["type"] == "upsample":
192
+ stride = int(x["stride"])
193
+ # upsample = Upsample(stride)
194
+ upsample = nn.Upsample(scale_factor=2, mode="nearest")
195
+ module.add_module("upsample_{}".format(index), upsample)
196
+
197
+ # If it is a route layer
198
+ elif (x["type"] == "route"):
199
+ x["layers"] = x["layers"].split(',')
200
+
201
+ # Start of a route
202
+ start = int(x["layers"][0])
203
+
204
+ # end, if there exists one.
205
+ try:
206
+ end = int(x["layers"][1])
207
+ except:
208
+ end = 0
209
+
210
+ # Positive annotation
211
+ if start > 0:
212
+ start = start - index
213
+
214
+ if end > 0:
215
+ end = end - index
216
+
217
+ route = EmptyLayer()
218
+ module.add_module("route_{0}".format(index), route)
219
+
220
+ if end < 0:
221
+ filters = output_filters[index + start] + output_filters[index + end]
222
+ else:
223
+ filters = output_filters[index + start]
224
+
225
+ # shortcut corresponds to skip connection
226
+ elif x["type"] == "shortcut":
227
+ from_ = int(x["from"])
228
+ shortcut = EmptyLayer()
229
+ module.add_module("shortcut_{}".format(index), shortcut)
230
+
231
+ elif x["type"] == "maxpool":
232
+ stride = int(x["stride"])
233
+ size = int(x["size"])
234
+ if stride != 1:
235
+ maxpool = nn.MaxPool2d(size, stride)
236
+ else:
237
+ maxpool = MaxPoolStride1(size)
238
+
239
+ module.add_module("maxpool_{}".format(index), maxpool)
240
+
241
+ # Yolo is the detection layer
242
+ elif x["type"] == "yolo":
243
+ mask = x["mask"].split(",")
244
+ mask = [int(x) for x in mask]
245
+
246
+ anchors = x["anchors"].split(",")
247
+ anchors = [int(a) for a in anchors]
248
+ anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)]
249
+ anchors = [anchors[i] for i in mask]
250
+
251
+ detection = DetectionLayer(anchors)
252
+ module.add_module("Detection_{}".format(index), detection)
253
+
254
+ else:
255
+ print("Unrecognised block type: {0}".format(x["type"]))
256
+ assert False
257
+
258
+ module_list.append(module)
259
+ prev_filters = filters
260
+ output_filters.append(filters)
261
+ index += 1
262
+
263
+ return (net_info, module_list)
264
+
265
+
266
+ class Darknet(nn.Module):
267
+ def __init__(self, cfgfile):
268
+ super(Darknet, self).__init__()
269
+ self.blocks = parse_cfg(cfgfile)
270
+ self.net_info, self.module_list = create_modules(self.blocks)
271
+ self.header = torch.IntTensor([0, 0, 0, 0])
272
+ self.seen = 0
273
+
274
+ def get_blocks(self):
275
+ return self.blocks
276
+
277
+ def get_module_list(self):
278
+ return self.module_list
279
+
280
+ def forward(self, x, CUDA):
281
+ detections = []
282
+ modules = self.blocks[1:]
283
+ outputs = {} # We cache the outputs for the route layer
284
+
285
+ write = 0
286
+ for i in range(len(modules)):
287
+
288
+ module_type = (modules[i]["type"])
289
+ if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool":
290
+
291
+ x = self.module_list[i](x)
292
+ outputs[i] = x
293
+
294
+ elif module_type == "route":
295
+ layers = modules[i]["layers"]
296
+ layers = [int(a) for a in layers]
297
+
298
+ if (layers[0]) > 0:
299
+ layers[0] = layers[0] - i
300
+
301
+ if len(layers) == 1:
302
+ x = outputs[i + (layers[0])]
303
+
304
+ else:
305
+ if (layers[1]) > 0:
306
+ layers[1] = layers[1] - i
307
+
308
+ map1 = outputs[i + layers[0]]
309
+ map2 = outputs[i + layers[1]]
310
+
311
+ x = torch.cat((map1, map2), 1)
312
+ outputs[i] = x
313
+
314
+ elif module_type == "shortcut":
315
+ from_ = int(modules[i]["from"])
316
+ x = outputs[i-1] + outputs[i+from_]
317
+ outputs[i] = x
318
+
319
+ elif module_type == 'yolo':
320
+
321
+ anchors = self.module_list[i][0].anchors
322
+ # Get the input dimensions
323
+ inp_dim = int(self.net_info["height"])
324
+
325
+ # Get the number of classes
326
+ num_classes = int(modules[i]["classes"])
327
+
328
+ # Output the result
329
+ x = x.data
330
+ x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
331
+
332
+ if type(x) == int:
333
+ continue
334
+
335
+ if not write:
336
+ detections = x
337
+ write = 1
338
+ else:
339
+ detections = torch.cat((detections, x), 1)
340
+
341
+ outputs[i] = outputs[i-1]
342
+
343
+ try:
344
+ return detections
345
+ except:
346
+ return 0
347
+
348
+ def load_weights(self, weightfile):
349
+ # Introduction: https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/
350
+ # Open the weights file
351
+ # weightfile = os.path.join(sys.path[-1], weightfile)
352
+ fp = open(weightfile, "rb")
353
+
354
+ # The first 5 values are header information
355
+ # 1. Major version number
356
+ # 2. Minor Version Number
357
+ # 3. Subversion number
358
+ # 4, 5. Images seen by the network (during training)
359
+ header = np.fromfile(fp, dtype = np.int32, count = 5)
360
+ self.header = torch.from_numpy(header)
361
+ self.seen = self.header[3]
362
+
363
+ # The rest of the values are the weights
364
+ # Let's load them up
365
+ weights = np.fromfile(fp, dtype = np.float32)
366
+
367
+ ptr = 0
368
+ for i in range(len(self.module_list)):
369
+ module_type = self.blocks[i + 1]["type"]
370
+
371
+ if module_type == "convolutional":
372
+ model = self.module_list[i]
373
+ try:
374
+ batch_normalize = int(self.blocks[i+1]["batch_normalize"])
375
+ except:
376
+ batch_normalize = 0
377
+
378
+ conv = model[0]
379
+
380
+ if (batch_normalize):
381
+ bn = model[1]
382
+
383
+ # Get the number of weights of Batch Norm Layer
384
+ num_bn_biases = bn.bias.numel()
385
+
386
+ # Load the weights
387
+ bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
388
+ ptr += num_bn_biases
389
+
390
+ bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
391
+ ptr += num_bn_biases
392
+
393
+ bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
394
+ ptr += num_bn_biases
395
+
396
+ bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
397
+ ptr += num_bn_biases
398
+
399
+ # Cast the loaded weights into dims of model weights.
400
+ bn_biases = bn_biases.view_as(bn.bias.data)
401
+ bn_weights = bn_weights.view_as(bn.weight.data)
402
+ bn_running_mean = bn_running_mean.view_as(bn.running_mean)
403
+ bn_running_var = bn_running_var.view_as(bn.running_var)
404
+
405
+ # Copy the data to model
406
+ bn.bias.data.copy_(bn_biases)
407
+ bn.weight.data.copy_(bn_weights)
408
+ bn.running_mean.copy_(bn_running_mean)
409
+ bn.running_var.copy_(bn_running_var)
410
+
411
+ else:
412
+ # Number of biases
413
+ num_biases = conv.bias.numel()
414
+
415
+ # Load the weights
416
+ conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
417
+ ptr = ptr + num_biases
418
+
419
+ # reshape the loaded weights according to the dims of the model weights
420
+ conv_biases = conv_biases.view_as(conv.bias.data)
421
+
422
+ # Finally copy the data
423
+ conv.bias.data.copy_(conv_biases)
424
+
425
+ # Let us load the weights for the Convolutional layers
426
+ num_weights = conv.weight.numel()
427
+
428
+ # Do the same as above for weights
429
+ conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
430
+ ptr = ptr + num_weights
431
+
432
+ conv_weights = conv_weights.view_as(conv.weight.data)
433
+ conv.weight.data.copy_(conv_weights)
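A minimal usage sketch for the Darknet class above; the cfg/weights paths and the 416 input size are assumptions for illustration, not the project's canonical entry point (load_model() in human_detector.py is what the pipeline actually uses).

# Hedged sketch: build the model, load weights, and run one dummy forward pass on CPU.
import torch
from darknet import Darknet

model = Darknet("cfg/yolov3.cfg")                                 # path is an assumption
model.load_weights("../../../checkpoint/yolov3/yolov3.weights")   # path is an assumption
model.net_info["height"] = 416                                    # must match the input size (multiple of 32)
model.eval()

dummy = torch.zeros(1, 3, 416, 416)                               # BCHW, values in [0, 1]
with torch.no_grad():
    detections = model(dummy, False)                              # CUDA=False -> stay on CPU
print(detections.shape)                                           # (1, 10647, 85) for a 416x416 input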
VideoToNPZ/lib/detector/yolov3/data/coco.names ADDED
@@ -0,0 +1,80 @@
1
+ person
2
+ bicycle
3
+ car
4
+ motorbike
5
+ aeroplane
6
+ bus
7
+ train
8
+ truck
9
+ boat
10
+ traffic light
11
+ fire hydrant
12
+ stop sign
13
+ parking meter
14
+ bench
15
+ bird
16
+ cat
17
+ dog
18
+ horse
19
+ sheep
20
+ cow
21
+ elephant
22
+ bear
23
+ zebra
24
+ giraffe
25
+ backpack
26
+ umbrella
27
+ handbag
28
+ tie
29
+ suitcase
30
+ frisbee
31
+ skis
32
+ snowboard
33
+ sports ball
34
+ kite
35
+ baseball bat
36
+ baseball glove
37
+ skateboard
38
+ surfboard
39
+ tennis racket
40
+ bottle
41
+ wine glass
42
+ cup
43
+ fork
44
+ knife
45
+ spoon
46
+ bowl
47
+ banana
48
+ apple
49
+ sandwich
50
+ orange
51
+ broccoli
52
+ carrot
53
+ hot dog
54
+ pizza
55
+ donut
56
+ cake
57
+ chair
58
+ sofa
59
+ pottedplant
60
+ bed
61
+ diningtable
62
+ toilet
63
+ tvmonitor
64
+ laptop
65
+ mouse
66
+ remote
67
+ keyboard
68
+ cell phone
69
+ microwave
70
+ oven
71
+ toaster
72
+ sink
73
+ refrigerator
74
+ book
75
+ clock
76
+ vase
77
+ scissors
78
+ teddy bear
79
+ hair drier
80
+ toothbrush
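The detector in this commit keeps only class 0 ('person') from this list. A small sketch, with an illustrative path, of loading the names via load_classes() from util.py (added later in this commit):

# Hedged sketch: read the class list above with load_classes from util.py.
from util import load_classes

classes = load_classes("data/coco.names")   # path is an assumption
print(len(classes), classes[0])             # 80 person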
VideoToNPZ/lib/detector/yolov3/data/pallete ADDED
Binary file (908 Bytes).
VideoToNPZ/lib/detector/yolov3/data/voc.names ADDED
@@ -0,0 +1,20 @@
1
+ aeroplane
2
+ bicycle
3
+ bird
4
+ boat
5
+ bottle
6
+ bus
7
+ car
8
+ cat
9
+ chair
10
+ cow
11
+ diningtable
12
+ dog
13
+ horse
14
+ motorbike
15
+ person
16
+ pottedplant
17
+ sheep
18
+ sofa
19
+ train
20
+ tvmonitor
VideoToNPZ/lib/detector/yolov3/human_detector.py ADDED
@@ -0,0 +1,155 @@
1
+ from __future__ import division
2
+ import time
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ import os
7
+ import sys
8
+ import random
9
+ import pickle as pkl
10
+ import argparse
11
+
12
+ from util import *
13
+ from darknet import Darknet
14
+ from preprocess import letterbox_image
15
+ import preprocess
16
+
17
+
18
+ cur_dir = os.path.dirname(os.path.realpath(__file__))
19
+ project_root = os.path.join(cur_dir, '../../../')
20
+ chk_root = os.path.join(project_root, 'checkpoint/')
21
+ data_root = os.path.join(project_root, 'data/')
22
+
23
+
24
+ sys.path.insert(0, project_root)
25
+ sys.path.pop(0)
26
+
27
+
28
+ def prep_image(img, inp_dim):
29
+ """
30
+ Prepare image for inputting to the neural network.
31
+
32
+ Returns a Variable
33
+ """
34
+ ori_img = img
35
+ dim = ori_img.shape[1], ori_img.shape[0]
36
+ img = cv2.resize(ori_img, (inp_dim, inp_dim))
37
+ img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
38
+ img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
39
+ return img_, ori_img, dim
40
+
41
+
42
+ def write(x, img, colors):
43
+ x = [int(i) for i in x]
44
+ c1 = tuple(x[0:2])
45
+ c2 = tuple(x[2:4])
46
+
47
+ label = 'People {}'.format(0)
48
+ color = (0, 0, 255)
49
+ cv2.rectangle(img, c1, c2, color, 2)
50
+ t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
51
+ c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
52
+ cv2.rectangle(img, c1, c2, color, -1)
53
+ cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
54
+ return img
55
+
56
+
57
+ def arg_parse():
58
+ """
59
+ Parse arguments to the detect module
60
+
61
+ """
62
+ parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo')
63
+ parser.add_argument('--confidence', dest='confidence', type=float, default=0.70,
64
+ help='Object Confidence to filter predictions')
65
+ parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold')
66
+ parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. '
67
+ 'Increase to increase accuracy. Decrease to increase speed. (160, 416)')
68
+ parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights', help='The path'
69
+ ' of the model weight file')
70
+ parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='The path of the model cfg file')
71
+ parser.add_argument('-a', '--animation', action='store_true', help='output animation')
72
+ parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path')
73
+ parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png',
74
+ help='The input image path')
75
+ parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. [1, 2]')
76
+ return parser.parse_args()
77
+
78
+
79
+ def load_model(args=None, CUDA=None, inp_dim=416):
80
+ if args is None:
81
+ args = arg_parse()
82
+
83
+ if CUDA is None:
84
+ CUDA = torch.cuda.is_available()
85
+
86
+ # Set up the neural network
87
+ model = Darknet(args.cfg_file)
88
+ model.load_weights(args.weight_file)
89
+
90
+ model.net_info["height"] = inp_dim
91
+ assert inp_dim % 32 == 0
92
+ assert inp_dim > 32
93
+
94
+ # If there's a GPU available, put the model on GPU
95
+ if CUDA:
96
+ model.cuda()
97
+
98
+ # Set the model in evaluation mode
99
+ model.eval()
100
+
101
+ return model
102
+
103
+
104
+ def yolo_human_det(img, model=None, reso=416, confidence=0.70):
105
+ args = arg_parse()
106
+ # args.reso = reso
107
+ inp_dim = reso
108
+ num_classes = 80
109
+
110
+ CUDA = torch.cuda.is_available()
111
+ if model is None:
112
+ model = load_model(args, CUDA, inp_dim)
113
+
114
+ if type(img) == str:
115
+ assert os.path.isfile(img), 'The image path does not exist'
116
+ img = cv2.imread(img)
117
+
118
+ img, ori_img, img_dim = preprocess.prep_image(img, inp_dim)
119
+ img_dim = torch.FloatTensor(img_dim).repeat(1, 2)
120
+
121
+ with torch.no_grad():
122
+ if CUDA:
123
+ img_dim = img_dim.cuda()
124
+ img = img.cuda()
125
+ output = model(img, CUDA)
126
+ output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True)
127
+
128
+ if len(output) == 0:
129
+ return None, None
130
+
131
+ img_dim = img_dim.repeat(output.size(0), 1)
132
+ scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1)
133
+
134
+ output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2
135
+ output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2
136
+ output[:, 1:5] /= scaling_factor
137
+
138
+ for i in range(output.shape[0]):
139
+ output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0])
140
+ output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1])
141
+
142
+ bboxs = []
143
+ scores = []
144
+ for i in range(len(output)):
145
+ item = output[i]
146
+ bbox = item[1:5].cpu().numpy()
147
+ # convert float32 coordinates to floats rounded to two decimals
148
+ bbox = [round(i, 2) for i in list(bbox)]
149
+ score = item[5].cpu().numpy()
150
+ bboxs.append(bbox)
151
+ scores.append(score)
152
+ scores = np.expand_dims(np.array(scores), 1)
153
+ bboxs = np.array(bboxs)
154
+
155
+ return bboxs, scores
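A rough usage sketch for the detector above; the frame path is a placeholder, and the 416 resolution and 0.70 confidence simply mirror the arg_parse() defaults.

# Hedged sketch: detect people in one frame with the helpers above (paths are placeholders).
import cv2
from human_detector import load_model, yolo_human_det

model = load_model()                                   # Darknet built from the default cfg/weights paths
frame = cv2.imread("frame_0001.jpg")                   # placeholder image path
bboxs, scores = yolo_human_det(frame, model=model, reso=416, confidence=0.70)
if bboxs is None:
    print("no person found")
else:
    for (x1, y1, x2, y2), s in zip(bboxs, scores[:, 0]):
        print("person:", x1, y1, x2, y2, "score", float(s))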
VideoToNPZ/lib/detector/yolov3/preprocess.py ADDED
@@ -0,0 +1,63 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ from PIL import Image
7
+
8
+
9
+ def letterbox_image(img, inp_dim):
10
+ '''resize image with unchanged aspect ratio using padding'''
11
+ img_w, img_h = img.shape[1], img.shape[0]
12
+ w, h = inp_dim
13
+ new_w = int(img_w * min(w/img_w, h/img_h))
14
+ new_h = int(img_h * min(w/img_w, h/img_h))
15
+ resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
16
+
17
+ canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
18
+
19
+ canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image
20
+
21
+ return canvas
22
+
23
+
24
+ def prep_image(img, inp_dim):
25
+ """
26
+ Prepare image for inputting to the neural network.
27
+
28
+ Returns a Variable
29
+ """
30
+ if type(img) == str:
31
+ orig_im = cv2.imread(img)
32
+ else:
33
+ orig_im = img
34
+ dim = orig_im.shape[1], orig_im.shape[0]
35
+ img = (letterbox_image(orig_im, (inp_dim, inp_dim)))
36
+ img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
37
+ img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
38
+ return img_, orig_im, dim
39
+
40
+
41
+ def prep_image_pil(img, network_dim):
42
+ orig_im = Image.open(img)
43
+ img = orig_im.convert('RGB')
44
+ dim = img.size
45
+ img = img.resize(network_dim)
46
+ img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
47
+ img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous()
48
+ img = img.view(1, 3, *network_dim)
49
+ img = img.float().div(255.0)
50
+ return img, orig_im, dim
51
+
52
+
53
+ def inp_to_image(inp):
54
+ inp = inp.cpu().squeeze()
55
+ inp = inp * 255
56
+ try:
57
+ inp = inp.data.numpy()
58
+ except RuntimeError:
59
+ inp = inp.numpy()
60
+ inp = inp.transpose(1, 2, 0)
61
+
62
+ inp = inp[:, :, ::-1]
63
+ return inp
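For reference, a tiny sketch of what letterbox_image() above does to a 16:9 frame; the numbers are illustrative.

# Hedged sketch: letterbox a 1920x1080 frame to 416x416 without changing its aspect ratio.
import numpy as np
from preprocess import letterbox_image

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)   # H x W x C
canvas = letterbox_image(frame, (416, 416))
print(canvas.shape)                                  # (416, 416, 3)
# The content is resized to 416x234 and centred on a grey (value 128) canvas.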
VideoToNPZ/lib/detector/yolov3/util.py ADDED
@@ -0,0 +1,225 @@
1
+ from __future__ import division
2
+
3
+ import torch
4
+ import numpy as np
5
+ import cv2
6
+ import os.path as osp
7
+ from bbox import bbox_iou
8
+
9
+
10
+ def get_path(cur_file):
11
+ cur_dir = osp.dirname(osp.realpath(cur_file))
12
+ project_root = osp.join(cur_dir, '../../../')
13
+ chk_root = osp.join(project_root, 'checkpoint/')
14
+ data_root = osp.join(project_root, 'data/')
15
+
16
+ return project_root, chk_root, data_root, cur_dir
17
+
18
+
19
+ def count_parameters(model):
20
+ return sum(p.numel() for p in model.parameters())
21
+
22
+
23
+ def count_learnable_parameters(model):
24
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
25
+
26
+
27
+ def convert2cpu(matrix):
28
+ if matrix.is_cuda:
29
+ return torch.FloatTensor(matrix.size()).copy_(matrix)
30
+ else:
31
+ return matrix
32
+
33
+
34
+ def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True):
35
+ batch_size = prediction.size(0)
36
+ stride = inp_dim // prediction.size(2)
37
+ grid_size = inp_dim // stride
38
+ bbox_attrs = 5 + num_classes
39
+ num_anchors = len(anchors)
40
+
41
+ anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
42
+
43
+ prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
44
+ prediction = prediction.transpose(1, 2).contiguous()
45
+ prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
46
+
47
+ # Sigmoid the centre_X, centre_Y, and object confidence
48
+ prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
49
+ prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
50
+ prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])
51
+
52
+ # Add the center offsets
53
+ grid_len = np.arange(grid_size)
54
+ a, b = np.meshgrid(grid_len, grid_len)
55
+
56
+ x_offset = torch.FloatTensor(a).view(-1, 1)
57
+ y_offset = torch.FloatTensor(b).view(-1, 1)
58
+
59
+ if CUDA:
60
+ x_offset = x_offset.cuda()
61
+ y_offset = y_offset.cuda()
62
+
63
+ x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
64
+
65
+ prediction[:, :, :2] += x_y_offset
66
+
67
+ # log space transform height and the width
68
+ anchors = torch.FloatTensor(anchors)
69
+
70
+ if CUDA:
71
+ anchors = anchors.cuda()
72
+
73
+ anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
74
+ prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4])*anchors
75
+
76
+ # Sigmoid the class scores (YOLOv3 uses independent logistic classifiers, not softmax)
77
+ prediction[:, :, 5: 5 + num_classes] = torch.sigmoid((prediction[:, :, 5: 5 + num_classes]))
78
+
79
+ prediction[:, :, :4] *= stride
80
+
81
+ return prediction
82
+
83
+
84
+ def load_classes(namesfile):
85
+ fp = open(namesfile, "r")
86
+ names = fp.read().split("\n")[:-1]
87
+ return names
88
+
89
+
90
+ def get_im_dim(im):
91
+ im = cv2.imread(im)
92
+ w, h = im.shape[1], im.shape[0]
93
+ return w, h
94
+
95
+
96
+ def unique(tensor):
97
+ tensor_np = tensor.cpu().numpy()
98
+ unique_np = np.unique(tensor_np)
99
+ unique_tensor = torch.from_numpy(unique_np)
100
+
101
+ tensor_res = tensor.new(unique_tensor.shape)
102
+ tensor_res.copy_(unique_tensor)
103
+ return tensor_res
104
+
105
+
106
+ # ADD SOFT NMS
107
+ def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4, det_hm=False):
108
+ """
109
+ https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-4/
110
+ prediction: (B x 10647 x 85)
111
+ B: the number of images in a batch,
112
+ 10647: the number of bounding boxes predicted per image. (52×52+26×26+13×13)×3=10647
113
+ 85: the number of bounding box attributes. (c_x, c_y, w, h, object confidence, and 80 class scores)
114
+
115
+ output: Num_obj × [img_index, x_1, y_1, x_2, y_2, object confidence, class_score, label_index]
116
+ """
117
+
118
+ conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
119
+ prediction = prediction*conf_mask
120
+
121
+ box_a = prediction.new(prediction.shape)
122
+ box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2)
123
+ box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2)
124
+ box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2)
125
+ box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2)
126
+ prediction[:, :, :4] = box_a[:, :, :4]
127
+
128
+ batch_size = prediction.size(0)
129
+
130
+ output = prediction.new(1, prediction.size(2) + 1)
131
+ write = False
132
+
133
+ for ind in range(batch_size):
134
+ # select the image from the batch
135
+ image_pred = prediction[ind]
136
+
137
+ # Get the class having maximum score, and the index of that class
138
+ # Get rid of num_classes softmax scores
139
+ # Add the class index and the class score of class having maximum score
140
+ max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1)
141
+ max_conf = max_conf.float().unsqueeze(1)
142
+ max_conf_index = max_conf_index.float().unsqueeze(1)
143
+ seq = (image_pred[:, :5], max_conf, max_conf_index)
144
+ image_pred = torch.cat(seq, 1) # image_pred:(10647, 7) 7:[x1, y1, x2, y2, obj_score, max_conf, max_conf_index]
145
+
146
+ # Get rid of the zero entries
147
+ non_zero_ind = (torch.nonzero(image_pred[:, 4]))
148
+ image_pred__ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)
149
+
150
+ # keep only 'person' detections (class index 0)
151
+ if det_hm:
152
+ cls_mask = (image_pred__[:, -1] == 0).float()
153
+ class_mask_ind = torch.nonzero(cls_mask).squeeze()
154
+ image_pred_ = image_pred__[class_mask_ind].view(-1, 7)
155
+
156
+ if torch.sum(cls_mask) == 0:
157
+ return image_pred_
158
+ else:
159
+ image_pred_ = image_pred__
160
+
161
+ # Get the various classes detected in the image
162
+ try:
163
+ # img_classes = unique(image_pred_[:, -1])
164
+ img_classes = torch.unique(image_pred_[:, -1], sorted=True).float()
165
+ except:
166
+ continue
167
+
168
+ # We will do NMS classwise
169
+ # import ipdb;ipdb.set_trace()
170
+ for cls in img_classes:
171
+ # get the detections with one particular class
172
+ cls_mask = image_pred_*(image_pred_[:, -1] == cls).float().unsqueeze(1)
173
+ class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
174
+ image_pred_class = image_pred_[class_mask_ind].view(-1, 7)
175
+
176
+ # sort the detections such that the entry with the maximum objectness
177
+ # confidence is at the top
178
+ conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
179
+ image_pred_class = image_pred_class[conf_sort_index]
180
+ idx = image_pred_class.size(0)
181
+
182
+ # from soft_NMS import soft_nms
183
+ # boxes = image_pred_class[:,:4]
184
+ # scores = image_pred_class[:, 4]
185
+ # k, N = soft_nms(boxes, scores, method=2)
186
+ # image_pred_class = image_pred_class[k]
187
+
188
+ # if nms has to be done
189
+ if nms:
190
+ # For each detection
191
+ for i in range(idx):
192
+ # Get the IOUs of all boxes that come after the one we are looking at
193
+ # in the loop
194
+ try:
195
+ ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
196
+ except ValueError:
197
+ break
198
+
199
+ except IndexError:
200
+ break
201
+
202
+ # Zero out all the detections that have IoU > threshold
203
+ iou_mask = (ious < nms_conf).float().unsqueeze(1)
204
+ image_pred_class[i+1:] *= iou_mask
205
+
206
+ # Remove the zero entries
207
+ non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
208
+ image_pred_class = image_pred_class[non_zero_ind].view(-1, 7)
209
+
210
+ # Concatenate the batch_id of the image to the detection
211
+ # this helps us identify which image does the detection correspond to
212
+ # We use a linear structure to hold ALL the detections from the batch
213
+ # the batch_dim is flattened
214
+ # batch is identified by extra batch column
215
+
216
+ batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
217
+ seq = batch_ind, image_pred_class
218
+ if not write:
219
+ output = torch.cat(seq, 1)
220
+ write = True
221
+ else:
222
+ out = torch.cat(seq, 1)
223
+ output = torch.cat((output, out))
224
+
225
+ return output
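A short, self-contained sketch of how a write_results() row is typically unpacked; the tensor here is fabricated purely to show the column layout documented in the docstring above.

# Hedged sketch: one fake detection row in the write_results() layout
# [batch_index, x1, y1, x2, y2, objectness, class_score, class_index].
import torch

output = torch.tensor([[0.0, 10.0, 20.0, 110.0, 220.0, 0.95, 0.88, 0.0]])
for det in output:
    batch_idx, cls_id = int(det[0]), int(det[7])
    x1, y1, x2, y2 = det[1:5].tolist()
    print(batch_idx, cls_id, (x1, y1, x2, y2), float(det[5]), float(det[6]))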
VideoToNPZ/lib/pose/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ import sys
2
+ import os.path as osp
3
+
4
+ sys.path.insert(1, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/pose_estimation'))
5
+ from gen_kpts import gen_img_kpts, gen_video_kpts, load_default_model
6
+ sys.path.insert(2, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/lib/utils'))
7
+ from utilitys import plot_keypoint, write, PreProcess, box_to_center_scale, load_json
8
+
9
+ sys.path.pop(1)
10
+ sys.path.pop(2)
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 192
33
+ - 256
34
+ HEATMAP_SIZE:
35
+ - 48
36
+ - 64
37
+ SIGMA: 2
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 32
61
+ - 64
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 32
73
+ - 64
74
+ - 128
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 32
87
+ - 64
88
+ - 128
89
+ - 256
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
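As a sanity check, one way to inspect an experiment config like the one above is with plain PyYAML; the repo's own config loader may apply defaults on top of this, so the sketch below is illustrative only.

# Hedged sketch: load the YAML above directly and print a couple of fields.
import yaml

with open("w32_256x192_adam_lr1e-3.yaml") as f:   # path is illustrative
    cfg = yaml.safe_load(f)

print(cfg["MODEL"]["IMAGE_SIZE"])    # [192, 256]  (width, height)
print(cfg["MODEL"]["HEATMAP_SIZE"])  # [48, 64]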
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 288
33
+ - 384
34
+ HEATMAP_SIZE:
35
+ - 72
36
+ - 96
37
+ SIGMA: 3
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 32
61
+ - 64
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 32
73
+ - 64
74
+ - 128
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 32
87
+ - 64
88
+ - 128
89
+ - 256
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 192
33
+ - 256
34
+ HEATMAP_SIZE:
35
+ - 48
36
+ - 64
37
+ SIGMA: 2
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 48
61
+ - 96
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 48
73
+ - 96
74
+ - 192
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 48
87
+ - 96
88
+ - 192
89
+ - 384
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 32
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 32
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,127 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: true
15
+ DATASET: 'coco'
16
+ DATA_FORMAT: jpg
17
+ FLIP: true
18
+ NUM_JOINTS_HALF_BODY: 8
19
+ PROB_HALF_BODY: 0.3
20
+ ROOT: 'data/coco/'
21
+ ROT_FACTOR: 45
22
+ SCALE_FACTOR: 0.35
23
+ TEST_SET: 'val2017'
24
+ TRAIN_SET: 'train2017'
25
+ MODEL:
26
+ INIT_WEIGHTS: true
27
+ NAME: pose_hrnet
28
+ NUM_JOINTS: 17
29
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
30
+ TARGET_TYPE: gaussian
31
+ IMAGE_SIZE:
32
+ - 288
33
+ - 384
34
+ HEATMAP_SIZE:
35
+ - 72
36
+ - 96
37
+ SIGMA: 3
38
+ EXTRA:
39
+ PRETRAINED_LAYERS:
40
+ - 'conv1'
41
+ - 'bn1'
42
+ - 'conv2'
43
+ - 'bn2'
44
+ - 'layer1'
45
+ - 'transition1'
46
+ - 'stage2'
47
+ - 'transition2'
48
+ - 'stage3'
49
+ - 'transition3'
50
+ - 'stage4'
51
+ FINAL_CONV_KERNEL: 1
52
+ STAGE2:
53
+ NUM_MODULES: 1
54
+ NUM_BRANCHES: 2
55
+ BLOCK: BASIC
56
+ NUM_BLOCKS:
57
+ - 4
58
+ - 4
59
+ NUM_CHANNELS:
60
+ - 48
61
+ - 96
62
+ FUSE_METHOD: SUM
63
+ STAGE3:
64
+ NUM_MODULES: 4
65
+ NUM_BRANCHES: 3
66
+ BLOCK: BASIC
67
+ NUM_BLOCKS:
68
+ - 4
69
+ - 4
70
+ - 4
71
+ NUM_CHANNELS:
72
+ - 48
73
+ - 96
74
+ - 192
75
+ FUSE_METHOD: SUM
76
+ STAGE4:
77
+ NUM_MODULES: 3
78
+ NUM_BRANCHES: 4
79
+ BLOCK: BASIC
80
+ NUM_BLOCKS:
81
+ - 4
82
+ - 4
83
+ - 4
84
+ - 4
85
+ NUM_CHANNELS:
86
+ - 48
87
+ - 96
88
+ - 192
89
+ - 384
90
+ FUSE_METHOD: SUM
91
+ LOSS:
92
+ USE_TARGET_WEIGHT: true
93
+ TRAIN:
94
+ BATCH_SIZE_PER_GPU: 24
95
+ SHUFFLE: true
96
+ BEGIN_EPOCH: 0
97
+ END_EPOCH: 210
98
+ OPTIMIZER: adam
99
+ LR: 0.001
100
+ LR_FACTOR: 0.1
101
+ LR_STEP:
102
+ - 170
103
+ - 200
104
+ WD: 0.0001
105
+ GAMMA1: 0.99
106
+ GAMMA2: 0.0
107
+ MOMENTUM: 0.9
108
+ NESTEROV: false
109
+ TEST:
110
+ BATCH_SIZE_PER_GPU: 24
111
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
112
+ BBOX_THRE: 1.0
113
+ IMAGE_THRE: 0.0
114
+ IN_VIS_THRE: 0.2
115
+ MODEL_FILE: ''
116
+ NMS_THRE: 1.0
117
+ OKS_THRE: 0.9
118
+ USE_GT_BBOX: true
119
+ FLIP_TEST: true
120
+ POST_PROCESS: true
121
+ SHIFT_HEATMAP: true
122
+ DEBUG:
123
+ DEBUG: true
124
+ SAVE_BATCH_IMAGES_GT: true
125
+ SAVE_BATCH_IMAGES_PRED: true
126
+ SAVE_HEATMAPS_GT: true
127
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
1
+ AUTO_RESUME: true
2
+ CUDNN:
3
+ BENCHMARK: true
4
+ DETERMINISTIC: false
5
+ ENABLED: true
6
+ DATA_DIR: ''
7
+ GPUS: (0,1,2,3)
8
+ OUTPUT_DIR: 'output'
9
+ LOG_DIR: 'log'
10
+ WORKERS: 24
11
+ PRINT_FREQ: 100
12
+
13
+ DATASET:
14
+ COLOR_RGB: false
15
+ DATASET: 'coco'
16
+ ROOT: 'data/coco/'
17
+ TEST_SET: 'val2017'
18
+ TRAIN_SET: 'train2017'
19
+ FLIP: true
20
+ ROT_FACTOR: 40
21
+ SCALE_FACTOR: 0.3
22
+ MODEL:
23
+ NAME: 'pose_resnet'
24
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
25
+ IMAGE_SIZE:
26
+ - 192
27
+ - 256
28
+ HEATMAP_SIZE:
29
+ - 48
30
+ - 64
31
+ SIGMA: 2
32
+ NUM_JOINTS: 17
33
+ TARGET_TYPE: 'gaussian'
34
+ EXTRA:
35
+ FINAL_CONV_KERNEL: 1
36
+ DECONV_WITH_BIAS: false
37
+ NUM_DECONV_LAYERS: 3
38
+ NUM_DECONV_FILTERS:
39
+ - 256
40
+ - 256
41
+ - 256
42
+ NUM_DECONV_KERNELS:
43
+ - 4
44
+ - 4
45
+ - 4
46
+ NUM_LAYERS: 101
47
+ LOSS:
48
+ USE_TARGET_WEIGHT: true
49
+ TRAIN:
50
+ BATCH_SIZE_PER_GPU: 32
51
+ SHUFFLE: true
52
+ BEGIN_EPOCH: 0
53
+ END_EPOCH: 140
54
+ OPTIMIZER: 'adam'
55
+ LR: 0.001
56
+ LR_FACTOR: 0.1
57
+ LR_STEP:
58
+ - 90
59
+ - 120
60
+ WD: 0.0001
61
+ GAMMA1: 0.99
62
+ GAMMA2: 0.0
63
+ MOMENTUM: 0.9
64
+ NESTEROV: false
65
+ TEST:
66
+ BATCH_SIZE_PER_GPU: 32
67
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
68
+ BBOX_THRE: 1.0
69
+ IMAGE_THRE: 0.0
70
+ IN_VIS_THRE: 0.2
71
+ MODEL_FILE: ''
72
+ NMS_THRE: 1.0
73
+ OKS_THRE: 0.9
74
+ FLIP_TEST: true
75
+ POST_PROCESS: true
76
+ SHIFT_HEATMAP: true
77
+ USE_GT_BBOX: true
78
+ DEBUG:
79
+ DEBUG: true
80
+ SAVE_BATCH_IMAGES_GT: true
81
+ SAVE_BATCH_IMAGES_PRED: true
82
+ SAVE_HEATMAPS_GT: true
83
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 101
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 192
+ - 256
+ HEATMAP_SIZE:
+ - 48
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 192
+ - 256
+ HEATMAP_SIZE:
+ - 48
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,83 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: 'coco'
+ ROOT: 'data/coco/'
+ TEST_SET: 'val2017'
+ TRAIN_SET: 'train2017'
+ FLIP: true
+ ROT_FACTOR: 40
+ SCALE_FACTOR: 0.3
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 288
+ - 384
+ HEATMAP_SIZE:
+ - 72
+ - 96
+ SIGMA: 3
+ NUM_JOINTS: 17
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,120 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: true
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ INIT_WEIGHTS: true
+ NAME: pose_hrnet
+ NUM_JOINTS: 16
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth'
+ TARGET_TYPE: gaussian
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ EXTRA:
+ PRETRAINED_LAYERS:
+ - 'conv1'
+ - 'bn1'
+ - 'conv2'
+ - 'bn2'
+ - 'layer1'
+ - 'transition1'
+ - 'stage2'
+ - 'transition2'
+ - 'stage3'
+ - 'transition3'
+ - 'stage4'
+ FINAL_CONV_KERNEL: 1
+ STAGE2:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 2
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ FUSE_METHOD: SUM
+ STAGE3:
+ NUM_MODULES: 4
+ NUM_BRANCHES: 3
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ - 128
+ FUSE_METHOD: SUM
+ STAGE4:
+ NUM_MODULES: 3
+ NUM_BRANCHES: 4
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 32
+ - 64
+ - 128
+ - 256
+ FUSE_METHOD: SUM
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 210
+ OPTIMIZER: adam
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 170
+ - 200
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ MODEL_FILE: ''
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,120 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: true
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ INIT_WEIGHTS: true
+ NAME: pose_hrnet
+ NUM_JOINTS: 16
+ PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth'
+ TARGET_TYPE: gaussian
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ EXTRA:
+ PRETRAINED_LAYERS:
+ - 'conv1'
+ - 'bn1'
+ - 'conv2'
+ - 'bn2'
+ - 'layer1'
+ - 'transition1'
+ - 'stage2'
+ - 'transition2'
+ - 'stage3'
+ - 'transition3'
+ - 'stage4'
+ FINAL_CONV_KERNEL: 1
+ STAGE2:
+ NUM_MODULES: 1
+ NUM_BRANCHES: 2
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ FUSE_METHOD: SUM
+ STAGE3:
+ NUM_MODULES: 4
+ NUM_BRANCHES: 3
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ - 192
+ FUSE_METHOD: SUM
+ STAGE4:
+ NUM_MODULES: 3
+ NUM_BRANCHES: 4
+ BLOCK: BASIC
+ NUM_BLOCKS:
+ - 4
+ - 4
+ - 4
+ - 4
+ NUM_CHANNELS:
+ - 48
+ - 96
+ - 192
+ - 384
+ FUSE_METHOD: SUM
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 210
+ OPTIMIZER: adam
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 170
+ - 200
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ MODEL_FILE: ''
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 101
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 152
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml ADDED
@@ -0,0 +1,86 @@
+ AUTO_RESUME: true
+ CUDNN:
+ BENCHMARK: true
+ DETERMINISTIC: false
+ ENABLED: true
+ DATA_DIR: ''
+ GPUS: (0,1,2,3)
+ OUTPUT_DIR: 'output'
+ LOG_DIR: 'log'
+ WORKERS: 24
+ PRINT_FREQ: 100
+
+ DATASET:
+ COLOR_RGB: false
+ DATASET: mpii
+ DATA_FORMAT: jpg
+ FLIP: true
+ NUM_JOINTS_HALF_BODY: 8
+ PROB_HALF_BODY: -1.0
+ ROOT: 'data/mpii/'
+ ROT_FACTOR: 30
+ SCALE_FACTOR: 0.25
+ TEST_SET: valid
+ TRAIN_SET: train
+ MODEL:
+ NAME: 'pose_resnet'
+ PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth'
+ IMAGE_SIZE:
+ - 256
+ - 256
+ HEATMAP_SIZE:
+ - 64
+ - 64
+ SIGMA: 2
+ NUM_JOINTS: 16
+ TARGET_TYPE: 'gaussian'
+ EXTRA:
+ FINAL_CONV_KERNEL: 1
+ DECONV_WITH_BIAS: false
+ NUM_DECONV_LAYERS: 3
+ NUM_DECONV_FILTERS:
+ - 256
+ - 256
+ - 256
+ NUM_DECONV_KERNELS:
+ - 4
+ - 4
+ - 4
+ NUM_LAYERS: 50
+ LOSS:
+ USE_TARGET_WEIGHT: true
+ TRAIN:
+ BATCH_SIZE_PER_GPU: 32
+ SHUFFLE: true
+ BEGIN_EPOCH: 0
+ END_EPOCH: 140
+ OPTIMIZER: 'adam'
+ LR: 0.001
+ LR_FACTOR: 0.1
+ LR_STEP:
+ - 90
+ - 120
+ WD: 0.0001
+ GAMMA1: 0.99
+ GAMMA2: 0.0
+ MOMENTUM: 0.9
+ NESTEROV: false
+ TEST:
+ BATCH_SIZE_PER_GPU: 32
+ COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json'
+ BBOX_THRE: 1.0
+ IMAGE_THRE: 0.0
+ IN_VIS_THRE: 0.2
+ MODEL_FILE: ''
+ NMS_THRE: 1.0
+ OKS_THRE: 0.9
+ FLIP_TEST: true
+ POST_PROCESS: true
+ SHIFT_HEATMAP: true
+ USE_GT_BBOX: true
+ DEBUG:
+ DEBUG: true
+ SAVE_BATCH_IMAGES_GT: true
+ SAVE_BATCH_IMAGES_PRED: true
+ SAVE_HEATMAPS_GT: true
+ SAVE_HEATMAPS_PRED: true
VideoToNPZ/lib/pose/hrnet/lib/Makefile ADDED
@@ -0,0 +1,4 @@
+ all:
+ cd nms; python setup_linux.py build_ext --inplace; rm -rf build; cd ../../
+ clean:
+ cd nms; rm *.so; cd ../../
VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from .default import _C as cfg
+ from .default import update_config
+ from .models import MODEL_EXTRAS
VideoToNPZ/lib/pose/hrnet/lib/config/default.py ADDED
@@ -0,0 +1,160 @@
+
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import os
+
+ from yacs.config import CfgNode as CN
+
+
+ _C = CN()
+
+ _C.OUTPUT_DIR = ''
+ _C.LOG_DIR = ''
+ _C.DATA_DIR = ''
+ _C.GPUS = (0,)
+ _C.WORKERS = 4
+ _C.PRINT_FREQ = 20
+ _C.AUTO_RESUME = False
+ _C.PIN_MEMORY = True
+ _C.RANK = 0
+
+ # Cudnn related params
+ _C.CUDNN = CN()
+ _C.CUDNN.BENCHMARK = True
+ _C.CUDNN.DETERMINISTIC = False
+ _C.CUDNN.ENABLED = True
+
+ # common params for NETWORK
+ _C.MODEL = CN()
+ _C.MODEL.NAME = 'pose_hrnet'
+ _C.MODEL.INIT_WEIGHTS = True
+ _C.MODEL.PRETRAINED = ''
+ _C.MODEL.NUM_JOINTS = 17
+ _C.MODEL.TAG_PER_JOINT = True
+ _C.MODEL.TARGET_TYPE = 'gaussian'
+ _C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256
+ _C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32
+ _C.MODEL.SIGMA = 2
+ _C.MODEL.EXTRA = CN(new_allowed=True)
+
+ _C.LOSS = CN()
+ _C.LOSS.USE_OHKM = False
+ _C.LOSS.TOPK = 8
+ _C.LOSS.USE_TARGET_WEIGHT = True
+ _C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False
+
+ # DATASET related params
+ _C.DATASET = CN()
+ _C.DATASET.ROOT = ''
+ _C.DATASET.DATASET = 'mpii'
+ _C.DATASET.TRAIN_SET = 'train'
+ _C.DATASET.TEST_SET = 'valid'
+ _C.DATASET.DATA_FORMAT = 'jpg'
+ _C.DATASET.HYBRID_JOINTS_TYPE = ''
+ _C.DATASET.SELECT_DATA = False
+
+ # training data augmentation
+ _C.DATASET.FLIP = True
+ _C.DATASET.SCALE_FACTOR = 0.25
+ _C.DATASET.ROT_FACTOR = 30
+ _C.DATASET.PROB_HALF_BODY = 0.0
+ _C.DATASET.NUM_JOINTS_HALF_BODY = 8
+ _C.DATASET.COLOR_RGB = False
+
+ # train
+ _C.TRAIN = CN()
+
+ _C.TRAIN.LR_FACTOR = 0.1
+ _C.TRAIN.LR_STEP = [90, 110]
+ _C.TRAIN.LR = 0.001
+
+ _C.TRAIN.OPTIMIZER = 'adam'
+ _C.TRAIN.MOMENTUM = 0.9
+ _C.TRAIN.WD = 0.0001
+ _C.TRAIN.NESTEROV = False
+ _C.TRAIN.GAMMA1 = 0.99
+ _C.TRAIN.GAMMA2 = 0.0
+
+ _C.TRAIN.BEGIN_EPOCH = 0
+ _C.TRAIN.END_EPOCH = 140
+
+ _C.TRAIN.RESUME = False
+ _C.TRAIN.CHECKPOINT = ''
+
+ _C.TRAIN.BATCH_SIZE_PER_GPU = 32
+ _C.TRAIN.SHUFFLE = True
+
+ # testing
+ _C.TEST = CN()
+
+ # size of images for each device
+ _C.TEST.BATCH_SIZE_PER_GPU = 32
+ # Test Model Epoch
+ _C.TEST.FLIP_TEST = False
+ _C.TEST.POST_PROCESS = False
+ _C.TEST.SHIFT_HEATMAP = False
+
+ _C.TEST.USE_GT_BBOX = False
+
+ # nms
+ _C.TEST.IMAGE_THRE = 0.1
+ _C.TEST.NMS_THRE = 0.6
+ _C.TEST.SOFT_NMS = False
+ _C.TEST.OKS_THRE = 0.5
+ _C.TEST.IN_VIS_THRE = 0.0
+ _C.TEST.COCO_BBOX_FILE = ''
+ _C.TEST.BBOX_THRE = 1.0
+ _C.TEST.MODEL_FILE = ''
+
+ # debug
+ _C.DEBUG = CN()
+ _C.DEBUG.DEBUG = False
+ _C.DEBUG.SAVE_BATCH_IMAGES_GT = False
+ _C.DEBUG.SAVE_BATCH_IMAGES_PRED = False
+ _C.DEBUG.SAVE_HEATMAPS_GT = False
+ _C.DEBUG.SAVE_HEATMAPS_PRED = False
+
+
+ def update_config(cfg, args):
+ cfg.defrost()
+ cfg.merge_from_file(args.cfg)
+ cfg.merge_from_list(args.opts)
+
+ if args.modelDir:
+ cfg.OUTPUT_DIR = args.modelDir
+
+ # if args.logDir:
+ # cfg.LOG_DIR = args.logDir
+ #
+ # if args.dataDir:
+ # cfg.DATA_DIR = args.dataDir
+ #
+ # cfg.DATASET.ROOT = os.path.join(
+ # cfg.DATA_DIR, cfg.DATASET.ROOT
+ # )
+ #
+ # cfg.MODEL.PRETRAINED = os.path.join(
+ # cfg.DATA_DIR, cfg.MODEL.PRETRAINED
+ # )
+ #
+ # if cfg.TEST.MODEL_FILE:
+ # cfg.TEST.MODEL_FILE = os.path.join(
+ # cfg.DATA_DIR, cfg.TEST.MODEL_FILE
+ # )
+
+ cfg.freeze()
+
+
+ if __name__ == '__main__':
+ import sys
+ with open(sys.argv[1], 'w') as f:
+ print(_C, file=f)
+
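update_config is the single entry point that combines these yacs defaults with an experiment YAML and command-line overrides: it defrosts the node, merges the file, merges KEY VALUE pairs, optionally redirects OUTPUT_DIR, and freezes again. A minimal sketch of a call follows; it assumes the working directory is VideoToNPZ/lib/pose/hrnet/lib (so the config package and the ../experiments path resolve), and the modelDir value is a placeholder, not a directory shipped in this commit.

from types import SimpleNamespace

from config import cfg, update_config  # exported by config/__init__.py above

# stand-in for the argparse namespace the real training/inference scripts build
args = SimpleNamespace(
    cfg='../experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml',
    opts=['TEST.BATCH_SIZE_PER_GPU', '16'],  # KEY VALUE pairs for merge_from_list
    modelDir='output/demo',                  # placeholder output directory
)
update_config(cfg, args)
print(cfg.MODEL.NAME, cfg.TEST.BATCH_SIZE_PER_GPU)  # pose_hrnet 16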
VideoToNPZ/lib/pose/hrnet/lib/config/models.py ADDED
@@ -0,0 +1,58 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ from yacs.config import CfgNode as CN
+
+
+ # pose_resnet related params
+ POSE_RESNET = CN()
+ POSE_RESNET.NUM_LAYERS = 50
+ POSE_RESNET.DECONV_WITH_BIAS = False
+ POSE_RESNET.NUM_DECONV_LAYERS = 3
+ POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256]
+ POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4]
+ POSE_RESNET.FINAL_CONV_KERNEL = 1
+ POSE_RESNET.PRETRAINED_LAYERS = ['*']
+
+ # pose_multi_resoluton_net related params
+ POSE_HIGH_RESOLUTION_NET = CN()
+ POSE_HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*']
+ POSE_HIGH_RESOLUTION_NET.STEM_INPLANES = 64
+ POSE_HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1
+
+ POSE_HIGH_RESOLUTION_NET.STAGE2 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64]
+ POSE_HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'SUM'
+
+ POSE_HIGH_RESOLUTION_NET.STAGE3 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128]
+ POSE_HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'SUM'
+
+ POSE_HIGH_RESOLUTION_NET.STAGE4 = CN()
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
+ POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
+ POSE_HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC'
+ POSE_HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'SUM'
+
+
+ MODEL_EXTRAS = {
+ 'pose_resnet': POSE_RESNET,
+ 'pose_high_resolution_net': POSE_HIGH_RESOLUTION_NET,
+ }
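MODEL_EXTRAS is simply a name-to-defaults lookup table for the EXTRA sub-config of the two supported backbones. A minimal query sketch, under the same import-path assumption as above (working directory VideoToNPZ/lib/pose/hrnet/lib):

from config.models import MODEL_EXTRAS

# the two keys defined in this module
extra = MODEL_EXTRAS['pose_resnet']   # or 'pose_high_resolution_net'
print(extra.NUM_LAYERS)               # 50
print(extra.NUM_DECONV_FILTERS)       # [256, 256, 256]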
VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py ADDED
@@ -0,0 +1,16 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) Microsoft
+ # Licensed under the MIT License.
+ # Written by Bin Xiao ([email protected])
+ # ------------------------------------------------------------------------------
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import models.pose_resnet
+ import models.pose_hrnet
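The duplicated __future__ imports are redundant but harmless; the two submodule imports make pose_resnet and pose_hrnet available as attributes of the models package. In the upstream HRNet code the network is then built by looking the submodule up by cfg.MODEL.NAME; the sketch below assumes that pattern and that each submodule exposes a get_pose_net(cfg, is_train) factory, neither of which is shown in this diff.

from config import cfg   # the yacs node from lib/config, assumed already merged with a YAML

import models  # the package defined by this __init__.py (assumes lib/ is on the import path)

# hypothetical builder lookup; get_pose_net is an assumption based on the upstream repo
pose_module = getattr(models, cfg.MODEL.NAME)        # e.g. models.pose_hrnet
net = pose_module.get_pose_net(cfg, is_train=False)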