diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..f5894da75edb37f7e550a2924d850e51fbe116d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.weights filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..044da9b5783620e46b9ce1c2a0abf053ebeae759 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +venv/ +__pycache__/ +*.pyc +*.bvh +*.obj +*.npz +*.mp4 \ No newline at end of file diff --git a/VideoToNPZ/INFERENCE_EN.md b/VideoToNPZ/INFERENCE_EN.md new file mode 100644 index 0000000000000000000000000000000000000000..de704682748ff64757379026b805f48c375c64a1 --- /dev/null +++ b/VideoToNPZ/INFERENCE_EN.md @@ -0,0 +1,2 @@ + + python gen_skes.py -v baseball.mp4 diff --git a/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin b/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ab636bcc966684a8a3be7ded4ff0d9311d76575 --- /dev/null +++ b/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3150eb3125ca66242a888fd06b4eb7d8a8b755607370225c24f0b9c794d35cc4 +size 28333160 diff --git a/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth b/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cea6c32352118068ddda01cb72a33c8450b3e31 --- /dev/null +++ b/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e0fec3194826d5e3f806ea89be68bbb84517b114c3a32b3058c56610b5ef61 +size 255061287 diff --git a/VideoToNPZ/checkpoint/yolov3/yolov3.weights b/VideoToNPZ/checkpoint/yolov3/yolov3.weights new file mode 100644 index 0000000000000000000000000000000000000000..550ca2f10867af32a8434dd7cddb5d305a77c97f --- /dev/null +++ b/VideoToNPZ/checkpoint/yolov3/yolov3.weights @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297 +size 248007048 diff --git a/VideoToNPZ/common/arguments.py b/VideoToNPZ/common/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..f4bf40e987e5a5cf41b50bb228472d0841e551b7 --- /dev/null +++ b/VideoToNPZ/common/arguments.py @@ -0,0 +1,86 @@ +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser(description='Training script') + + # General arguments + parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME', + help='target dataset') # h36m or humaneva + parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME', + help='2D detections to use') + parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST', + help='training subjects separated by comma') + parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST', + help='test subjects separated by comma') + parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST', + help='actions to train/test on, separated by comma, or * for all') + parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
help='checkpoint directory') + parser.add_argument('--checkpoint-frequency', default=10, type=int, metavar='N', + help='create a checkpoint every N epochs') + parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME', + help='checkpoint to resume (file name)') + parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME', + help='checkpoint to evaluate (file name)') + parser.add_argument('--render', action='store_true', help='visualize a particular video') + parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)') + parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images') + + # Model arguments + parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training') + parser.add_argument('-arc', '--architecture', default='3,3,3', type=str, metavar='LAYERS', + help='filter widths separated by comma') + parser.add_argument('--causal', action='store_true', help='use causal convolutions for real-time processing') + parser.add_argument('-ch', '--channels', default=128, type=int, metavar='N', + help='number of channels in convolution layers') + + # Experimental setting + parser.add_argument('-e', '--epochs', default=60, type=int, metavar='N', help='number of training epochs') + parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N', + help='batch size in terms of predicted frames') + parser.add_argument('-drop', '--dropout', default=0.05, type=float, metavar='P', help='dropout probability') + parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate') + parser.add_argument('-lrd', '--lr-decay', default=0.95, type=float, metavar='LR', + help='learning rate decay per epoch') + parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false', + help='disable train-time flipping') + parser.add_argument('-no-tta', '--no-test-time-augmentation', dest='test_time_augmentation', action='store_false', + help='disable test-time flipping') + parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction') + parser.add_argument('--downsample', default=5, type=int, metavar='FACTOR', + help='downsample frame rate by factor (semi-supervised)') + parser.add_argument('--no-eval', action='store_true', + help='disable epoch evaluation while training (small speed-up)') + parser.add_argument('--disable-optimizations', action='store_true', + help='disable optimized model for single-frame predictions') + + # Visualization + parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render') + parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render') + parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render') + parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video') + parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video') + parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)') + parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates') + parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos') + parser.add_argument('--viz-no-ground-truth', action='store_true', 
help='do not show ground-truth poses') + parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames') + parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N') + parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size') + + parser.set_defaults(bone_length_term=True) + parser.set_defaults(data_augmentation=True) + parser.set_defaults(test_time_augmentation=True) + + args = parser.parse_args() + # Check invalid configuration + if args.resume and args.evaluate: + print('Invalid flags: --resume and --evaluate cannot be set at the same time') + exit() + + if args.export_training_curves and args.no_eval: + print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time') + exit() + + return args diff --git a/VideoToNPZ/common/camera.py b/VideoToNPZ/common/camera.py new file mode 100644 index 0000000000000000000000000000000000000000..5d691b5166c63e0ea348eae1d939687257ba500d --- /dev/null +++ b/VideoToNPZ/common/camera.py @@ -0,0 +1,63 @@ +import numpy as np +import torch + +from tools.utils import wrap +from common.quaternion import qort, qinverse + + +def normalize_screen_coordinates(X, w, h): + assert X.shape[-1] == 2 + + # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio + return X/w*2 - [1, h/w] + + +def image_coordinates(X, w, h): + assert X.shape[-1] == 2 + + # Reverse camera frame normalization + return (X + [1, h/w]) * w / 2 + + +def world_to_camera(X, R, t): + Rt = wrap(qinverse, R) # Invert rotation + return wrap(qort, np.tile(Rt, (*X.shape[:-1], 1)), X - t) # Rotate and translate + + +def camera_to_world(X, R, t): + return wrap(qort, np.tile(R, (*X.shape[:-1], 1)), X) + t + + +def project_to_2d(X, camera_params): + """ + Project 3D points to 2D using the Human3.6M camera projection function. + This is a differentiable and batched reimplementation of the original MATLAB script. + + Arguments: + X -- 3D points in *camera space* to transform (N, *, 3) + camera_params -- intrinsic parameteres (N, 2+2+3+2=9) + """ + assert X.shape[-1] == 3 + assert len(camera_params.shape) == 2 + assert camera_params.shape[-1] == 9 + assert X.shape[0] == camera_params.shape[0] + + while len(camera_params.shape) < len(X.shape): + camera_params = camera_params.unsqueeze(1) + + f = camera_params[..., :2] + c = camera_params[..., 2:4] + k = camera_params[..., 4:7] + p = camera_params[..., 7:] + + # XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1) + XX = X[..., :2] / X[..., 2:] + r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True) + + radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True) + tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True) + + XXX = XX*(radial + tan) + p*r2 + + return f*XXX + c + diff --git a/VideoToNPZ/common/generators.py b/VideoToNPZ/common/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..1a437693ed396455313a0cf8f848b225322b3165 --- /dev/null +++ b/VideoToNPZ/common/generators.py @@ -0,0 +1,236 @@ +from itertools import zip_longest +import numpy as np + + +class ChunkedGenerator: + """ + Batched data generator, used for training. + The sequences are split into equal-length chunks and padded as necessary. 
+ + Arguments: + batch_size -- the batch size to use for training + cameras -- list of cameras, one element for each video (optional, used for semi-supervised training) + poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training) + poses_2d -- list of input 2D keypoints, one element for each video + chunk_length -- number of output frames to predict for each training example (usually 1) + pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field) + causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad") + shuffle -- randomly shuffle the dataset before each epoch + random_seed -- initial seed to use for the random generator + augment -- augment the dataset by flipping poses horizontally + kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled + joints_left and joints_right -- list of left/right 3D joints if flipping is enabled + """ + def __init__(self, batch_size, cameras, poses_3d, poses_2d, + chunk_length, pad=0, causal_shift=0, + shuffle=True, random_seed=1234, + augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None, + endless=False): + assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d)) + assert cameras is None or len(cameras) == len(poses_2d) + + # Build lineage info + pairs = [] # (seq_idx, start_frame, end_frame, flip) tuples + for i in range(len(poses_2d)): + assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0] + n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length + offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2 + bounds = np.arange(n_chunks + 1) * chunk_length - offset + augment_vector = np.full(len(bounds)-1, False, dtype=bool) + pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], augment_vector) + if augment: + pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], ~augment_vector) + + # Initialize buffers + if cameras is not None: + self.batch_cam = np.empty((batch_size, cameras[0].shape[-1])) + if poses_3d is not None: + self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1])) + self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1])) + + self.num_batches = (len(pairs) + batch_size - 1) // batch_size + self.batch_size = batch_size + self.random = np.random.RandomState(random_seed) + self.pairs = pairs + self.shuffle = shuffle + self.pad = pad + self.causal_shift = causal_shift + self.endless = endless + self.state = None + + self.cameras = cameras + self.poses_3d = poses_3d + self.poses_2d = poses_2d + + self.augment = augment + self.kps_left = kps_left + self.kps_right = kps_right + self.joints_left = joints_left + self.joints_right = joints_right + + def num_frames(self): + return self.num_batches * self.batch_size + + def random_state(self): + return self.random + + def set_random_state(self, random): + self.random = random + + def augment_enabled(self): + return self.augment + + def next_pairs(self): + if self.state is None: + if self.shuffle: + pairs = self.random.permutation(self.pairs) + else: + pairs = self.pairs + return 0, pairs + else: + return self.state + + def next_epoch(self): + enabled = True + while enabled: + start_idx, pairs = self.next_pairs() + for b_i in range(start_idx, self.num_batches): + chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size] + for i, 
(seq_i, start_3d, end_3d, flip) in enumerate(chunks): + start_2d = start_3d - self.pad - self.causal_shift + end_2d = end_3d + self.pad - self.causal_shift + + # 2D poses + seq_2d = self.poses_2d[seq_i] + low_2d = max(start_2d, 0) + high_2d = min(end_2d, seq_2d.shape[0]) + pad_left_2d = low_2d - start_2d + pad_right_2d = end_2d - high_2d + if pad_left_2d != 0 or pad_right_2d != 0: + self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), "edge") + else: + self.batch_2d[i] = seq_2d[low_2d:high_2d] + + if flip: + # Flip 2D keypoints + self.batch_2d[i, :, :, 0] *= -1 + self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left] + + # 3D poses + if self.poses_3d is not None: + seq_3d = self.poses_3d[seq_i] + low_3d = max(start_3d, 0) + high_3d = min(end_3d, seq_3d.shape[0]) + pad_left_3d = low_3d - start_3d + pad_right_3d = end_3d - high_3d + if pad_left_3d != 0 or pad_right_3d != 0: + self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), "edge") + else: + self.batch_3d[i] = seq_3d[low_3d:high_3d] + + if flip: + # Flip 3D joints + self.batch_3d[i, :, :, 0] *= -1 + self.batch_3d[i, :, self.joints_left + self.joints_right] = \ + self.batch_3d[i, :, self.joints_right + self.joints_left] + + # Cameras + if self.cameras is not None: + self.batch_cam[i] = self.cameras[seq_i] + if flip: + # Flip horizontal distortion coefficients + self.batch_cam[i, 2] *= -1 + self.batch_cam[i, 7] *= -1 + + if self.endless: + self.state = (b_i + 1, pairs) + if self.poses_3d is None and self.cameras is None: + yield None, None, self.batch_2d[:len(chunks)] + elif self.poses_3d is not None and self.cameras is None: + yield None, self.batch_3d[:len(chunks)], self.batch_2d[:(len(chunks))] + elif self.poses_3d is None: + yield self.batch_cam, None, self.batch_2d[:len(chunks)] + else: + yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)] + + if self.endless: + self.state = None + else: + enabled = False + + +class UnchunkedGenerator: + """ + Non-batched data generator, used for testing. + Sequences are returned one at a time (i.e. batch size = 1), without chunking. + + If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2), + the second of which is a mirrored version of the first. 
+ + Arguments: + cameras -- list of cameras, one element for each video (optional, used for semi-supervised training) + poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training) + poses_2d -- list of input 2D keypoints, one element for each video + pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field) + causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad") + augment -- augment the dataset by flipping poses horizontally + kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled + joints_left and joints_right -- list of left/right 3D joints if flipping is enabled + """ + + def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0, + augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None): + assert poses_3d is None or len(poses_3d) == len(poses_2d) + assert cameras is None or len(cameras) == len(poses_2d) + + self.augment = augment + self.kps_left = kps_left + self.kps_right = kps_right + self.joints_left = joints_left + self.joints_right = joints_right + + self.pad = pad + self.causal_shift = causal_shift + self.cameras = [] if cameras is None else cameras + self.poses_3d = [] if poses_3d is None else poses_3d + self.poses_2d = poses_2d + + def num_frames(self): + count = 0 + for p in self.poses_2d: + count += p.shape[0] + return count + + def augment_enabled(self): + return self.augment + + def set_augment(self, augment): + self.augment = augment + + def next_epoch(self): + for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d): + batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0) + batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0) + batch_2d = np.expand_dims(np.pad(seq_2d, + ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0), + (0, 0)), + 'edge'), axis=0) + if self.augment: + # Append flipped version + if batch_cam is not None: + batch_cam = np.concatenate((batch_cam, batch_cam), axis=0) + batch_cam[1, 2] *= -1 + batch_cam[1, 7] *= -1 + + if batch_3d is not None: + batch_3d = np.concatenate((batch_3d, batch_3d), axis=0) + batch_3d[1, :, :, 0] *= -1 + batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :, + self.joints_right + self.joints_left] + + batch_2d = np.concatenate((batch_2d, batch_2d), axis=0) + batch_2d[1, :, :, 0] *= -1 + batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left] + + yield batch_cam, batch_3d, batch_2d + diff --git a/VideoToNPZ/common/graph_utils.py b/VideoToNPZ/common/graph_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..77ed5da19f51542633cee8ecd292eddff235f982 --- /dev/null +++ b/VideoToNPZ/common/graph_utils.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import + +import torch +import numpy as np +import scipy.sparse as sp + + +def normalize(mx): + """Row-normalize sparse matrix""" + rowsum = np.array(mx.sum(1)) + r_inv = np.power(rowsum, -1).flatten() + r_inv[np.isinf(r_inv)] = 0. 
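+    # rowsum == 0 gives inf in r_inv above; resetting those entries to 0 leaves isolated nodes' rows all-zero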
+ r_mat_inv = sp.diags(r_inv) + mx = r_mat_inv.dot(mx) + return mx + + +def sparse_mx_to_torch_sparse_tensor(sparse_mx): + """Convert a scipy sparse matrix to a torch sparse tensor.""" + sparse_mx = sparse_mx.tocoo().astype(np.float32) + indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) + values = torch.from_numpy(sparse_mx.data) + shape = torch.Size(sparse_mx.shape) + return torch.sparse.FloatTensor(indices, values, shape) + + +def adj_mx_from_edges(num_pts, edges, sparse=True): + edges = np.array(edges, dtype=np.int32) + data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1] + adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32) + + # build symmetric adjacency matrix + adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx) + adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0])) + if sparse: + adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx) + else: + adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float) + return adj_mx + + +def adj_mx_from_skeleton(skeleton): + num_joints = skeleton.num_joints() + edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents()))) + return adj_mx_from_edges(num_joints, edges, sparse=False) diff --git a/VideoToNPZ/common/loss.py b/VideoToNPZ/common/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..128c74881cddb48b365c8aace1f20c91507db5da --- /dev/null +++ b/VideoToNPZ/common/loss.py @@ -0,0 +1,90 @@ +import torch +import numpy as np + + +def mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1)) + + +def p_mpjpe(predicted, target): + """ + Pose error: MPJPE after rigid alignment (scale, rotation, and translation), + often referred to as "Protocol #2" in many papers. + """ + assert predicted.shape == target.shape + + muX = np.mean(target, axis=1, keepdims=True) + muY = np.mean(predicted, axis=1, keepdims=True) + + X0 = target - muX + Y0 = predicted - muY + + normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True)) + normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True)) + + X0 /= normX + Y0 /= normY + + H = np.matmul(X0.transpose(0, 2, 1), Y0) + U, s, Vt = np.linalg.svd(H) + V = Vt.transpose(0, 2, 1) + R = np.matmul(V, U.transpose(0, 2, 1)) + + # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1 + sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1)) + V[:, :, -1] *= sign_detR + s[:, -1] *= sign_detR.flatten() + R = np.matmul(V, U.transpose(0, 2, 1)) # Rotation + + tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2) + + a = tr * normX / normY # Scale + t = muX - a * np.matmul(muY, R) # Translation + + # Perform rigid transformation on the input + predicted_aligned = a * np.matmul(predicted, R) + t + + # Return MPJPE + return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1)) + + +def euclidean_losses(actual, target): + """Calculate the average Euclidean loss for multi-point samples. + + Each sample must contain `n` points, each with `d` dimensions. For example, + in the MPII human pose estimation task n=16 (16 joint locations) and + d=2 (locations are 2D). 
+ + Args: + actual (Tensor): Predictions (B x L x D) + target (Tensor): Ground truth target (B x L x D) + """ + + assert actual.size() == target.size(), 'input tensors must have the same size' + + # Calculate Euclidean distances between actual and target locations + diff = actual - target + dist_sq = diff.pow(2).sum(-1, keepdim=False) + dist = dist_sq.sqrt() + return dist + + +def pck(actual, expected, threshold=150): + dists = euclidean_losses(actual, expected) + return (dists < threshold).double().mean().item() + + +def auc(actual, expected): + # This range of thresholds mimics `mpii_compute_3d_pck.m`, which is provided as part of the + # MPI-INF-3DHP test data release. + thresholds = torch.linspace(0, 150, 31).tolist() + + pck_values = torch.DoubleTensor(len(thresholds)) + for i, threshold in enumerate(thresholds): + pck_values[i] = pck(actual, expected, threshold=threshold) + return pck_values.mean().item() diff --git a/VideoToNPZ/common/quaternion.py b/VideoToNPZ/common/quaternion.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9070eebd176a648f29d565bfa260ede08fa32f --- /dev/null +++ b/VideoToNPZ/common/quaternion.py @@ -0,0 +1,36 @@ +import torch + + +def qort(q, v): + """ + Rotate vector(s) v about the rotation described by quaternion(s) q. + Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v, + where * denotes any number of dimensions. + Returns a tensor of shape (*, 3). + """ + assert q.shape[-1] == 4 + assert v.shape[-1] == 3 + assert q.shape[:-1] == v.shape[:-1] + + qvec = q[..., 1:] + uv = torch.cross(qvec, v, dim=len(q.shape)-1) + uuv = torch.cross(qvec, uv, dim=len(q.shape)-1) + return v + 2 * (q[..., :1] * uv + uuv) + + +def qinverse(q, inplace=False): + # We assume the quaternion to be normalized + """ + The quaternions provided in the code are from the camera coordinate to the world coordinate. + Therefore, the quaternions from the world coordinate to the camera coordinate is the transpose of quaternions from + the camera coordinates to the world coordinate.The precondition is that the quaternion is a unit quaternion. + So the inverse of the quaternions is equal to the transposition of the quaternions. + """ + if inplace: + q[..., 1:] *= -1 + return q + else: + w = q[..., :1] + xyz = q[..., 1:] + return torch.cat((w, -xyz), dim=len(q.shape)-1) + diff --git a/VideoToNPZ/common/skeleton.py b/VideoToNPZ/common/skeleton.py new file mode 100644 index 0000000000000000000000000000000000000000..795bb62763fc4e6a8ab30085777d3b8c1105b88c --- /dev/null +++ b/VideoToNPZ/common/skeleton.py @@ -0,0 +1,81 @@ +import numpy as np + + +class Skeleton: + def __init__(self, parents, joints_left, joints_right): + assert len(joints_left) == len(joints_right) + + self._parents = parents + self._joints_left = joints_left + self._joints_right = joints_right + + def num_joints(self): + return len(self._parents) + + def parents(self): + return self._parents + + def has_children(self): + return self._has_children + + def children(self): + return self._children + + def remove_joints(self, joints_to_remove): + """ + Remove the joints specified in 'joints_to_remove'. 
+ """ + valid_joints = [] + for joint in range(len(self._parents)): + if joint not in joints_to_remove: + valid_joints.append(joint) + + for i in range(len(self._parents)): + while self._parents[i] in joints_to_remove: + self._parents[i] = self._parents[self._parents[i]] + + index_offsets = np.zeros(len(self._parents), dtype=int) + new_parents = [] + for i, parent in enumerate(self._parents): + if i not in joints_to_remove: + new_parents.append(parent - index_offsets[parent]) + else: + index_offsets[i:] += 1 + self._parents = np.array(new_parents) + + if self._joints_left is not None: + new_joints_left = [] + for joint in self._joints_left: + if joint in valid_joints: + new_joints_left.append(joint - index_offsets[joint]) + self._joints_left = new_joints_left + + if self._joints_right is not None: + new_joints_right = [] + for joint in self._joints_right: + if joint in valid_joints: + new_joints_right.append(joint - index_offsets[joint]) + self._joints_right = new_joints_right + + self._compute_metadata() + + return valid_joints + + def joints_left(self): + return self._joints_left + + def joints_right(self): + return self._joints_right + + def _compute_metadata(self): + self._has_children = np.zeros(len(self._parents)).astype(bool) + for i, parent in enumerate(self._parents): + if parent != -1: + self._has_children[parent] = True + + self._children = [] + for parents in enumerate(self._parents): + self._children.append([]) + for i, parent in enumerate(self._parents): + if parent != -1: + self._children[parent].append(i) diff --git a/VideoToNPZ/data/data_utils.py b/VideoToNPZ/data/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..034824105ad4cbe2eb7befb0eb5e05a9b6dc6c9d --- /dev/null +++ b/VideoToNPZ/data/data_utils.py @@ -0,0 +1,95 @@ +import numpy as np +import h5py + +mpii_metadata = { + 'layout_name': 'mpii', + 'num_joints': 16, + 'keypoints_symmetry': [ + [3, 4, 5, 13, 14, 15], + [0, 1, 2, 10, 11, 12], + ] +} + +coco_metadata = { + 'layout_name': 'coco', + 'num_joints': 17, + 'keypoints_symmetry': [ + [1, 3, 5, 7, 9, 11, 13, 15], + [2, 4, 6, 8, 10, 12, 14, 16], + ] +} + +h36m_metadata = { + 'layout_name': 'h36m', + 'num_joints': 17, + 'keypoints_symmetry': [ + [4, 5, 6, 11, 12, 13], + [1, 2, 3, 14, 15, 16], + ] +} + +humaneva15_metadata = { + 'layout_name': 'humaneva15', + 'num_joints': 15, + 'keypoints_symmetry': [ + [2, 3, 4, 8, 9, 10], + [5, 6, 7, 11, 12, 13] + ] +} + +humaneva20_metadata = { + 'layout_name': 'humaneva20', + 'num_joints': 20, + 'keypoints_symmetry': [ + [3, 4, 5, 6, 11, 12, 13, 14], + [7, 8, 9, 10, 15, 16, 17, 18] + ] +} + +def suggest_metadata(name): + names = [] + for metadata in [mpii_metadata, coco_metadata, h36m_metadata, humaneva15_metadata, humaneva20_metadata]: + if metadata['layout_name'] in name: + return metadata + names.append(metadata['layout_name']) + raise KeyError('Cannot infer keypoint layout from name "{}". 
Tried {}.'.format(name, names)) + +def import_detectron_poses(path): + # Latin1 encoding because Detectron runs on Python 2.7 + data = np.load(path, encoding='latin1') + kp = data['keypoints'] + bb = data['boxes'] + results = [] + for i in range(len(bb)): + if len(bb[i][1]) == 0: + assert i > 0 + # Use last pose in case of detection failure + results.append(results[-1]) + continue + best_match = np.argmax(bb[i][1][:, 4]) + keypoints = kp[i][1][best_match].T.copy() + results.append(keypoints) + results = np.array(results) + return results[:, :, 4:6] # Soft-argmax + #return results[:, :, [0, 1, 3]] # Argmax + score + + +def import_cpn_poses(path): + data = np.load(path) + kp = data['keypoints'] + return kp[:, :, :2] + + +def import_sh_poses(path): + with h5py.File(path) as hf: + positions = hf['poses'].value + return positions.astype('float32') + +def suggest_pose_importer(name): + if 'detectron' in name: + return import_detectron_poses + if 'cpn' in name: + return import_cpn_poses + if 'sh' in name: + return import_sh_poses + raise KeyError('Cannot infer keypoint format from name "{}". Tried detectron, cpn, sh.'.format(name)) diff --git a/VideoToNPZ/gen_skes.py b/VideoToNPZ/gen_skes.py new file mode 100644 index 0000000000000000000000000000000000000000..0229cc05c9d0d2a9a5c633eb7598f6b8249e7224 --- /dev/null +++ b/VideoToNPZ/gen_skes.py @@ -0,0 +1,116 @@ +import torch +import sys +import os.path as osp +import os +import argparse +import cv2 +import time +import h5py +from tqdm import tqdm +import numpy as np +import warnings +import signal + +warnings.filterwarnings('ignore') + +sys.path.insert(0, osp.dirname(osp.realpath(__file__))) +from tools.utils import get_path +from model.gast_net import SpatioTemporalModel, SpatioTemporalModelOptimized1f +from common.skeleton import Skeleton +from common.graph_utils import adj_mx_from_skeleton +from common.generators import * +from tools.preprocess import load_kpts_json, h36m_coco_format, revise_kpts, revise_skes +from tools.inference import gen_pose +from tools.vis_kpts import plot_keypoint + +cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__) +model_dir = chk_root + 'gastnet/' +sys.path.insert(1, lib_root) +from lib.pose import gen_video_kpts as hrnet_pose +sys.path.pop(1) +sys.path.pop(0) + +skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16]) +adj = adj_mx_from_skeleton(skeleton) + +joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +kps_left, kps_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] + +# Set up signal handler for keyboard interrupt +def signal_handler(sig, frame): + print("\nInterrupted by user, shutting down...") + if 'pool' in locals() and pool is not None: + pool.terminate() + pool.join() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +def load_model_layer(): + chk = model_dir + '81_frame_model.bin' + filters_width = [3, 3, 3, 3] + channels = 64 + + model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filters_width, channels=channels, dropout=0.05) + + checkpoint = torch.load(chk) + model_pos.load_state_dict(checkpoint['model_pos']) + + if torch.cuda.is_available(): + model_pos = model_pos.cuda() + model_pos = model_pos.eval() + + return model_pos + +def generate_skeletons(video=''): + cap = cv2.VideoCapture(video) + width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) + height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + + keypoints, scores = hrnet_pose(video, 
det_dim=416, gen_output=True) + keypoints, scores, valid_frames = h36m_coco_format(keypoints, scores) + re_kpts = revise_kpts(keypoints, scores, valid_frames) + num_person = len(re_kpts) + + model_pos = load_model_layer() + + pad = (81 - 1) // 2 + causal_shift = 0 + + prediction = gen_pose(re_kpts, valid_frames, width, height, model_pos, pad, causal_shift) + + print('Recording 3D Pose:') + + # Add a loading bar + for i in tqdm(range(100)): + time.sleep(0.01) + + # Create output directory with absolute path + output_dir = os.path.abspath('../outputs/') + print(f"Creating output directory: {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + npz_dir = os.path.join(output_dir, 'npz') + print(f"Creating NPZ directory: {npz_dir}") + os.makedirs(npz_dir, exist_ok=True) + + output_npz = os.path.join(npz_dir, os.path.basename(video).split('.')[0] + '.npz') + print(f"Saving NPZ to: {output_npz}") + np.savez_compressed(output_npz, reconstruction=prediction) + print(f"NPZ saved successfully: {output_npz}") + +def arg_parse(): + parser = argparse.ArgumentParser('Generating skeleton demo.') + parser.add_argument('-v', '--video', type=str) + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = arg_parse() + # Use the video path as-is if absolute, otherwise prepend data_root + if os.path.isabs(args.video): + video_path = args.video + else: + video_path = os.path.join(data_root, 'video', args.video) + generate_skeletons(video=video_path) \ No newline at end of file diff --git a/VideoToNPZ/lib/detector/__init__.py b/VideoToNPZ/lib/detector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..33a2b4d8be482786b3169e2e0b77d5ca6ec2daed --- /dev/null +++ b/VideoToNPZ/lib/detector/__init__.py @@ -0,0 +1,6 @@ +import sys +import os.path as osp + +sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), 'yolov3')) +from human_detector import yolo_human_det, load_model +sys.path.pop(0) \ No newline at end of file diff --git a/VideoToNPZ/lib/detector/yolov3/__init__.py b/VideoToNPZ/lib/detector/yolov3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/detector/yolov3/bbox.py b/VideoToNPZ/lib/detector/yolov3/bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..96818bc485a0ae55c0a4e771a2a2af2f026a2221 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/bbox.py @@ -0,0 +1,111 @@ +from __future__ import division + +import torch +import random +import numpy as np +import cv2 + + +def confidence_filter(result, confidence): + conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) + result = result*conf_mask + + return result + + +def confidence_filter_cls(result, confidence): + max_scores = torch.max(result[:,:,5:25], 2)[0] + res = torch.cat((result, max_scores),2) + print(res.shape) + + + cond_1 = (res[:,:,4] > confidence).float() + cond_2 = (res[:,:,25] > 0.995).float() + + conf = cond_1 + cond_2 + conf = torch.clamp(conf, 0.0, 1.0) + conf = conf.unsqueeze(2) + result = result*conf + return result + + +def get_abs_coord(box): + box[2], box[3] = abs(box[2]), abs(box[3]) + x1 = (box[0] - box[2]/2) - 1 + y1 = (box[1] - box[3]/2) - 1 + x2 = (box[0] + box[2]/2) - 1 + y2 = (box[1] + box[3]/2) - 1 + return x1, y1, x2, y2 + + +def sanity_fix(box): + if (box[0] > box[2]): + box[0], box[2] = box[2], box[0] + + if (box[1] > box[3]): + box[1], box[3] = box[3], box[1] + + return box + + +def bbox_iou(box1, box2): + """ + Returns the IoU 
of two bounding boxes + + """ + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + + # Intersection area + if torch.cuda.is_available(): + inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) + else: + inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) + + # Union Area + b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area) + + return iou + + +def pred_corner_coord(prediction): + #Get indices of non-zero confidence bboxes + ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() + + box = prediction[ind_nz[0], ind_nz[1]] + + box_a = box.new(box.shape) + box_a[:,0] = (box[:,0] - box[:,2]/2) + box_a[:,1] = (box[:,1] - box[:,3]/2) + box_a[:,2] = (box[:,0] + box[:,2]/2) + box_a[:,3] = (box[:,1] + box[:,3]/2) + box[:,:4] = box_a[:,:4] + + prediction[ind_nz[0], ind_nz[1]] = box + + return prediction + + +def write(x, batches, results, colors, classes): + c1 = tuple(x[1:3].int()) + c2 = tuple(x[3:5].int()) + img = results[int(x[0])] + cls = int(x[-1]) + label = "{0}".format(classes[cls]) + color = random.choice(colors) + cv2.rectangle(img, c1, c2,color, 1) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2,color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); + return img diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..ab2c066a216eacbee86e78c28f4d236e5d6b351a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg @@ -0,0 +1,134 @@ +[net] +batch=64 +subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +max_batches = 40200 +policy=steps +steps=-1,100,20000,30000 +scales=.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 
+size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + +[region] +anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 +bias_match=1 +classes=20 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d5bdfc1c5bf2d34885d7614d76d980c90373f89a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=64 +subdivisions=8 +# Training +# batch=64 +# subdivisions=8 +height=416 +width=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 80200 +policy=steps +steps=-1,500,40000,60000 +scales=0.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + + +[region] +anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 +bias_match=1 +classes=20 +coords=4 
+num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg new file mode 100644 index 0000000000000000000000000000000000000000..2a0cd98fbd07c94aa0840c528a12b1b60a004928 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=425 +activation=linear + + +[region] +anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 +bias_match=1 +classes=80 +coords=4 +num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..e94193b0e82e56b3b457f3d8c049ffb9ac7ed1f8 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width= 320 +height = 320 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + 
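+# residual stage: repeated 1x1 (256-filter) bottleneck and 3x3 (512-filter) convolutions, each pair closed by a shortcut connection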
+[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + 
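+# 255 output filters = 3 anchors per scale x (5 box values + 80 COCO classes); this 1x1 conv feeds the first [yolo] detection layer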
+ +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/VideoToNPZ/lib/detector/yolov3/darknet.py b/VideoToNPZ/lib/detector/yolov3/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..7167784f9e070f42bf2c5fb253f38133903b3b4a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/darknet.py @@ -0,0 +1,433 @@ +from __future__ import division + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import cv2 +import os +import sys + +from util import convert2cpu as cpu +from util import predict_transform + + +class test_net(nn.Module): + def __init__(self, num_layers, input_size): + super(test_net, self).__init__() + self.num_layers= num_layers + self.linear_1 = nn.Linear(input_size, 5) + self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)]) + self.output = nn.Linear(5,2) + + def forward(self, x): + x = x.view(-1) + fwd = nn.Sequential(self.linear_1, *self.middle, self.output) + return fwd(x) + + +def get_test_input(): + img = cv2.imread("dog-cycle-car.png") + img = cv2.resize(img, (416, 416)) + img_ = img[:, :, ::-1].transpose((2, 0, 1)) + img_ = img_[np.newaxis, :, :, :]/255.0 + img_ = torch.from_numpy(img_).float() + return img_ + + +def parse_cfg(cfgfile): + """ + Takes a configuration file + + Returns a list of blocks. Each blocks describes a block in the neural + network to be built. 
Block is represented as a dictionary in the list + + """ + # cfgfile = os.path.join(sys.path[-1], cfgfile) + file = open(cfgfile, 'r') + lines = file.read().split('\n') # store the lines in a list + lines = [x for x in lines if len(x) > 0] # get read of the empty lines + lines = [x for x in lines if x[0] != '#'] + lines = [x.rstrip().lstrip() for x in lines] + + block = {} + blocks = [] + + for line in lines: + if line[0] == "[": # This marks the start of a new block + if len(block) != 0: + blocks.append(block) + block = {} + block["type"] = line[1:-1].rstrip() + else: + key,value = line.split("=") + block[key.rstrip()] = value.lstrip() + blocks.append(block) + + return blocks + + +class MaxPoolStride1(nn.Module): + def __init__(self, kernel_size): + super(MaxPoolStride1, self).__init__() + self.kernel_size = kernel_size + self.pad = kernel_size - 1 + + def forward(self, x): + padded_x = F.pad(x, (0, self.pad, 0, self.pad), mode="replicate") + pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x) + return pooled_x + + +class EmptyLayer(nn.Module): + def __init__(self): + super(EmptyLayer, self).__init__() + + +class DetectionLayer(nn.Module): + def __init__(self, anchors): + super(DetectionLayer, self).__init__() + self.anchors = anchors + + def forward(self, x, inp_dim, num_classes, confidence): + x = x.data + global CUDA + prediction = x + prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA) + return prediction + + +class Upsample(nn.Module): + def __init__(self, stride=2): + super(Upsample, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert(x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + ws = stride + hs = stride + x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride) + return x + + +class ReOrgLayer(nn.Module): + def __init__(self, stride=2): + super(ReOrgLayer, self).__init__() + self.stride= stride + + def forward(self, x): + assert(x.data.dim() == 4) + B, C, H, W = x.data.shape + hs = self.stride + ws = self.stride + assert(H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H) + assert(W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(W) + x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(-2, -3).contiguous() + x = x.view(B, C, H // hs * W // ws, hs, ws) + x = x.view(B, C, H // hs * W // ws, hs*ws).transpose(-1, -2).contiguous() + x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1, 2).contiguous() + x = x.view(B, C*ws*hs, H // ws, W // ws) + return x + + +def create_modules(blocks): + net_info = blocks[0] # Captures the information about the input and pre-processing + + module_list = nn.ModuleList() + + index = 0 # indexing blocks helps with implementing route layers (skip connections) + prev_filters = 3 + output_filters = [] + + for x in blocks: + module = nn.Sequential() + if x["type"] == "net": + continue + + # If it's a convolutional layer + if x["type"] == "convolutional": + # Get the info about the layer + activation = x["activation"] + try: + batch_normalize = int(x["batch_normalize"]) + bias = False + except: + batch_normalize = 0 + bias = True + + filters= int(x["filters"]) + padding = int(x["pad"]) + kernel_size = int(x["size"]) + stride = int(x["stride"]) + + if padding: + pad = (kernel_size - 1) // 2 + else: + pad = 0 + + # Add the convolutional layer + conv = 
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias) + module.add_module("conv_{0}".format(index), conv) + + # Add the Batch Norm Layer + if batch_normalize: + bn = nn.BatchNorm2d(filters) + module.add_module("batch_norm_{0}".format(index), bn) + + # Check the activation. + # It is either Linear or a Leaky ReLU for YOLO + if activation == "leaky": + activn = nn.LeakyReLU(0.1, inplace = True) + module.add_module("leaky_{0}".format(index), activn) + + # If it's an upsampling layer + # We use Bilinear2dUpsampling + + elif x["type"] == "upsample": + stride = int(x["stride"]) +# upsample = Upsample(stride) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + module.add_module("upsample_{}".format(index), upsample) + + # If it is a route layer + elif (x["type"] == "route"): + x["layers"] = x["layers"].split(',') + + # Start of a route + start = int(x["layers"][0]) + + # end, if there exists one. + try: + end = int(x["layers"][1]) + except: + end = 0 + + # Positive anotation + if start > 0: + start = start - index + + if end > 0: + end = end - index + + route = EmptyLayer() + module.add_module("route_{0}".format(index), route) + + if end < 0: + filters = output_filters[index + start] + output_filters[index + end] + else: + filters = output_filters[index + start] + + # shortcut corresponds to skip connection + elif x["type"] == "shortcut": + from_ = int(x["from"]) + shortcut = EmptyLayer() + module.add_module("shortcut_{}".format(index), shortcut) + + elif x["type"] == "maxpool": + stride = int(x["stride"]) + size = int(x["size"]) + if stride != 1: + maxpool = nn.MaxPool2d(size, stride) + else: + maxpool = MaxPoolStride1(size) + + module.add_module("maxpool_{}".format(index), maxpool) + + # Yolo is the detection layer + elif x["type"] == "yolo": + mask = x["mask"].split(",") + mask = [int(x) for x in mask] + + anchors = x["anchors"].split(",") + anchors = [int(a) for a in anchors] + anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)] + anchors = [anchors[i] for i in mask] + + detection = DetectionLayer(anchors) + module.add_module("Detection_{}".format(index), detection) + + else: + print("Something I dunno") + assert False + + module_list.append(module) + prev_filters = filters + output_filters.append(filters) + index += 1 + + return (net_info, module_list) + + +class Darknet(nn.Module): + def __init__(self, cfgfile): + super(Darknet, self).__init__() + self.blocks = parse_cfg(cfgfile) + self.net_info, self.module_list = create_modules(self.blocks) + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def get_blocks(self): + return self.blocks + + def get_module_list(self): + return self.module_list + + def forward(self, x, CUDA): + detections = [] + modules = self.blocks[1:] + outputs = {} # We cache the outputs for the route layer + + write = 0 + for i in range(len(modules)): + + module_type = (modules[i]["type"]) + if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool": + + x = self.module_list[i](x) + outputs[i] = x + + elif module_type == "route": + layers = modules[i]["layers"] + layers = [int(a) for a in layers] + + if (layers[0]) > 0: + layers[0] = layers[0] - i + + if len(layers) == 1: + x = outputs[i + (layers[0])] + + else: + if (layers[1]) > 0: + layers[1] = layers[1] - i + + map1 = outputs[i + layers[0]] + map2 = outputs[i + layers[1]] + + x = torch.cat((map1, map2), 1) + outputs[i] = x + + elif module_type == "shortcut": + from_ = int(modules[i]["from"]) + x = outputs[i-1] + 
outputs[i+from_] + outputs[i] = x + + elif module_type == 'yolo': + + anchors = self.module_list[i][0].anchors + # Get the input dimensions + inp_dim = int(self.net_info["height"]) + + # Get the number of classes + num_classes = int(modules[i]["classes"]) + + # Output the result + x = x.data + x = predict_transform(x, inp_dim, anchors, num_classes, CUDA) + + if type(x) == int: + continue + + if not write: + detections = x + write = 1 + else: + detections = torch.cat((detections, x), 1) + + outputs[i] = outputs[i-1] + + try: + return detections + except: + return 0 + + def load_weights(self, weightfile): + # Introduction: https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/ + # Open the weights file + # weightfile = os.path.join(sys.path[-1], weightfile) + fp = open(weightfile, "rb") + + # The first 5 values are header information + # 1. Major version number + # 2. Minor Version Number + # 3. Subversion number + # 4.5 Images seen by the network (during training) + header = np.fromfile(fp, dtype = np.int32, count = 5) + self.header = torch.from_numpy(header) + self.seen = self.header[3] + + # The rest of the values are the weights + # Let's load them up + weights = np.fromfile(fp, dtype = np.float32) + + ptr = 0 + for i in range(len(self.module_list)): + module_type = self.blocks[i + 1]["type"] + + if module_type == "convolutional": + model = self.module_list[i] + try: + batch_normalize = int(self.blocks[i+1]["batch_normalize"]) + except: + batch_normalize = 0 + + conv = model[0] + + if (batch_normalize): + bn = model[1] + + # Get the number of weights of Batch Norm Layer + num_bn_biases = bn.bias.numel() + + # Load the weights + bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + # Cast the loaded weights into dims of model weights. 
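+                    # (The Darknet weight file stores the four batch-norm tensors back to back as
+                    # biases, weights, running_mean and running_var, each num_bn_biases long;
+                    # the view_as calls below reshape each flat slice to the matching module tensor.)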
+ bn_biases = bn_biases.view_as(bn.bias.data) + bn_weights = bn_weights.view_as(bn.weight.data) + bn_running_mean = bn_running_mean.view_as(bn.running_mean) + bn_running_var = bn_running_var.view_as(bn.running_var) + + # Copy the data to model + bn.bias.data.copy_(bn_biases) + bn.weight.data.copy_(bn_weights) + bn.running_mean.copy_(bn_running_mean) + bn.running_var.copy_(bn_running_var) + + else: + # Number of biases + num_biases = conv.bias.numel() + + # Load the weights + conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases]) + ptr = ptr + num_biases + + # reshape the loaded weights according to the dims of the model weights + conv_biases = conv_biases.view_as(conv.bias.data) + + # Finally copy the data + conv.bias.data.copy_(conv_biases) + + # Let us load the weights for the Convolutional layers + num_weights = conv.weight.numel() + + # Do the same as above for weights + conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights]) + ptr = ptr + num_weights + + conv_weights = conv_weights.view_as(conv.weight.data) + conv.weight.data.copy_(conv_weights) diff --git a/VideoToNPZ/lib/detector/yolov3/data/coco.names b/VideoToNPZ/lib/detector/yolov3/data/coco.names new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/data/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/VideoToNPZ/lib/detector/yolov3/data/pallete b/VideoToNPZ/lib/detector/yolov3/data/pallete new file mode 100644 index 0000000000000000000000000000000000000000..25f0143e9c80c98923dac550f6cd52e20a9dbbe6 Binary files /dev/null and b/VideoToNPZ/lib/detector/yolov3/data/pallete differ diff --git a/VideoToNPZ/lib/detector/yolov3/data/voc.names b/VideoToNPZ/lib/detector/yolov3/data/voc.names new file mode 100644 index 0000000000000000000000000000000000000000..8420ab35ede7400974f25836a6bb543024686a0e --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/data/voc.names @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/VideoToNPZ/lib/detector/yolov3/human_detector.py b/VideoToNPZ/lib/detector/yolov3/human_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..9067051bc4bbb309befe82b69f105e6d3895a997 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/human_detector.py @@ -0,0 +1,155 @@ +from __future__ import division +import time +import torch +import numpy as np +import cv2 +import os +import sys +import random +import pickle as pkl +import argparse + +from util import * +from darknet import Darknet +from preprocess import letterbox_image +import preprocess + + +cur_dir = os.path.dirname(os.path.realpath(__file__)) +project_root = os.path.join(cur_dir, '../../../') +chk_root = 
os.path.join(project_root, 'checkpoint/') +data_root = os.path.join(project_root, 'data/') + + +sys.path.insert(0, project_root) +sys.path.pop(0) + + +def prep_image(img, inp_dim): + """ + Prepare image for inputting to the neural network. + + Returns a Variable + """ + ori_img = img + dim = ori_img.shape[1], ori_img.shape[0] + img = cv2.resize(ori_img, (inp_dim, inp_dim)) + img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() + img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) + return img_, ori_img, dim + + +def write(x, img, colors): + x = [int(i) for i in x] + c1 = tuple(x[0:2]) + c2 = tuple(x[2:4]) + + label = 'People {}'.format(0) + color = (0, 0, 255) + cv2.rectangle(img, c1, c2, color, 2) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1) + return img + + +def arg_parse(): + """" + Parse arguements to the detect module + + """ + parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') + parser.add_argument('--confidence', dest='confidence', type=float, default=0.70, + help='Object Confidence to filter predictions') + parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold') + parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. ' + 'Increase to increase accuracy. Decrease to increase speed. (160, 416)') + parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights', help='The path' + 'of model weight file') + parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='weight file') + parser.add_argument('-a', '--animation', action='store_true', help='output animation') + parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path') + parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png', + help='The input video path') + parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. 
[1, 2]') + return parser.parse_args() + + +def load_model(args=None, CUDA=None, inp_dim=416): + if args is None: + args = arg_parse() + + if CUDA is None: + CUDA = torch.cuda.is_available() + + # Set up the neural network + model = Darknet(args.cfg_file) + model.load_weights(args.weight_file) + + model.net_info["height"] = inp_dim + assert inp_dim % 32 == 0 + assert inp_dim > 32 + + # If there's a GPU availible, put the model on GPU + if CUDA: + model.cuda() + + # Set the model in evaluation mode + model.eval() + + return model + + +def yolo_human_det(img, model=None, reso=416, confidence=0.70): + args = arg_parse() + # args.reso = reso + inp_dim = reso + num_classes = 80 + + CUDA = torch.cuda.is_available() + if model is None: + model = load_model(args, CUDA, inp_dim) + + if type(img) == str: + assert os.path.isfile(img), 'The image path does not exist' + img = cv2.imread(img) + + img, ori_img, img_dim = preprocess.prep_image(img, inp_dim) + img_dim = torch.FloatTensor(img_dim).repeat(1, 2) + + with torch.no_grad(): + if CUDA: + img_dim = img_dim.cuda() + img = img.cuda() + output = model(img, CUDA) + output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True) + + if len(output) == 0: + return None, None + + img_dim = img_dim.repeat(output.size(0), 1) + scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1) + + output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2 + output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2 + output[:, 1:5] /= scaling_factor + + for i in range(output.shape[0]): + output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0]) + output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1]) + + bboxs = [] + scores = [] + for i in range(len(output)): + item = output[i] + bbox = item[1:5].cpu().numpy() + # conver float32 to .2f data + bbox = [round(i, 2) for i in list(bbox)] + score = item[5].cpu().numpy() + bboxs.append(bbox) + scores.append(score) + scores = np.expand_dims(np.array(scores), 1) + bboxs = np.array(bboxs) + + return bboxs, scores diff --git a/VideoToNPZ/lib/detector/yolov3/preprocess.py b/VideoToNPZ/lib/detector/yolov3/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..77041f1c85090e0171a080054e4925a66617a49d --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/preprocess.py @@ -0,0 +1,63 @@ +from __future__ import division + +import torch +import numpy as np +import cv2 +from PIL import Image + + +def letterbox_image(img, inp_dim): + '''resize image with unchanged aspect ratio using padding''' + img_w, img_h = img.shape[1], img.shape[0] + w, h = inp_dim + new_w = int(img_w * min(w/img_w, h/img_h)) + new_h = int(img_h * min(w/img_w, h/img_h)) + resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + + canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) + + canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image + + return canvas + + +def prep_image(img, inp_dim): + """ + Prepare image for inputting to the neural network. 
+ + Returns a Variable + """ + if type(img) == str: + orig_im = cv2.imread(img) + else: + orig_im = img + dim = orig_im.shape[1], orig_im.shape[0] + img = (letterbox_image(orig_im, (inp_dim, inp_dim))) + img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() + img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) + return img_, orig_im, dim + + +def prep_image_pil(img, network_dim): + orig_im = Image.open(img) + img = orig_im.convert('RGB') + dim = img.size + img = img.resize(network_dim) + img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) + img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous() + img = img.view(1, 3, *network_dim) + img = img.float().div(255.0) + return img, orig_im, dim + + +def inp_to_image(inp): + inp = inp.cpu().squeeze() + inp = inp * 255 + try: + inp = inp.data.numpy() + except RuntimeError: + inp = inp.numpy() + inp = inp.transpose(1, 2, 0) + + inp = inp[:, :, ::-1] + return inp diff --git a/VideoToNPZ/lib/detector/yolov3/util.py b/VideoToNPZ/lib/detector/yolov3/util.py new file mode 100644 index 0000000000000000000000000000000000000000..18b79ab4cc88d90afc9c40d4aeeadee2f5b2a1b5 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/util.py @@ -0,0 +1,225 @@ +from __future__ import division + +import torch +import numpy as np +import cv2 +import os.path as osp +from bbox import bbox_iou + + +def get_path(cur_file): + cur_dir = osp.dirname(osp.realpath(cur_file)) + project_root = osp.join(cur_dir, '../../../') + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + + return project_root, chk_root, data_root, cur_dir + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters()) + + +def count_learnable_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def convert2cpu(matrix): + if matrix.is_cuda: + return torch.FloatTensor(matrix.size()).copy_(matrix) + else: + return matrix + + +def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): + batch_size = prediction.size(0) + stride = inp_dim // prediction.size(2) + grid_size = inp_dim // stride + bbox_attrs = 5 + num_classes + num_anchors = len(anchors) + + anchors = [(a[0]/stride, a[1]/stride) for a in anchors] + + prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) + prediction = prediction.transpose(1, 2).contiguous() + prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) + + # Sigmoid the centre_X, centre_Y. 
and object confidencce + prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0]) + prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1]) + prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4]) + + # Add the center offsets + grid_len = np.arange(grid_size) + a, b = np.meshgrid(grid_len, grid_len) + + x_offset = torch.FloatTensor(a).view(-1, 1) + y_offset = torch.FloatTensor(b).view(-1, 1) + + if CUDA: + x_offset = x_offset.cuda() + y_offset = y_offset.cuda() + + x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0) + + prediction[:, :, :2] += x_y_offset + + # log space transform height and the width + anchors = torch.FloatTensor(anchors) + + if CUDA: + anchors = anchors.cuda() + + anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) + prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4])*anchors + + # Softmax the class scores + prediction[:, :, 5: 5 + num_classes] = torch.sigmoid((prediction[:, :, 5: 5 + num_classes])) + + prediction[:, :, :4] *= stride + + return prediction + + +def load_classes(namesfile): + fp = open(namesfile, "r") + names = fp.read().split("\n")[:-1] + return names + + +def get_im_dim(im): + im = cv2.imread(im) + w, h = im.shape[1], im.shape[0] + return w, h + + +def unique(tensor): + tensor_np = tensor.cpu().numpy() + unique_np = np.unique(tensor_np) + unique_tensor = torch.from_numpy(unique_np) + + tensor_res = tensor.new(unique_tensor.shape) + tensor_res.copy_(unique_tensor) + return tensor_res + + +# ADD SOFT NMS +def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4, det_hm=False): + """ + https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-4/ + prediction: (B x 10647 x 85) + B: the number of images in a batch, + 10647: the number of bounding boxes predicted per image. (52×52+26×26+13×13)×3=10647 + 85: the number of bounding box attributes. 
(c_x, c_y, w, h, object confidence, and 80 class scores) + + output: Num_obj × [img_index, x_1, y_1, x_2, y_2, object confidence, class_score, label_index] + """ + + conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2) + prediction = prediction*conf_mask + + box_a = prediction.new(prediction.shape) + box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2) + box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2) + box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2) + box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2) + prediction[:, :, :4] = box_a[:, :, :4] + + batch_size = prediction.size(0) + + output = prediction.new(1, prediction.size(2) + 1) + write = False + + for ind in range(batch_size): + # select the image from the batch + image_pred = prediction[ind] + + # Get the class having maximum score, and the index of that class + # Get rid of num_classes softmax scores + # Add the class index and the class score of class having maximum score + max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1) + max_conf = max_conf.float().unsqueeze(1) + max_conf_index = max_conf_index.float().unsqueeze(1) + seq = (image_pred[:, :5], max_conf, max_conf_index) + image_pred = torch.cat(seq, 1) # image_pred:(10647, 7) 7:[x1, y1, x2, y2, obj_score, max_conf, max_conf_index] + + # Get rid of the zero entries + non_zero_ind = (torch.nonzero(image_pred[:, 4])) + image_pred__ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7) + + # filters out people id + if det_hm: + cls_mask = (image_pred__[:, -1] == 0).float() + class_mask_ind = torch.nonzero(cls_mask).squeeze() + image_pred_ = image_pred__[class_mask_ind].view(-1, 7) + + if torch.sum(cls_mask) == 0: + return image_pred_ + else: + image_pred_ = image_pred__ + + # Get the various classes detected in the image + try: + # img_classes = unique(image_pred_[:, -1]) + img_classes = torch.unique(image_pred_[:, -1], sorted=True).float() + except: + continue + + # We will do NMS classwise + # import ipdb;ipdb.set_trace() + for cls in img_classes: + # get the detections with one particular class + cls_mask = image_pred_*(image_pred_[:, -1] == cls).float().unsqueeze(1) + class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze() + image_pred_class = image_pred_[class_mask_ind].view(-1, 7) + + # sort the detections such that the entry with the maximum objectness + # confidence is at the top + conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1] + image_pred_class = image_pred_class[conf_sort_index] + idx = image_pred_class.size(0) + + # from soft_NMS import soft_nms + # boxes = image_pred_class[:,:4] + # scores = image_pred_class[:, 4] + # k, N = soft_nms(boxes, scores, method=2) + # image_pred_class = image_pred_class[k] + + # if nms has to be done + if nms: + # For each detection + for i in range(idx): + # Get the IOUs of all boxes that come after the one we are looking at + # in the loop + try: + ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) + except ValueError: + break + + except IndexError: + break + + # Zero out all the detections that have IoU > threshold + iou_mask = (ious < nms_conf).float().unsqueeze(1) + image_pred_class[i+1:] *= iou_mask + + # Remove the zero entries + non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze() + image_pred_class = image_pred_class[non_zero_ind].view(-1, 7) + + # Concatenate the batch_id of the image to the detection + # this helps us identify which image does the detection correspond to + # We use a 
linear structure to hold ALL the detections from the batch + # the batch_dim is flattened + # batch is identified by extra batch column + + batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) + seq = batch_ind, image_pred_class + if not write: + output = torch.cat(seq, 1) + write = True + else: + out = torch.cat(seq, 1) + output = torch.cat((output, out)) + + return output diff --git a/VideoToNPZ/lib/pose/__init__.py b/VideoToNPZ/lib/pose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..118a33cf7359ce8025294f12f8d83923c2270540 --- /dev/null +++ b/VideoToNPZ/lib/pose/__init__.py @@ -0,0 +1,10 @@ +import sys +import os.path as osp + +sys.path.insert(1, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/pose_estimation')) +from gen_kpts import gen_img_kpts, gen_video_kpts, load_default_model +sys.path.insert(2, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/lib/utils')) +from utilitys import plot_keypoint, write, PreProcess, box_to_center_scale, load_json + +sys.path.pop(1) +sys.path.pop(2) \ No newline at end of file diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16854cf2c48afde13cbf5847a202ee8640b3c982 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git 
a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57101e9069350d171760cb936a19c082165ece03 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c7011c0f1d3f441840e9693e6923c78fe3eab5 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 
'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2844ff61338e40b774656f884e8a370a104f19f7 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 24 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 24 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + 
SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61e8f78a031f104f90e3b6c9a7388de289391fee --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1523c69190c1a496476219577805f153f30310b --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 
'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..580c09fbe890c57b4b8683bbb934724ed7ee1cc7 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..156c576478591edddfae0a0849c80e3d1f2e0420 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 
0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c00e86872fe49a7371c22ad9ef2859bcd6d769e --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caf7726c3cfcfb4a3a7a65029b0ee64f0194d0dc --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + 
NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..894a84457da1e38020aea150b7dd47e2ec49e1bc --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml @@ -0,0 +1,120 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 16 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + MODEL_FILE: '' + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f621dca2e032f19f996ada5a236bb01aebc26e0 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml @@ -0,0 +1,120 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 
'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 16 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + MODEL_FILE: '' + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a48291b00e70771e837f5b50a930ba8018a6b78b --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + 
SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6002ac32675beb3b9b753a110d55382a7f9da7ac --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33db6fbcf6c2e6a33190e221d07e1bcf0735714f --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 
'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/lib/Makefile b/VideoToNPZ/lib/pose/hrnet/lib/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..38cd2978c6fbb09364b579cb62e4d5abc33f80a2 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/Makefile @@ -0,0 +1,4 @@ +all: + cd nms; python setup_linux.py build_ext --inplace; rm -rf build; cd ../../ +clean: + cd nms; rm *.so; cd ../../ diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a44e926b9b71389cb32a727d33d904bfdbcaaffb --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py @@ -0,0 +1,9 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from .default import _C as cfg +from .default import update_config +from .models import MODEL_EXTRAS diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/default.py b/VideoToNPZ/lib/pose/hrnet/lib/config/default.py new file mode 100644 index 0000000000000000000000000000000000000000..030f468ffba91d7e5886783f0971e5e88fc14000 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/default.py @@ -0,0 +1,160 @@ + +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from yacs.config import CfgNode as CN + + +_C = CN() + +_C.OUTPUT_DIR = '' +_C.LOG_DIR = '' +_C.DATA_DIR = '' +_C.GPUS = (0,) +_C.WORKERS = 4 +_C.PRINT_FREQ = 20 +_C.AUTO_RESUME = False +_C.PIN_MEMORY = True +_C.RANK = 0 + +# Cudnn related params +_C.CUDNN = CN() +_C.CUDNN.BENCHMARK = True +_C.CUDNN.DETERMINISTIC = False +_C.CUDNN.ENABLED = True + +# common params for NETWORK +_C.MODEL = CN() +_C.MODEL.NAME = 'pose_hrnet' +_C.MODEL.INIT_WEIGHTS = True +_C.MODEL.PRETRAINED = '' +_C.MODEL.NUM_JOINTS = 17 +_C.MODEL.TAG_PER_JOINT = True +_C.MODEL.TARGET_TYPE = 'gaussian' +_C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256 +_C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32 +_C.MODEL.SIGMA = 2 +_C.MODEL.EXTRA = CN(new_allowed=True) + +_C.LOSS = CN() +_C.LOSS.USE_OHKM = False +_C.LOSS.TOPK = 8 +_C.LOSS.USE_TARGET_WEIGHT = True +_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False + +# DATASET related params +_C.DATASET = CN() +_C.DATASET.ROOT = '' +_C.DATASET.DATASET = 'mpii' +_C.DATASET.TRAIN_SET = 'train' +_C.DATASET.TEST_SET = 'valid' +_C.DATASET.DATA_FORMAT = 'jpg' +_C.DATASET.HYBRID_JOINTS_TYPE = '' +_C.DATASET.SELECT_DATA = False + +# training data augmentation +_C.DATASET.FLIP = True +_C.DATASET.SCALE_FACTOR = 0.25 +_C.DATASET.ROT_FACTOR = 30 +_C.DATASET.PROB_HALF_BODY = 0.0 +_C.DATASET.NUM_JOINTS_HALF_BODY = 8 +_C.DATASET.COLOR_RGB = False + +# train +_C.TRAIN = CN() + +_C.TRAIN.LR_FACTOR = 0.1 +_C.TRAIN.LR_STEP = [90, 110] +_C.TRAIN.LR = 0.001 + +_C.TRAIN.OPTIMIZER = 'adam' +_C.TRAIN.MOMENTUM = 0.9 +_C.TRAIN.WD = 0.0001 +_C.TRAIN.NESTEROV = False +_C.TRAIN.GAMMA1 = 0.99 +_C.TRAIN.GAMMA2 = 0.0 + +_C.TRAIN.BEGIN_EPOCH = 0 +_C.TRAIN.END_EPOCH = 140 + +_C.TRAIN.RESUME = False +_C.TRAIN.CHECKPOINT = '' + +_C.TRAIN.BATCH_SIZE_PER_GPU = 32 +_C.TRAIN.SHUFFLE = True + +# testing +_C.TEST = CN() + +# size of images for each device +_C.TEST.BATCH_SIZE_PER_GPU = 32 +# Test Model Epoch +_C.TEST.FLIP_TEST = False +_C.TEST.POST_PROCESS = False +_C.TEST.SHIFT_HEATMAP = False + +_C.TEST.USE_GT_BBOX = False + +# nms +_C.TEST.IMAGE_THRE = 0.1 +_C.TEST.NMS_THRE = 0.6 +_C.TEST.SOFT_NMS = False +_C.TEST.OKS_THRE = 0.5 +_C.TEST.IN_VIS_THRE = 0.0 +_C.TEST.COCO_BBOX_FILE = '' +_C.TEST.BBOX_THRE = 1.0 +_C.TEST.MODEL_FILE = '' + +# debug +_C.DEBUG = CN() +_C.DEBUG.DEBUG = False +_C.DEBUG.SAVE_BATCH_IMAGES_GT = False +_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False +_C.DEBUG.SAVE_HEATMAPS_GT = False +_C.DEBUG.SAVE_HEATMAPS_PRED = False + + +def update_config(cfg, args): + cfg.defrost() + cfg.merge_from_file(args.cfg) + cfg.merge_from_list(args.opts) + + if args.modelDir: + cfg.OUTPUT_DIR = args.modelDir + + # if args.logDir: + # cfg.LOG_DIR = args.logDir + # + # if args.dataDir: + # cfg.DATA_DIR = args.dataDir + # + # cfg.DATASET.ROOT = os.path.join( + # cfg.DATA_DIR, cfg.DATASET.ROOT + # ) + # + # cfg.MODEL.PRETRAINED = os.path.join( + # cfg.DATA_DIR, cfg.MODEL.PRETRAINED + # ) + # + # if cfg.TEST.MODEL_FILE: + # cfg.TEST.MODEL_FILE = os.path.join( + # cfg.DATA_DIR, cfg.TEST.MODEL_FILE + # ) + + cfg.freeze() + + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'w') as f: + print(_C, file=f) + diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/models.py b/VideoToNPZ/lib/pose/hrnet/lib/config/models.py new file mode 100644 index 
0000000000000000000000000000000000000000..8e04c4f75a42429142131e1fe4cbbd67fbf4acb8 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/models.py @@ -0,0 +1,58 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from yacs.config import CfgNode as CN + + +# pose_resnet related params +POSE_RESNET = CN() +POSE_RESNET.NUM_LAYERS = 50 +POSE_RESNET.DECONV_WITH_BIAS = False +POSE_RESNET.NUM_DECONV_LAYERS = 3 +POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256] +POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4] +POSE_RESNET.FINAL_CONV_KERNEL = 1 +POSE_RESNET.PRETRAINED_LAYERS = ['*'] + +# pose_multi_resoluton_net related params +POSE_HIGH_RESOLUTION_NET = CN() +POSE_HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] +POSE_HIGH_RESOLUTION_NET.STEM_INPLANES = 64 +POSE_HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 + +POSE_HIGH_RESOLUTION_NET.STAGE2 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] +POSE_HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'SUM' + +POSE_HIGH_RESOLUTION_NET.STAGE3 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] +POSE_HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'SUM' + +POSE_HIGH_RESOLUTION_NET.STAGE4 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] +POSE_HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'SUM' + + +MODEL_EXTRAS = { + 'pose_resnet': POSE_RESNET, + 'pose_high_resolution_net': POSE_HIGH_RESOLUTION_NET, +} diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b7f1a709982d59002f07fbb9f42d919d9bee17 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py @@ -0,0 +1,16 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import models.pose_resnet +import models.pose_hrnet diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..09ff346a1b20ca9e9078714132c01123d1b0b4b1 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py @@ -0,0 +1,501 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = 
num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = 
self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg, **kwargs): + self.inplanes = 64 + extra = cfg['MODEL']['EXTRA'] + super(PoseHighResolutionNet, self).__init__() + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=False) + + self.final_layer = nn.Conv2d( + in_channels=pre_stage_channels[0], + out_channels=cfg['MODEL']['NUM_JOINTS'], + kernel_size=extra['FINAL_CONV_KERNEL'], + stride=1, + padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0 + ) + + self.pretrained_layers = extra['PRETRAINED_LAYERS'] + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + 
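+        # Stacks `blocks` residual units of type `block` (used here only for the stem's layer1,
+        # built from 4 Bottleneck units). A 1x1 conv + BatchNorm projection is created as
+        # `downsample` whenever the stride or channel count changes, so the residual addition
+        # below stays shape-compatible.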
downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multi_scale_output and i == num_modules - 1: + reset_multi_scale_output = False + else: + reset_multi_scale_output = True + + modules.append( + HighResolutionModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + fuse_method, + reset_multi_scale_output + ) + ) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['NUM_BRANCHES']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['NUM_BRANCHES']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['NUM_BRANCHES']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + x = self.final_layer(y_list[0]) + + return x + + def init_weights(self, pretrained=''): + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + + if os.path.isfile(pretrained): + pretrained_state_dict = torch.load(pretrained) + logger.info('=> loading pretrained model {}'.format(pretrained)) + + need_init_state_dict = {} + for name, m in pretrained_state_dict.items(): + if name.split('.')[0] in self.pretrained_layers \ + or self.pretrained_layers[0] is '*': + need_init_state_dict[name] = m + self.load_state_dict(need_init_state_dict, strict=False) + elif pretrained: + logger.error('=> please download pre-trained models first!') + raise ValueError('{} is not exist!'.format(pretrained)) + + +def get_pose_net(cfg, is_train, **kwargs): + model = PoseHighResolutionNet(cfg, **kwargs) + + if is_train and 
cfg['MODEL']['INIT_WEIGHTS']: + model.init_weights(cfg['MODEL']['PRETRAINED']) + + return model diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f264dee95cf32ea57e9e1a97952eedd723117249 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py @@ -0,0 +1,271 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, cfg, **kwargs): + self.inplanes = 64 + extra = cfg.MODEL.EXTRA + self.deconv_with_bias = extra.DECONV_WITH_BIAS + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + 
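+        # layer4 completes the ResNet backbone; with Bottleneck blocks (ResNet-50/101/152) its
+        # output has 512 * block.expansion = 2048 channels, which the deconvolutional head
+        # defined just below upsamples back towards heatmap resolution.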
self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layers = self._make_deconv_layer( + extra.NUM_DECONV_LAYERS, + extra.NUM_DECONV_FILTERS, + extra.NUM_DECONV_KERNELS, + ) + + self.final_layer = nn.Conv2d( + in_channels=extra.NUM_DECONV_FILTERS[-1], + out_channels=cfg.MODEL.NUM_JOINTS, + kernel_size=extra.FINAL_CONV_KERNEL, + stride=1, + padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0 + ) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.deconv_layers(x) + x = self.final_layer(x) + + return x + + def init_weights(self, pretrained=''): + if os.path.isfile(pretrained): + logger.info('=> init deconv weights from normal distribution') + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + logger.info('=> init {}.weight as 1'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + logger.info('=> init final conv weights from normal distribution') + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + + pretrained_state_dict = torch.load(pretrained) + logger.info('=> loading pretrained model 
{}'.format(pretrained)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + # nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + + +resnet_spec = { + 18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3]) +} + + +def get_pose_net(cfg, is_train, **kwargs): + num_layers = cfg.MODEL.EXTRA.NUM_LAYERS + + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, cfg, **kwargs) + + if is_train and cfg.MODEL.INIT_WEIGHTS: + model.init_weights(cfg.MODEL.PRETRAINED) + + return model diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..44fe35fa6f06e02f902129e6465a62b480653c94 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py @@ -0,0 +1,51 @@ +import numpy as np + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +spple_keypoints = [10, 8, 0, 7] + + +def coco_h36m(keypoints): + # keypoints: (T, N, 2) or (M, N, 2) + + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32) + htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1] + htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32) + htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3 + + htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32) + htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32) + + keypoints_h36m[:, spple_keypoints, :] = htps_keypoints + keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :] + + keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4 + keypoints_h36m[:, 7, 0] += 0.3*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32)) + keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3 + + # half body: the joint of ankle and knee equal to hip + # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]] + # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]] + return keypoints_h36m + + +h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13] +mpii_order = [i for i in range(16)] +lr_hip_shouler = [2, 3, 12, 13] + + +def mpii_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = 
np.zeros((temporal, 17, 2), dtype=np.float32) + keypoints_h36m[:, h36m_mpii_order] = keypoints + # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32) + keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32) + return keypoints_h36m + + diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..77942ad22ae1ca2b541a0cfd65619f34c8a32bfb --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py @@ -0,0 +1,82 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import sys +import os.path as osp +import numpy as np + +sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), '..')) +from utils.transforms import transform_preds +sys.path.pop(0) + + +def get_max_preds(batch_heatmaps): + ''' + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + ''' + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + + +def get_final_preds(config, batch_heatmaps, center, scale): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if config.TEST.POST_PROCESS: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds( + coords[i], center[i], scale[i], [heatmap_width, heatmap_height] + ) + + return preds, maxvals diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..98ccbd5cde7f98daa87581d183228f0e3b4a4318 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py @@ -0,0 +1,122 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 + + +def flip_back(output_flipped, matched_parts): + ''' + ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width) + ''' + assert output_flipped.ndim == 4,\ + 'output_flipped should be [batch_size, num_joints, height, width]' + + output_flipped = output_flipped[:, :, :, ::-1] + + # 因为你输入的是翻转后的图像,所以输出的热图他们对应的左右关节也是相反的(训练的时候,输入的是翻转后的图像,target对应的左右关节也是对调过来的)。 + for pair in matched_parts: + tmp = output_flipped[:, pair[0], :, :].copy() + output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] + output_flipped[:, pair[1], :, :] = tmp + + return output_flipped + + +def fliplr_joints(joints, joints_vis, width, matched_parts): + """ + flip coords + """ + # Flip horizontal + joints[:, 0] = width - joints[:, 0] - 1 + + # Change left-right parts + for pair in matched_parts: + joints[pair[0], :], joints[pair[1], :] = \ + joints[pair[1], :], joints[pair[0], :].copy() + joints_vis[pair[0], :], joints_vis[pair[1], :] = \ + joints_vis[pair[1], :], joints_vis[pair[0], :].copy() + + return joints*joints_vis, joints_vis + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def crop(img, center, scale, output_size, rot=0): + trans = get_affine_transform(center, scale, rot, output_size) + + dst_img = cv2.warpAffine( + img, trans, (int(output_size[0]), int(output_size[1])), + flags=cv2.INTER_LINEAR + ) + + return dst_img diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py new file mode 100644 index 
0000000000000000000000000000000000000000..fb33b2e09ae4953cd048532bdab28a380eeecbb6 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py @@ -0,0 +1,170 @@ +import cv2 +import sys +import torch +import json +import torchvision.transforms as transforms +import _init_paths +from utils.transforms import * + +from utils.coco_h36m import coco_h36m +import numpy as np + +joint_pairs = [[0, 1], [1, 3], [0, 2], [2, 4], + [5, 6], [5, 7], [7, 9], [6, 8], [8, 10], + [5, 11], [6, 12], [11, 12], + [11, 13], [12, 14], [13, 15], [14, 16]] + +h36m_pairs = [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), + (12, 13), (8, 14), (14, 15), (15, 16)] + +colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ + [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ + [170, 0, 255], [255, 0, 255]] + + +def plot_keypoint(image, coordinates, confidence, keypoint_thresh=0.3): + # USE cv2 + joint_visible = confidence[:, :, 0] > keypoint_thresh + coordinates = coco_h36m(coordinates) + for i in range(coordinates.shape[0]): + pts = coordinates[i] + + for joint in pts: + cv2.circle(image, (int(joint[0]), int(joint[1])), 8, (255, 255, 255), 1) + + for color_i, jp in zip(colors, h36m_pairs): + if joint_visible[i, jp[0]] and joint_visible[i, jp[1]]: + pt0 = pts[jp, 0] + pt1 = pts[jp, 1] + pt0_0, pt0_1, pt1_0, pt1_1 = int(pt0[0]), int(pt0[1]), int(pt1[0]), int(pt1[1]) + + cv2.line(image, (pt0_0, pt1_0), (pt0_1, pt1_1), color_i, 6) + # cv2.circle(image,(pt0_0, pt0_1), 2, color_i, thickness=-1) + # cv2.circle(image,(pt1_0, pt1_1), 2, color_i, thickness=-1) + return image + + +def write(x, img): + x = [int(i) for i in x] + c1 = tuple(x[0:2]) + c2 = tuple(x[2:4]) + + color = [0, 97, 255] + label = 'People {}'.format(x[-1]) + cv2.rectangle(img, c1, c2, color, 2) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, [0, 128, 255], -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1) + return img + + +def load_json(file_path): + with open(file_path, 'r') as fr: + video_info = json.load(fr) + + label = video_info['label'] + label_index = video_info['label_index'] + + num_frames = video_info['data'][-1]['frame_index'] + keypoints = np.zeros((2, num_frames, 17, 2), dtype=np.float32) # (M, T, N, 2) + scores = np.zeros((2, num_frames, 17), dtype=np.float32) # (M, T, N) + + for frame_info in video_info['data']: + frame_index = frame_info['frame_index'] + + for index, skeleton_info in enumerate(frame_info['skeleton']): + pose = skeleton_info['pose'] + score = skeleton_info['score'] + bbox = skeleton_info['bbox'] + + if len(bbox) == 0 or index+1 > 2: + continue + + pose = np.asarray(pose, dtype=np.float32) + score = np.asarray(score, dtype=np.float32) + score = score.reshape(-1) + + keypoints[index, frame_index-1] = pose + scores[index, frame_index-1] = score + + new_kpts = [] + for i in range(keypoints.shape[0]): + kps = keypoints[i] + if np.sum(kps) != 0.: + new_kpts.append(kps) + + new_kpts = np.asarray(new_kpts, dtype=np.float32) + scores = np.asarray(scores, dtype=np.float32) + scores = scores[:, :, :, np.newaxis] + return new_kpts, scores, label, label_index + + +def box_to_center_scale(box, model_image_width, model_image_height): + """convert a box to center,scale information required for pose transformation + Parameters + 
---------- + box : (x1, y1, x2, y2) + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros((2), dtype=np.float32) + x1, y1, x2, y2 = box[:4] + box_width, box_height = x2 - x1, y2 - y1 + + center[0] = x1 + box_width * 0.5 + center[1] = y1 + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + if center[0] != -1: + scale = scale * 1.25 + + return center, scale + + +# Pre-process +def PreProcess(image, bboxs, cfg, num_pos=2): + if type(image) == str: + data_numpy = cv2.imread(image, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) + # data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) + else: + data_numpy = image + + inputs = [] + centers = [] + scales = [] + + for bbox in bboxs[:num_pos]: + c, s = box_to_center_scale(bbox, data_numpy.shape[0], data_numpy.shape[1]) + centers.append(c) + scales.append(s) + r = 0 + + trans = get_affine_transform(c, s, r, cfg.MODEL.IMAGE_SIZE) + input = cv2.warpAffine( + data_numpy, + trans, + (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])), + flags=cv2.INTER_LINEAR) + + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + input = transform(input).unsqueeze(0) + inputs.append(input) + + inputs = torch.cat(inputs) + return inputs, data_numpy, centers, scales diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/__init__.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..f6c9c1200949417dbf48495ae9355d258ef693e4 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py @@ -0,0 +1,35 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path as osp +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +def get_path(cur_file): + cur_dir = osp.dirname(osp.realpath(cur_file)) + pre_dir = osp.join(cur_dir, '..') + project_root = osp.abspath(osp.join(cur_dir, '../../../../')) + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + lib_root = osp.join(project_root, 'lib/') + output_root = osp.join(project_root, 'output/') + + return pre_dir, cur_dir, chk_root, data_root, lib_root, output_root + + +this_dir = osp.dirname(osp.realpath(__file__)) + +lib_path = osp.join(this_dir, '..', 'lib') +add_path(lib_path) diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py new file mode 100644 index 0000000000000000000000000000000000000000..eb227aa2c41bbe1d5513cb2ca5b979609b35d703 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py @@ -0,0 +1,474 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os +import os.path as osp +import argparse +import numpy as np +from tqdm import tqdm +import torch +import torch.backends.cudnn as cudnn +import cv2 +from threading import Thread +from queue import Queue +import json +import torch.multiprocessing as mp +from functools import partial +from io import StringIO + +import _init_paths +from _init_paths import get_path +from utils.utilitys import PreProcess, load_json, plot_keypoint, write +from config import cfg, update_config +from utils.transforms import * +from utils.inference import get_final_preds +import models +sys.path.pop(0) + +pre_dir, cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__) +cfg_dir = pre_dir + '/experiments/coco/hrnet/' +model_dir = chk_root + 'hrnet/pose_coco/' + +sys.path.insert(0, lib_root) +from detector import load_model as yolo_model +from detector import yolo_human_det as yolo_det +from track.sort import Sort +sys.path.pop(0) + +# Set multiprocessing start method +mp.set_start_method('spawn', force=True) + +def parse_args(): + parser = argparse.ArgumentParser(description='Train keypoints network') + parser.add_argument('--cfg', type=str, default=cfg_dir + 'w48_384x288_adam_lr1e-3.yaml') + parser.add_argument('opts', nargs=argparse.REMAINDER, default=None) + parser.add_argument('--modelDir', type=str, default=model_dir + 'pose_hrnet_w48_384x288.pth') + parser.add_argument('--det-dim', type=int, default=416) + parser.add_argument('--thred-score', type=float, default=0.70) + parser.add_argument('-a', '--animation', action='store_true', help='output animation') + parser.add_argument('-np', '--num-person', type=int, default=1) + parser.add_argument("-v", "--video", type=str, default='camera') + parser.add_argument('--batch-size', type=int, default=16) + args = parser.parse_args() + return args + +def reset_config(args): + update_config(cfg, args) + cudnn.benchmark = cfg.CUDNN.BENCHMARK + torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC + torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED + +def model_load(config, use_fp16=False): + model = eval('models.' 
+ config.MODEL.NAME + '.get_pose_net')(config, is_train=False) + state_dict = torch.load(config.OUTPUT_DIR, map_location=torch.device('cpu')) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + if torch.cuda.is_available() and use_fp16: + model = model.half().cuda() + elif torch.cuda.is_available(): + model = model.cuda() + model.eval() + return model + +def load_default_model(): + args = parse_args() + reset_config(args) + model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(cfg, is_train=False) + if torch.cuda.is_available(): + model = model.cuda() + state_dict = torch.load(cfg.OUTPUT_DIR) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + model.eval() + return model + +def frame_loader(video, queue, video_length): + cap = cv2.VideoCapture(video) + for _ in range(video_length): + ret, frame = cap.read() + if not ret: + break + queue.put(frame) + queue.put(None) + cap.release() + +def process_batch(frames, human_model, pose_model, det_dim, num_person, thred_score, use_fp16, device): + if not frames: + return [], [] + + batch_bboxs = [] + batch_centers = [] + batch_scales = [] + batch_inputs = [] + + for frame in frames: + bboxs, _ = yolo_det(frame, human_model, reso=det_dim, confidence=thred_score) + if bboxs is None or not bboxs.any(): + continue + + people_track = Sort().update(bboxs) + if people_track.shape[0] == 0: + continue + people_track_ = people_track[-min(num_person, people_track.shape[0]):, :-1] + track_bboxs = [[round(i, 2) for i in list(bbox)] for bbox in people_track_] + + inputs, _, center, scale = PreProcess(frame, track_bboxs, cfg, len(track_bboxs)) + inputs = inputs[:, [2, 1, 0]] + batch_bboxs.append(track_bboxs) + batch_centers.append(center) + batch_scales.append(scale) + batch_inputs.append(inputs) + + if not batch_inputs: + return [], [] + + inputs = torch.cat(batch_inputs, dim=0).to(device) + if use_fp16: + inputs = inputs.half() + + with torch.no_grad(): + outputs = pose_model(inputs) + outputs = outputs.cpu().float() + + kpts_result = [] + scores_result = [] + offset = 0 + for i, (center, scale) in enumerate(zip(batch_centers, batch_scales)): + batch_size = len(batch_bboxs[i]) + preds, maxvals = get_final_preds(cfg, outputs[offset:offset + batch_size].numpy(), + np.asarray(center).flatten(), np.asarray(scale).flatten()) + offset += batch_size + + kpts = np.zeros((len(preds), 17, 2), dtype=np.float32) + scores = np.zeros((len(preds), 17), dtype=np.float32) + for j, (kpt, score) in enumerate(zip(preds, maxvals)): + kpts[j] = kpt + scores[j] = score.squeeze() + kpts_result.append(kpts) + scores_result.append(scores) + + return kpts_result, scores_result + +def gen_video_kpts(video, det_dim=416, num_person=1, gen_output=False, batch_size=16, animation=False): + args = parse_args() + reset_config(args) + + cap = cv2.VideoCapture(video) + assert cap.isOpened(), 'Cannot capture source' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7 + batch_size = min(batch_size, torch.cuda.get_device_properties(0).total_memory // (1024**3) if device.type == 'cuda' else mp.cpu_count()) + + human_model = yolo_model(inp_dim=det_dim) + pose_model = model_load(cfg, use_fp16=use_fp16).to(device) + people_sort = Sort() + + video_length = 
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + print('Recording 2D pose ...') + import sys + from io import StringIO + + if animation: + # Animation mode uses frame-by-frame processing like in the backup code + kpts_result = [] + scores_result = [] + + for i in tqdm(range(video_length)): + ret, frame = cap.read() + if not ret: + break + + # Detect humans + bboxs, scores = yolo_det(frame, human_model, reso=det_dim, confidence=args.thred_score) + + if bboxs is None or not bboxs.any(): + print('No person detected!') + continue + + # Track people + people_track = people_sort.update(bboxs) + + # Select people to track + if people_track.shape[0] == 1: + people_track_ = people_track[-1, :-1].reshape(1, 4) + elif people_track.shape[0] >= 2: + people_track_ = people_track[-num_person:, :-1].reshape(num_person, 4) + people_track_ = people_track_[::-1] + else: + continue + + track_bboxs = [] + for bbox in people_track_: + bbox = [round(i, 2) for i in list(bbox)] + track_bboxs.append(bbox) + + with torch.no_grad(): + # Preprocess and get pose predictions + inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, num_person) + inputs = inputs[:, [2, 1, 0]] + + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + + # Compute coordinates + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + if gen_output: + # Store results for later processing + kpts = np.zeros((num_person, 17, 2), dtype=np.float32) + scores = np.zeros((num_person, 17), dtype=np.float32) + + for j, kpt in enumerate(preds): + kpts[j] = kpt + + for j, score in enumerate(maxvals): + scores[j] = score.squeeze() + + kpts_result.append(kpts) + scores_result.append(scores) + + else: + # Visualize results in real-time + index_bboxs = [bbox + [j] for j, bbox in enumerate(track_bboxs)] + list(map(lambda x: write(x, frame), index_bboxs)) + plot_keypoint(frame, preds, maxvals, 0.3) + + cv2.imshow('frame', frame) + key = cv2.waitKey(1) + if key & 0xFF == ord('q'): + break + else: + # Optimized batch processing with Queue + old_stdout = sys.stdout + sys.stdout = StringIO() + + frame_queue = mp.Queue(maxsize=batch_size * 2) + loader_thread = Thread(target=frame_loader, args=(video, frame_queue, video_length)) + loader_thread.start() + + # Pre-allocate result arrays + max_frames = video_length + kpts_result = np.zeros((max_frames, num_person, 17, 2), dtype=np.float32) + scores_result = np.zeros((max_frames, num_person, 17), dtype=np.float32) + frame_idx = 0 + + pool = None # Initialize pool outside try block for cleanup + try: + if device.type == 'cuda': + # GPU batch processing + batch_frames = [] + with torch.no_grad(): + for _ in tqdm(range(video_length)): + frame = frame_queue.get() + if frame is None: + break + batch_frames.append(frame) + + if len(batch_frames) >= batch_size: + kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model, + det_dim, num_person, args.thred_score, + use_fp16, device) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + batch_frames = [] + + # Process remaining frames + if batch_frames: + kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model, + det_dim, num_person, args.thred_score, + use_fp16, device) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + 
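+                                # confidences are written with the same (frame, person, joint)
+                                # layout as the keypoints, so both arrays can be trimmed to
+                                # frame_idx and transposed together once the loop finishes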
scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + else: + # CPU batch processing with multiprocessing + pool = mp.Pool(processes=mp.cpu_count()) + process_func = partial(process_batch, human_model=human_model, pose_model=pose_model, + det_dim=det_dim, num_person=num_person, thred_score=args.thred_score, + use_fp16=use_fp16, device=device) + + batch_frames = [] + with torch.no_grad(): + for _ in tqdm(range(video_length)): + frame = frame_queue.get() + if frame is None: + break + batch_frames.append(frame) + if len(batch_frames) >= batch_size: + kpts_batch, scores_batch = process_func(batch_frames) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + batch_frames = [] + + # Process remaining frames + if batch_frames: + kpts_batch, scores_batch = process_func(batch_frames) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + + pool.close() + pool.join() + except KeyboardInterrupt: + print("\nInterrupted by user, shutting down...") + if pool is not None: + pool.terminate() + pool.join() + loader_thread.join() + sys.stdout = old_stdout + sys.exit(0) + + loader_thread.join() + sys.stdout = old_stdout + + if gen_output and kpts_result.any(): + keypoints = kpts_result[:frame_idx].transpose(1, 0, 2, 3) + scores = scores_result[:frame_idx].transpose(1, 0, 2) + return keypoints, scores + return None, None + +def gen_img_kpts(image, human_model, pose_model, human_sort, det_dim=416, num_person=2): + args = parse_args() + reset_config(args) + thred_score = args.thred_score + + bboxs, bbox_scores = yolo_det(image, human_model, reso=det_dim, confidence=thred_score) + if bboxs is None or not bboxs.any(): + return None, None, None + + people_track = human_sort.update(bboxs) + if people_track.shape[0] == 1: + bboxs_track = people_track[-1].reshape(1, 5) + else: + people_track_ = people_track[-num_person:].reshape(num_person, 5) + bboxs_track = people_track_[::-1] + + with torch.no_grad(): + inputs, origin_img, center, scale = PreProcess(image, bboxs_track, cfg, num_person) + inputs = inputs[:, [2, 1, 0]] + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + kpts = np.zeros((num_person, 17, 2), dtype=np.float32) + scores = np.zeros((num_person, 17, 1), dtype=np.float32) + for i, kpt in enumerate(preds): + kpts[i] = kpt + for i, score in enumerate(maxvals): + scores[i] = score + + human_indexes = [bboxs_track[i, -1] for i in range(len(bboxs_track))] + return kpts, scores, human_indexes + +def generate_ntu_kpts_json(video_path, kpts_file): + args = parse_args() + reset_config(args) + + human_model = yolo_model() + pose_model = model_load(cfg) + people_sort = Sort() + + with torch.no_grad(): + cap = cv2.VideoCapture(video_path) + video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + kpts_info = dict() + data = [] + + for i in tqdm(range(video_length), unit="%", ncols=100): + frame_info = {'frame_index': i + 1} + ret, frame = cap.read() + if not ret: + continue + + try: + bboxs, scores = yolo_det(frame, human_model, confidence=args.thred_score) + if bboxs is None or not bboxs.any(): + print('No person detected!') + continue + + 
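+                # SORT links the current detections to existing tracks so person identities stay
+                # consistent across frames; at most the two most recent tracks are kept below,
+                # since the NTU-style output assumes up to two performers per clip.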
people_track = people_sort.update(bboxs) + if people_track.shape[0] == 1: + people_track_ = people_track[-1, :-1].reshape(1, 4) + elif people_track.shape[0] >= 2: + people_track_ = people_track[-2:, :-1].reshape(2, 4) + people_track_ = people_track_[::-1] + else: + skeleton = {'skeleton': [{'pose': [], 'score': [], 'bbox': []}]} + frame_info.update(skeleton) + data.append(frame_info) + continue + + track_bboxs = [] + for bbox in people_track_: + bbox = [round(i, 3) for i in list(bbox)] + track_bboxs.append(bbox) + + except Exception as e: + print(e) + continue + + inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, args.num_person) + inputs = inputs[:, [2, 1, 0]] + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + skeleton = [] + for num, bbox in enumerate(track_bboxs): + pose = preds[num].tolist() + score = maxvals[num].tolist() + pose = round_list(pose) + score = round_list(score) + one_skeleton = {'pose': pose, 'score': score, 'bbox': bbox} + skeleton.append(one_skeleton) + + frame_info.update({'skeleton': skeleton}) + data.append(frame_info) + + kpts_info.update({'data': data}) + with open(kpts_file, 'w') as fw: + json.dump(kpts_info, fw) + +def round_list(input_list, decimals=3): + dim = len(input_list) + for i in range(dim): + for j in range(len(input_list[i])): + input_list[i][j] = round(input_list[i][j], decimals) + return input_list + +if __name__ == "__main__": + args = parse_args() + video_path = args.video + + if args.animation: + # Real-time animation mode + gen_video_kpts(video_path, det_dim=args.det_dim, num_person=args.num_person, + gen_output=False, animation=True) + else: + # Process and save keypoints + keypoints, scores = gen_video_kpts(video_path, det_dim=args.det_dim, + num_person=args.num_person, + gen_output=True, + batch_size=args.batch_size) + if keypoints is not None: + output_file = "output.npz" + np.savez(output_file, keypoints=keypoints, scores=scores) + print(f"Saved to {output_file}") \ No newline at end of file diff --git a/VideoToNPZ/lib/track/sort.py b/VideoToNPZ/lib/track/sort.py new file mode 100644 index 0000000000000000000000000000000000000000..1f12f8fd56fa409f885099044199e642d00690aa --- /dev/null +++ b/VideoToNPZ/lib/track/sort.py @@ -0,0 +1,231 @@ +""" + https://arxiv.org/abs/1602.00763 +""" +from __future__ import print_function + +from numba import jit +import os.path +import numpy as np +from skimage import io +from scipy.optimize import linear_sum_assignment +import argparse +from filterpy.kalman import KalmanFilter + + +@jit +def iou(bb_test, bb_gt): + """ + Computes IUO between two bboxes in the form [x1,y1,x2,y2] + """ + xx1 = np.maximum(bb_test[0], bb_gt[0]) + yy1 = np.maximum(bb_test[1], bb_gt[1]) + xx2 = np.minimum(bb_test[2], bb_gt[2]) + yy2 = np.minimum(bb_test[3], bb_gt[3]) + w = np.maximum(0., xx2 - xx1) + h = np.maximum(0., yy2 - yy1) + wh = w * h + o = wh / ((bb_test[2] - bb_test[0]) * (bb_test[3] - bb_test[1]) + + (bb_gt[2] - bb_gt[0]) * (bb_gt[3] - bb_gt[1]) - wh) + + return o + + +def convert_bbox_to_z(bbox): + """ + Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form + [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is + the aspect ratio + """ + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = bbox[0] + w / 2. + y = bbox[1] + h / 2. 
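# NOTE: worked example for the iou() helper above (corner format [x1, y1, x2, y2]),
# not code from this repository. Boxes [0, 0, 10, 10] and [5, 5, 15, 15] overlap
# in a 5 x 5 patch; each has area 100, so IoU = 25 / (100 + 100 - 25) ~= 0.143.
def iou_xyxy(a, b):
    xx1, yy1 = max(a[0], b[0]), max(a[1], b[1])
    xx2, yy2 = min(a[2], b[2]), min(a[3], b[3])
    w, h = max(0.0, xx2 - xx1), max(0.0, yy2 - yy1)
    inter = w * h
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

assert abs(iou_xyxy([0, 0, 10, 10], [5, 5, 15, 15]) - 25 / 175) < 1e-9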
+ s = w * h # scale is just area + r = w / float(h) + return np.array([x, y, s, r]).reshape((4, 1)) + + +def convert_x_to_bbox(x, score=None): + """ + Takes a bounding box in the centre form [x,y,s,r] and returns it in the form + [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right + """ + w = np.sqrt(x[2] * x[3]) + h = x[2] / w + if (score == None): + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) + else: + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score]).reshape((1, 5)) + + +class KalmanBoxTracker(object): + """ + This class represents the internel state of individual tracked objects observed as bbox. + """ + count = 0 + + def __init__(self, bbox): + """ + Initialises a tracker using initial bounding box. + """ + # define constant velocity model + self.kf = KalmanFilter(dim_x=7, dim_z=4) + self.kf.F = np.array( + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]]) + self.kf.H = np.array( + [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]]) + + self.kf.R[2:, 2:] *= 10. + self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities + self.kf.P *= 10. + self.kf.Q[-1, -1] *= 0.01 + self.kf.Q[4:, 4:] *= 0.01 + + self.kf.x[:4] = convert_bbox_to_z(bbox) + self.time_since_update = 0 + self.id = KalmanBoxTracker.count + KalmanBoxTracker.count += 1 + self.history = [] + self.hits = 0 + self.hit_streak = 0 + self.age = 0 + + def update(self, bbox): + """ + Updates the state vector with observed bbox. + """ + self.time_since_update = 0 + self.history = [] + self.hits += 1 + self.hit_streak += 1 + self.kf.update(convert_bbox_to_z(bbox)) + + def predict(self): + """ + Advances the state vector and returns the predicted bounding box estimate. + """ + if ((self.kf.x[6] + self.kf.x[2]) <= 0): + self.kf.x[6] *= 0.0 + self.kf.predict() + self.age += 1 + if (self.time_since_update > 0): + self.hit_streak = 0 + self.time_since_update += 1 + self.history.append(convert_x_to_bbox(self.kf.x)) + return self.history[-1] + + def get_state(self): + """ + Returns the current bounding box estimate. 
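# NOTE: illustrative sketch, not code from this repository. convert_bbox_to_z()
# and convert_x_to_bbox() above are inverses: [x1, y1, x2, y2] <-> [cx, cy, s, r]
# with s = w * h (area) and r = w / h (aspect ratio). For [0, 0, 10, 20]:
# cx=5, cy=10, s=200, r=0.5, and w = sqrt(s * r) = 10, h = s / w = 20 recovers it.
import numpy as np

def to_z(b):
    w, h = b[2] - b[0], b[3] - b[1]
    return np.array([b[0] + w / 2., b[1] + h / 2., w * h, w / float(h)])

def to_bbox(z):
    w = np.sqrt(z[2] * z[3])
    h = z[2] / w
    return np.array([z[0] - w / 2., z[1] - h / 2., z[0] + w / 2., z[1] + h / 2.])

assert np.allclose(to_bbox(to_z([0., 0., 10., 20.])), [0., 0., 10., 20.])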
+ """ + return convert_x_to_bbox(self.kf.x) + + +def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3): + """ + Assigns detections to tracked object (both represented as bounding boxes) + + Returns 3 lists of matches, unmatched_detections and unmatched_trackers + """ + if (len(trackers) == 0): + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + iou_matrix = np.zeros((len(detections), len(trackers)), dtype=np.float32) + + for d, det in enumerate(detections): + for t, trk in enumerate(trackers): + iou_matrix[d, t] = iou(det, trk) + matched_indices = linear_sum_assignment(-iou_matrix) + matched_indices = np.asarray(matched_indices) + matched_indices = matched_indices.transpose() + + unmatched_detections = [] + for d, det in enumerate(detections): + if (d not in matched_indices[:, 0]): + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if (t not in matched_indices[:, 1]): + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if (iou_matrix[m[0], m[1]] < iou_threshold): + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if (len(matches) == 0): + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) + + +class Sort(object): + def __init__(self, max_age=1, min_hits=3): + """ + Sets key parameters for SORT + """ + self.max_age = max_age + self.min_hits = min_hits + self.trackers = [] + self.frame_count = 0 + + def update(self, dets): + """ + Params: + dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...] + Requires: this method must be called once for each frame even with empty detections. + Returns the a similar array, where the last column is the object ID. + + NOTE: The number of objects returned may differ from the number of detections provided. + """ + self.frame_count += 1 + # get predicted locations from existing trackers. 
+ trks = np.zeros((len(self.trackers), 5)) + to_del = [] + ret = [] + for t, trk in enumerate(trks): + pos = self.trackers[t].predict()[0] + trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] + if np.any(np.isnan(pos)): + to_del.append(t) + trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) + for t in reversed(to_del): + self.trackers.pop(t) + matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks) + + # update matched trackers with assigned detections + for t, trk in enumerate(self.trackers): + if t not in unmatched_trks: + d = matched[np.where(matched[:, 1] == t)[0], 0] # d: [n] + trk.update(dets[d, :][0]) + + # create and initialise new trackers for unmatched detections + for i in unmatched_dets: + trk = KalmanBoxTracker(dets[i, :]) + self.trackers.append(trk) + i = len(self.trackers) + for trk in reversed(self.trackers): + d = trk.get_state()[0] + if ((trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits)): + ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) # +1 as MOT benchmark requires positive + i -= 1 + # remove dead tracklet + if (trk.time_since_update > self.max_age): + self.trackers.pop(i) + if (len(ret) > 0): + return np.concatenate(ret) + return np.empty((0, 5)) + + +def parse_args(): + """Parse input arguments.""" + parser = argparse.ArgumentParser(description='SORT demo') + parser.add_argument('--display', dest='display', help='Display online tracker output (slow) [False]', + action='store_true') + args = parser.parse_args() + return args diff --git a/VideoToNPZ/model/gast_net.py b/VideoToNPZ/model/gast_net.py new file mode 100644 index 0000000000000000000000000000000000000000..c066088e9f34d87a656f8dc54e17510aae94af15 --- /dev/null +++ b/VideoToNPZ/model/gast_net.py @@ -0,0 +1,285 @@ +import torch +from torchsummary import summary +import torch.nn as nn +from model.local_attention import LocalGraph +from model.global_attention import MultiGlobalGraph, SingleGlobalGraph + + +class GraphAttentionBlock(nn.Module): + def __init__(self, adj, input_dim, output_dim, p_dropout): + super(GraphAttentionBlock, self).__init__() + + hid_dim = output_dim + self.relu = nn.ReLU(inplace=True) + + self.local_graph_layer = LocalGraph(adj, input_dim, hid_dim, p_dropout) + self.global_graph_layer = MultiGlobalGraph(adj, input_dim, input_dim//4, dropout=p_dropout) + # self.global_graph_layer = SingleGlobalGraph(adj, input_dim, output_dim) + + self.cat_conv = nn.Conv2d(3*output_dim, 2*output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(2*output_dim, momentum=0.1) + + def forward(self, x): + # x: (B, C, T, N) --> (B, T, N, C) + x = x.permute(0, 2, 3, 1) + residual = x + x_ = self.local_graph_layer(x) + y_ = self.global_graph_layer(x) + x = torch.cat((residual, x_, y_), dim=-1) + + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.cat_bn(self.cat_conv(x))) + return x + + +class SpatioTemporalModelBase(nn.Module): + """ + Do not instantiate this class. 
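# NOTE: illustrative usage sketch, not code from this repository. Sort.update()
# must be called once per frame with an (N, 5) array of [x1, y1, x2, y2, score]
# detections and returns a (K, 5) array whose last column is the track ID.
# The import path is an assumption (VideoToNPZ/lib/track on sys.path); the module
# itself needs filterpy, numba and scikit-image installed.
import numpy as np
from sort import Sort

tracker = Sort(max_age=1, min_hits=3)
for t in range(5):
    # one detection drifting to the right; in the pipeline this comes from yolo_det()
    dets = np.array([[10. + 2 * t, 10., 60. + 2 * t, 120., 0.9]])
    tracks = tracker.update(dets)    # (K, 5): [x1, y1, x2, y2, track_id]
    print(t, tracks)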
+ """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal, dropout, channels): + super().__init__() + + # Validate input + for fw in filter_widths: + assert fw % 2 != 0, 'Only odd filter widths are supported' + + self.num_joints_in = num_joints_in + self.in_features = in_features + self.num_joints_out = num_joints_out + self.filter_widths = filter_widths + + self.drop = nn.Dropout(dropout) + self.relu = nn.ReLU(inplace=True) + + self.pad = [filter_widths[0] // 2] + self.init_bn = nn.BatchNorm2d(in_features, momentum=0.1) + self.expand_bn = nn.BatchNorm2d(channels, momentum=0.1) + self.shrink = nn.Conv2d(2**len(self.filter_widths)*channels, 3, 1, bias=False) + + def receptive_field(self): + """ + Return the total receptive field of this model as # of frames. + """ + frames = 0 + for f in self.pad: + frames += f + return 1 + 2 * frames + + def total_causal_shift(self): + """ + Return the asymmetric offset for sequence padding. + The returned value is typically 0 if causal convolutions are disabled, + otherwise it is half the receptive field. + """ + frames = self.causal_shift[0] + next_dilation = self.filter_widths[0] + for i in range(1, len(self.filter_widths)): + frames += self.causal_shift[i] * next_dilation + next_dilation *= self.filter_widths[i] + return frames + + def forward(self, x): + """ + X: (B, C, T, N) + B: batchsize + T: Temporal + N: The number of keypoints + C: The feature dimension of keypoints + """ + + assert len(x.shape) == 4 + assert x.shape[-2] == self.num_joints_in + assert x.shape[-1] == self.in_features + + # X: (B, T, N, C) + x = self._forward_blocks(x) + x = self.shrink(x) + + # x: (B, C, T, N) --> (B, T, N, C) + x = x.permute(0, 2, 3, 1) + + return x + + +class SpatioTemporalModel(SpatioTemporalModelBase): + """ + Reference 3D pose estimation model with temporal convolutions. + This implementation can be used for all use-cases. + """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal=False, dropout=0.25, channels=64, dense=False): + """ + Initialize this model. + + Arguments: + num_joints_in -- number of input joints (e.g. 
17 for Human3.6M) + in_features -- number of input features for each joint (typically 2 for 2D input) + num_joints_out -- number of output joints (can be different than input) + filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field + causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) + dropout -- dropout probability + channels -- number of convolution channels + dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment) + """ + super().__init__(adj, num_joints_in, in_features, num_joints_out, filter_widths, causal, dropout, channels) + + self.expand_conv = nn.Conv2d(in_features, channels, (filter_widths[0], 1), bias=False) + nn.init.kaiming_normal_(self.expand_conv.weight) + + layers_conv = [] + layers_graph_conv = [] + layers_bn = [] + + layers_graph_conv.append(GraphAttentionBlock(adj, channels, channels, p_dropout=dropout)) + + self.causal_shift = [(filter_widths[0]) // 2 if causal else 0] + next_dilation = filter_widths[0] + for i in range(1, len(filter_widths)): + self.pad.append((filter_widths[i] - 1) * next_dilation // 2) + self.causal_shift.append((filter_widths[i] // 2 * next_dilation) if causal else 0) + + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, (filter_widths[i], 1) if not dense else (2*self.pad[-1]+1, 1), + dilation=(next_dilation, 1) if not dense else (1, 1), bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, 1, dilation=1, bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + + layers_graph_conv.append(GraphAttentionBlock(adj, 2**i*channels, 2**i*channels, p_dropout=dropout)) + + next_dilation *= filter_widths[i] + + self.layers_conv = nn.ModuleList(layers_conv) + self.layers_bn = nn.ModuleList(layers_bn) + self.layers_graph_conv = nn.ModuleList(layers_graph_conv) + + def _forward_blocks(self, x): + + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.init_bn(x) + x = self.relu(self.expand_bn(self.expand_conv(x))) + x = self.layers_graph_conv[0](x) + + for i in range(len(self.pad) - 1): + pad = self.pad[i + 1] + shift = self.causal_shift[i + 1] + res = x[:, :, pad + shift: x.shape[2] - pad + shift] + + # x: (B, C, T, N) + x = self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))) + x = res + self.drop(self.relu(self.layers_bn[2 * i + 1](self.layers_conv[2 * i + 1](x)))) + + x = self.layers_graph_conv[i + 1](x) + return x + + +class SpatioTemporalModelOptimized1f(SpatioTemporalModelBase): + """ + 3D pose estimation model optimized for single-frame batching, i.e. + where batches have input length = receptive field, and output length = 1. + This scenario is only used for training when stride == 1. + + This implementation replaces dilated convolutions with strided convolutions + to avoid generating unused intermediate results. The weights are interchangeable + with the reference implementation. + """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal=False, dropout=0.25, channels=64): + """ + Initialize this model. + + Arguments: + num_joints_in -- number of input joints (e.g. 
17 for Human3.6M) + in_features -- number of input features for each joint (typically 2 for 2D input) + num_joints_out -- number of output joints (can be different than input) + filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field + causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) + dropout -- dropout probability + channels -- number of convolution channels + """ + super().__init__(adj, num_joints_in, in_features, num_joints_out, filter_widths, causal, dropout, channels) + + self.expand_conv = nn.Conv2d(in_features, channels, (filter_widths[0], 1), stride=(filter_widths[0], 1), bias=False) + nn.init.kaiming_normal_(self.expand_conv.weight) + + layers_conv = [] + layers_graph_conv = [] + layers_bn = [] + + layers_graph_conv.append(GraphAttentionBlock(adj, channels, channels, p_dropout=dropout)) + + self.causal_shift = [(filter_widths[0] // 2) if causal else 0] + next_dilation = filter_widths[0] + for i in range(1, len(filter_widths)): + self.pad.append((filter_widths[i] - 1) * next_dilation // 2) + self.causal_shift.append((filter_widths[i] // 2) if causal else 0) + + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, (filter_widths[i], 1), stride=(filter_widths[i], 1), bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, 1, dilation=1, bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + + layers_graph_conv.append(GraphAttentionBlock(adj, 2**i*channels, 2**i*channels, p_dropout=dropout)) + + next_dilation *= filter_widths[i] + + self.layers_conv = nn.ModuleList(layers_conv) + self.layers_bn = nn.ModuleList(layers_bn) + self.layers_graph_conv = nn.ModuleList(layers_graph_conv) + + def _forward_blocks(self, x): + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.init_bn(x) + x = self.relu(self.expand_bn(self.expand_conv(x))) + x = self.layers_graph_conv[0](x) + + for i in range(len(self.pad) - 1): + res = x[:, :, self.causal_shift[i+1] + self.filter_widths[i+1]//2 :: self.filter_widths[i+1]] + + # x: (B, C, T, N) + x = self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))) + x = res + self.drop(self.relu(self.layers_bn[2 * i + 1](self.layers_conv[2 * i + 1](x)))) + + x = self.layers_graph_conv[i+1](x) + + return x + + +if __name__ == "__main__": + import torch + import numpy as np + import torchsummary + from common.skeleton import Skeleton + from common.graph_utils import adj_mx_from_skeleton + + h36m_skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23], + joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31]) + + humaneva_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 1, 5, 6, 0, 8, 9, 0, 11, 12, 1], + joints_left=[2, 3, 4, 8, 9, 10], + joints_right=[5, 6, 7, 11, 12, 13]) + + adj = adj_mx_from_skeleton(h36m_skeleton) + model = SpatioTemporalModel(adj, num_joints_in=17, in_features=2, num_joints_out=17, + filter_widths=[3, 3, 3], channels=128) + model = model.cuda() + + model_params = 0 + + for parameter in model.parameters(): + model_params += parameter.numel() + + print('INFO: Trainable parameter count:', model_params) + input = torch.randn(2, 27, 17, 2) + input = input.cuda() + + # summary(model, (27, 15, 2)) + output = model(input) + print(output.shape) diff --git a/VideoToNPZ/model/global_attention.py b/VideoToNPZ/model/global_attention.py new 
file mode 100644 index 0000000000000000000000000000000000000000..f188bb6cfec4bd0a93d5fd58220471b6e5ee53cc --- /dev/null +++ b/VideoToNPZ/model/global_attention.py @@ -0,0 +1,173 @@ +from __future__ import absolute_import, division + +import torch +from torch import nn + + +class GlobalGraph(nn.Module): + """" + Global graph attention layer + """ + + def __init__(self, adj, in_channels, inter_channels=None): + super(GlobalGraph, self).__init__() + + self.adj = adj + self.in_channels = in_channels + self.inter_channels = inter_channels + + self.softmax = nn.Softmax(dim=-1) + self.relu = nn.ReLU(inplace=True) + self.leakyrelu = nn.LeakyReLU(0.2) + + if self.inter_channels == self.in_channels // 2: + self.g_channels = self.in_channels + else: + self.g_channels = self.inter_channels + + assert self.inter_channels > 0 + + self.g = nn.Conv1d(in_channels=self.in_channels, out_channels=self.g_channels, + kernel_size=1, stride=1, padding=0) + self.theta = nn.Conv1d(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + self.phi = nn.Conv1d(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + + adj_shape = self.adj.shape + self.C_k = nn.Parameter(torch.zeros(adj_shape, dtype=torch.float)) + + self.concat_project = nn.Sequential( + nn.Conv2d(self.inter_channels * 2, 1, 1, 1, 0, bias=False), + ) + + nn.init.kaiming_normal_(self.concat_project[0].weight) + nn.init.kaiming_normal_(self.g.weight) + nn.init.constant_(self.g.bias, 0) + nn.init.kaiming_normal_(self.theta.weight) + nn.init.constant_(self.theta.bias, 0) + nn.init.kaiming_normal_(self.phi.weight) + nn.init.constant_(self.phi.bias, 0) + + def forward(self, x): + batch_size = x.size(0) # x: (B*T, C, N) + + # g_x: (B*T, N, C/k) + g_x = self.g(x).view(batch_size, self.g_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # (B*T, C/k, N, 1) + theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1) + # (B*T, C/k, 1, N) + phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1) + + # h: N, w: N + h = theta_x.size(2) + w = phi_x.size(3) + theta_x = theta_x.expand(-1, -1, -1, w) # (B*T, C/k, N, N) + phi_x = phi_x.expand(-1, -1, h, -1) + + # concat_feature: (B*T, C/k, N, N) + concat_feature = torch.cat([theta_x, phi_x], dim=1) + f = self.concat_project(concat_feature) # (B*T, 1, N, N) + b, _, h, w = f.size() + attention = self.leakyrelu(f.view(b, h, w)) # (B*T, N, N) attention:B_k + + attention = torch.add(self.softmax(attention), self.C_k) + # y: (B*T, C/k, N) + y = torch.matmul(attention, g_x) + y = y.permute(0, 2, 1).contiguous() + y = y.view(batch_size, self.g_channels, *x.size()[2:]) + + return y + + +class MultiGlobalGraph(nn.Module): + def __init__(self, adj, in_channels, inter_channels, dropout=None): + super(MultiGlobalGraph, self).__init__() + + self.num_non_local = in_channels // inter_channels + + attentions = [GlobalGraph(adj, in_channels, inter_channels) for _ in range(self.num_non_local)] + self.attentions = nn.ModuleList(attentions) + + self.cat_conv = nn.Conv2d(in_channels, in_channels, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(in_channels, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, x): + # x: (B, T, K, C) --> (B*T, K, C) + x_size = x.shape + x = x.contiguous() + x = x.view(-1, *x_size[2:]) + # x: (B*T, C, K) + x = x.permute(0, 2, 1) + + x = torch.cat([self.attentions[i](x) for i in 
range(len(self.attentions))], dim=1) + + # x: (B*T, C, K) --> (B*T, K, C) + x = x.permute(0, 2, 1).contiguous() + + # x = torch.matmul(x, self.W) + # x: (B*T, K, C) --> (B, T, K, C) + x = x.view(*x_size) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.cat_bn(self.cat_conv(x))) + + if self.dropout is not None: + x = self.dropout(x) + + # x: (B, C, T, K) --> (B, T, K, C) + x = x.permute(0, 2, 3, 1) + + return x + + +class SingleGlobalGraph(nn.Module): + def __init__(self, adj, in_channels, output_channels, dropout=None): + super(SingleGlobalGraph, self).__init__() + + self.attentions = GlobalGraph(adj, in_channels, output_channels//2) + self.bn = nn.BatchNorm2d(in_channels, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, x): + # x: (B, T, K, C) --> (B*T, K, C) + x_size = x.shape + x = x.contiguous() + x = x.view(-1, *x_size[2:]) + # x: (B*T, C, K) + x = x.permute(0, 2, 1) + + x = self.attentions(x) + + # x: (B*T, C, K) --> (B*T, K, C) + x = x.permute(0, 2, 1).contiguous() + + # x = torch.matmul(x, self.W) + # x: (B*T, K, C) --> (B, T, K, C) + x = x.view(*x_size) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.bn(x)) + + if self.dropout is not None: + x = self.dropout(x) + + # x: (B, C, T, K) --> (B, T, K, C) + x = x.permute(0, 2, 3, 1) + + return x diff --git a/VideoToNPZ/model/local_attention.py b/VideoToNPZ/model/local_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..ceba45dac4b8c89e7117f3bf13761392c0869f32 --- /dev/null +++ b/VideoToNPZ/model/local_attention.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import, division + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class SemCHGraphConv(nn.Module): + """ + Semantic channel-wise graph convolution layer + """ + + def __init__(self, in_features, out_features, adj, bias=False): + super(SemCHGraphConv, self).__init__() + self.in_features = in_features + self.out_features = out_features + + self.W = nn.Parameter(torch.zeros(size=(2, in_features, out_features), dtype=torch.float)) + nn.init.xavier_uniform_(self.W.data, gain=1.414) + + self.adj = adj.unsqueeze(0).repeat(out_features, 1, 1) + self.m = (self.adj > 0) + self.e = nn.Parameter(torch.zeros(out_features, len(self.m[0].nonzero()), dtype=torch.float)) + nn.init.constant_(self.e.data, 1) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.float)) + stdv = 1. 
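# NOTE: illustrative shape check, not code from this repository. With
# in_channels=64 and inter_channels=16 the MultiGlobalGraph above builds four
# GlobalGraph heads and preserves the (B, T, K, C) layout. The import assumes
# VideoToNPZ/ is on sys.path; only the adjacency's shape is used by GlobalGraph,
# so an identity matrix serves as a placeholder.
import torch
from model.global_attention import MultiGlobalGraph

adj = torch.eye(17)                              # placeholder adjacency
layer = MultiGlobalGraph(adj, in_channels=64, inter_channels=16)
x = torch.randn(2, 4, 17, 64)                    # (B, T, K, C)
y = layer(x)
assert y.shape == (2, 4, 17, 64)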
/ math.sqrt(self.W.size(1)) + self.bias.data.uniform_(-stdv, stdv) + else: + self.register_parameter('bias', None) + + def forward(self, input): + # input: (B, T, J, C) + h0 = torch.matmul(input, self.W[0]).unsqueeze(2).transpose(2, 4) # B * T * C * J * 1 + h1 = torch.matmul(input, self.W[1]).unsqueeze(2).transpose(2, 4) # B * T * C * J * 1 + + adj = -9e15 * torch.ones_like(self.adj).to(input.device) # C * J * J + adj[self.m] = self.e.view(-1) + adj = F.softmax(adj, dim=2) + + E = torch.eye(adj.size(1), dtype=torch.float).to(input.device) + E = E.unsqueeze(0).repeat(self.out_features, 1, 1) # C * J * J + + output = torch.matmul(adj * E, h0) + torch.matmul(adj * (1 - E), h1) + output = output.transpose(2, 4).squeeze(2) + + if self.bias is not None: + return output + self.bias.view(1, 1, -1) + else: + return output + + def __repr__(self): + return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' + + +class LocalGraph(nn.Module): + def __init__(self, adj, input_dim, output_dim, dropout=None): + super(LocalGraph, self).__init__() + + num_joints = adj.shape[0] + + # Human3.6M + if num_joints == 17: + distal_joints = [3, 6, 10, 13, 16] + joints_left = [4, 5, 6, 11, 12, 13] + joints_right = [1, 2, 3, 14, 15, 16] + + # Human3.6M detected from Stacked Hourglass + elif num_joints == 16: + distal_joints = [3, 6, 9, 12, 15] + joints_left = [4, 5, 6, 10, 11, 12] + joints_right = [1, 2, 3, 13, 14, 15] + + # HumanEva + elif num_joints == 15: + distal_joints = [4, 7, 10, 13] + joints_left = [2, 3, 4, 8, 9, 10] + joints_right = [5, 6, 7, 11, 12, 13] + + # Human3.6M including toe keypoints + elif num_joints == 19: + distal_joints = [3, 4, 7, 8, 12, 15, 18] + joints_left = [5, 6, 7, 8, 13, 14, 15] + joints_right = [1, 2, 3, 4, 16, 17, 18] + + else: + raise KeyError("The dimension of adj matrix is wrong!") + + adj_sym = torch.zeros_like(adj) + for i in range(num_joints): + for j in range(num_joints): + if i == j: + adj_sym[i][j] = 1 + if i in joints_left: + index = joints_left.index(i) + adj_sym[i][joints_right[index]] = 1.0 + if i in joints_right: + index = joints_right.index(i) + adj_sym[i][joints_left[index]] = 1.0 + + adj_1st_order = adj.matrix_power(1) + for i in np.arange(num_joints): + if i in distal_joints: + adj_1st_order[i] = 0 + + adj_2nd_order = adj.matrix_power(2) + for i in np.arange(num_joints): + if i not in distal_joints: + adj_2nd_order[i] = 0 + + adj_con = adj_1st_order + adj_2nd_order + + self.gcn_sym = SemCHGraphConv(input_dim, output_dim, adj_sym) + self.bn_1 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.gcn_con = SemCHGraphConv(input_dim, output_dim, adj_con) + self.bn_2 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.relu = nn.ReLU() + + self.cat_conv = nn.Conv2d(2*output_dim, output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(output_dim, momentum=0.1) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, input): + # x: (B, T, K, C) + x = self.gcn_sym(input) + y = self.gcn_con(input) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + y = y.permute(0, 3, 1, 2) + + x = self.relu(self.bn_1(x)) + y = self.relu(self.bn_2(y)) + + output = torch.cat((x, y), dim=1) + output = self.cat_bn(self.cat_conv(output)) + + if self.dropout is not None: + output = self.dropout(self.relu(output)) + else: + output = self.relu(output) + output = output.permute(0, 2, 3, 1) + + return output diff --git a/VideoToNPZ/model/sem_graph_conv.py 
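# NOTE: illustrative sketch of the adjacency construction above, not code from
# this repository. LocalGraph keeps first-order neighbours for ordinary joints
# and second-order neighbours for distal joints (wrists, ankles, head). A toy
# 4-joint chain 0-1-2-3 (with self-loops) and joint 3 treated as distal:
import torch

adj = torch.tensor([[1., 1., 0., 0.],
                    [1., 1., 1., 0.],
                    [0., 1., 1., 1.],
                    [0., 0., 1., 1.]])
distal = [3]
non_distal = [i for i in range(adj.shape[0]) if i not in distal]

adj_1st = adj.matrix_power(1).clone()    # .clone() so adj itself is left untouched
adj_2nd = adj.matrix_power(2)
adj_1st[distal] = 0                      # distal joints drop their one-hop row
adj_2nd[non_distal] = 0                  # all other joints drop the two-hop row
adj_con = adj_1st + adj_2nd
# Row 3 of adj_con now reaches joints 1, 2 and 3 (two hops); rows 0-2 stay one-hop.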
b/VideoToNPZ/model/sem_graph_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..f5ca3dde2d36c07f6edf4908c7fa0a768d10e9e5 --- /dev/null +++ b/VideoToNPZ/model/sem_graph_conv.py @@ -0,0 +1,154 @@ +from __future__ import absolute_import, division + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class SemGraphConv(nn.Module): + """ + Semantic graph convolution layer + """ + + def __init__(self, in_features, out_features, adj, bias=True): + super(SemGraphConv, self).__init__() + self.in_features = in_features + self.out_features = out_features + + self.W = nn.Parameter(torch.zeros(size=(2, in_features, out_features), dtype=torch.float)) + nn.init.xavier_uniform_(self.W.data, gain=1.414) + + self.adj = adj + self.m = (self.adj > 0) + self.e = nn.Parameter(torch.zeros(1, len(self.m.nonzero()), dtype=torch.float)) + nn.init.constant_(self.e.data, 1) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.float)) + stdv = 1. / math.sqrt(self.W.size(2)) + self.bias.data.uniform_(-stdv, stdv) + else: + self.register_parameter('bias', None) + + def forward(self, input): + # X: (B, T, K, C) + + h0 = torch.matmul(input, self.W[0]) + h1 = torch.matmul(input, self.W[1]) + + adj = -9e15 * torch.ones_like(self.adj).to(input.device) + adj[self.m] = self.e + adj = F.softmax(adj, dim=1) + + M = torch.eye(adj.size(0), dtype=torch.float).to(input.device) + + output = torch.matmul(adj * M, h0) + torch.matmul(adj * (1 - M), h1) + + if self.bias is not None: + return output + self.bias.view(1, 1, -1) + else: + return output + + def __repr__(self): + return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' + + +class LocalGraph(nn.Module): + def __init__(self, adj, input_dim, output_dim, dropout=None): + super(LocalGraph, self).__init__() + + num_joints = adj.shape[0] + + # Human3.6M + if num_joints == 17: + distal_joints = [3, 6, 10, 13, 16] + joints_left = [4, 5, 6, 11, 12, 13] + joints_right = [1, 2, 3, 14, 15, 16] + + # Human3.6m with toe keypoitns + elif num_joints == 19: + distal_joints = [3, 4, 7, 8, 12, 15, 18] + joints_left = [5, 6, 7, 8, 13, 14, 15] + joints_right = [1, 2, 3, 4, 16, 17, 18] + + # Human3.6M detected from Stacked Hourglass + elif num_joints == 16: + distal_joints = [3, 6, 9, 12, 15] + joints_left = [4, 5, 6, 10, 11, 12] + joints_right = [1, 2, 3, 13, 14, 15] + + # HumanEva + elif num_joints == 15: + distal_joints = [4, 7, 10, 13] + joints_left = [2, 3, 4, 8, 9, 10] + joints_right = [5, 6, 7, 11, 12, 13] + + else: + print('num_joints: %d' % num_joints) + raise KeyError("The dimension of adj matrix is wrong!") + + adj_sym = torch.zeros_like(adj) + for i in range(num_joints): + for j in range(num_joints): + if i == j: + adj_sym[i][j] = 1 + if i in joints_left: + index = joints_left.index(i) + adj_sym[i][joints_right[index]] = 1.0 + if i in joints_right: + index = joints_right.index(i) + adj_sym[i][joints_left[index]] = 1.0 + + adj_1st_order = adj.matrix_power(1) + # distal_joints = [3, 6, 10, 13, 16] + for i in np.arange(num_joints): + if i in distal_joints: + adj_1st_order[i] = 0 + + adj_2nd_order = adj.matrix_power(2) + # distal_joints = [3, 6, 10, 13, 16] + for i in np.arange(num_joints): + if i not in distal_joints: + adj_2nd_order[i] = 0 + + adj_con = adj_1st_order + adj_2nd_order + + self.gcn_sym = SemGraphConv(input_dim, output_dim, adj_sym) + self.bn_1 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.gcn_con = SemGraphConv(input_dim, 
output_dim, adj_con) + self.bn_2 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.relu = nn.ReLU() + + self.cat_conv = nn.Conv2d(2 * output_dim, output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(output_dim, momentum=0.1) + + if dropout is not None: + self.dropout = nn.Dropout2d(dropout) + else: + self.dropout = None + + def forward(self, input): + # x: (B, T, K, C) + x = self.gcn_sym(input) + y = self.gcn_con(input) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + y = y.permute(0, 3, 1, 2) + + x = self.relu(self.bn_1(x)) + y = self.relu(self.bn_2(y)) + + output = torch.cat((x, y), dim=1) + output = self.cat_bn(self.cat_conv(output)) + + if self.dropout is not None: + output = self.dropout(self.relu(output)) + else: + output = self.relu(output) + output = output.permute(0, 2, 3, 1) + + return output + diff --git a/VideoToNPZ/tools/color_edge.py b/VideoToNPZ/tools/color_edge.py new file mode 100644 index 0000000000000000000000000000000000000000..40475ab2311d2d85a595e67bb2558b2faaf0f5fb --- /dev/null +++ b/VideoToNPZ/tools/color_edge.py @@ -0,0 +1,68 @@ +# For better visualization, give different colors to different bones + +h36m_elbow_knee_v1 = [5, 15] +h36m_elbow_knee_v2 = [2, 12] +h36m_wrist_ankle_v1 = [6, 16] +h36m_wrist_ankle_v2 = [3, 13] +h36m_hip_shoulder = [1, 4, 11, 14] +h36m_spine_neck = [7, 9] +h36m_thorax_head = [8, 10] + + +def h36m_color_edge(joint_num): + if joint_num in h36m_elbow_knee_v1: + color = 'peru' # (205, 133, 63) + elif joint_num in h36m_elbow_knee_v2: + color = 'indianred' # (205, 92, 92) + elif joint_num in h36m_wrist_ankle_v1: + color = 'coral' # (255, 127, 80) + elif joint_num in h36m_wrist_ankle_v2: + # color = 'deepskyblue' + color = 'brown' # (165, 42, 42) + elif joint_num in h36m_hip_shoulder: + # color = 'dodgerblue' + color = 'tan' # (210, 180, 140) + elif joint_num in h36m_spine_neck: + color = 'olive' # (128, 128, 0) + else: + color = 'purple' # (128, 0, 128) + return color + + +ntu_elbow_knee_v1 = [6, 18] +ntu_elbow_knee_v2 = [10, 14] +ntu_wrist_ankle_v1 = [8, 19] +ntu_wrist_ankle_v2 = [12, 15] +ntu_hip_shoulder = [13, 17, 5, 9] +ntu_spine_neck = [2, 3] +ntu_thorax_head = [21, 4] +ntu_foot = [16, 20] +ntu_middle_wrist = [7, 11] +ntu_thumbs = [23, 25] +ntu_middle_finger = [22, 24] + + +def ntu_color_edge(joint_num): + if joint_num in ntu_elbow_knee_v1: + color = 'peru' + elif joint_num in ntu_elbow_knee_v2: + color = 'indianred' + elif joint_num in ntu_wrist_ankle_v1: + color = 'coral' + elif joint_num in ntu_wrist_ankle_v2: + color = 'brown' + elif joint_num in ntu_hip_shoulder: + color = 'tan' + elif joint_num in ntu_spine_neck: + color = 'olive' + elif ntu_thorax_head: + color = 'purple' + elif joint_num in ntu_foot: + color = 'deepskyblue' + elif joint_num in ntu_middle_wrist: + color = 'dodgerblue' + elif joint_num in ntu_thumbs: + color = 'red' + else: + color = 'yellow' + return color diff --git a/VideoToNPZ/tools/inference.py b/VideoToNPZ/tools/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3f6ffed4d8318f67b37483e350bae40723ca3d82 --- /dev/null +++ b/VideoToNPZ/tools/inference.py @@ -0,0 +1,110 @@ +import torch +import numpy as np +import sys +import os.path as osp + + +pre_dir = osp.join(osp.dirname(osp.realpath(__file__)), '..') +sys.path.insert(0, pre_dir) +from common.camera import normalize_screen_coordinates, camera_to_world +from common.generators import * +sys.path.pop(0) + + +joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +kps_left, kps_right = [4, 
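# NOTE: likely bug in ntu_color_edge() above (observation, not code from this
# repository). `elif ntu_thorax_head:` tests the non-empty list itself, so the
# branch is always taken and the foot / middle-wrist / thumb / default colours
# below it are unreachable. The surrounding branches suggest a membership test
# was intended:
#     elif joint_num in ntu_thorax_head:
#         color = 'purple'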
5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32) + + +def evaluate(test_generator, model_pos): + prediction = [] + + with torch.no_grad(): + for _, _, batch_2d in test_generator.next_epoch(): + + inputs_2d = torch.from_numpy(batch_2d.astype('float32')) + if torch.cuda.is_available(): + inputs_2d = inputs_2d.cuda() + + # Positional model + predicted_3d_pos = model_pos(inputs_2d) + + # Test-time augmentation (if enabled) + if test_generator.augment_enabled(): + # Undo flipping and take average with non-flipped version + predicted_3d_pos[1, :, :, 0] *= -1 + predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left] + predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True) + + prediction.append(predicted_3d_pos.squeeze(0).cpu().numpy()) + + return prediction + + +def gen_pose(kpts, valid_frames, width, height, model_pos, pad, causal_shift=0): + assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape) + assert kpts.shape[0] == len(valid_frames) + + norm_seqs = [] + for index, frames in enumerate(valid_frames): + seq_kps = kpts[index, frames] + norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height) + norm_seqs.append(norm_seq_kps) + + gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True, + kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + + prediction_to_world = [] + for i in range(len(prediction)): + sub_prediction = prediction[i] + + sub_prediction = camera_to_world(sub_prediction, R=rot, t=0) + + # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1) + # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2]) + + prediction_to_world.append(sub_prediction) + + # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32) + return prediction_to_world + + +def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0): + # kpts: (M, T, N, 2) + norm_seqs = [] + for kpt in kpts: + norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height) + norm_seqs.append(norm_kpt) + + gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True, + kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + + prediction_to_world = [] + for i in range(len(prediction)): + sub_prediction = prediction[i][0] + sub_prediction = camera_to_world(sub_prediction, R=rot, t=0) + sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2]) + prediction_to_world.append(sub_prediction) + + return prediction_to_world + + +def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0): + # input (N, 17, 2) return (N, 17, 3) + if not isinstance(kpts, np.ndarray): + kpts = np.array(kpts) + + keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height) + + input_keypoints = keypoints.copy() + # test_time_augmentation True + from common.generators import UnchunkedGenerator + gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift, + augment=True, kps_left=kps_left, kps_right=kps_right, + joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + prediction = camera_to_world(prediction[0], R=rot, t=0) + prediction[:, :, 2] -= np.min(prediction[:, :, 2]) + return 
prediction diff --git a/VideoToNPZ/tools/mpii_coco_h36m.py b/VideoToNPZ/tools/mpii_coco_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9a934ed6c9430bec828934ed376dc8a4ee48fb --- /dev/null +++ b/VideoToNPZ/tools/mpii_coco_h36m.py @@ -0,0 +1,75 @@ +''' +Project: https://github.com/fabro66/GAST-Net-3DPoseEstimation +''' +import numpy as np + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +spple_keypoints = [10, 8, 0, 7] + +scores_h36m_toe_oeder = [1, 2, 3, 5, 6, 7, 11, 13, 14, 15, 16, 17, 18] +kpts_h36m_toe_order = [0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] +scores_coco_order = [12, 14, 16, 11, 13, 15, 0, 5, 7, 9, 6, 8, 10] + +h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13] +mpii_order = [i for i in range(16)] +lr_hip_shouler = [2, 3, 12, 13] + + +def coco_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32) + htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1] + htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32) + htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3 + + htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32) + htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32) + + keypoints_h36m[:, spple_keypoints, :] = htps_keypoints + keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :] + + keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4 + keypoints_h36m[:, 7, 0] += 2*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32)) + keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3 + + # half body: the joint of ankle and knee equal to hip + # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]] + # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]] + + valid_frames = np.where(np.sum(keypoints_h36m.reshape(-1, 34), axis=1) != 0)[0] + return keypoints_h36m, valid_frames + + +def mpii_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros((temporal, 17, 2), dtype=np.float32) + keypoints_h36m[:, h36m_mpii_order] = keypoints + # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32) + keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32) + + valid_frames = np.where(np.sum(keypoints_h36m.reshape(-1, 34), axis=1) != 0)[0] + return keypoints_h36m, valid_frames + + +def coco_h36m_toe_format(keypoints): + assert len(keypoints.shape) == 3 + temporal = keypoints.shape[0] + + new_kpts = np.zeros((temporal, 19, 2), dtype=np.float32) + + # convert body+foot keypoints + coco_body_kpts = keypoints[:, :17].copy() + h36m_body_kpts, _ = coco_h36m(coco_body_kpts) + new_kpts[:, kpts_h36m_toe_order] = h36m_body_kpts + new_kpts[:, 4] = np.mean(keypoints[:, [20, 21]], axis=1, dtype=np.float32) + new_kpts[:, 8] = np.mean(keypoints[:, [17, 18]], axis=1, dtype=np.float32) + + valid_frames = np.where(np.sum(new_kpts.reshape(-1, 38), axis=-1) != 0)[0] + + return new_kpts, valid_frames diff --git 
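# NOTE: illustrative usage sketch, not code from this repository. coco_h36m()
# above takes COCO-ordered keypoints of shape (T, 17, 2), synthesises the
# head / thorax / pelvis / spine joints, and returns the indices of non-empty
# frames. The import path matches how preprocess.py imports this module
# (VideoToNPZ/ on sys.path).
import numpy as np
from tools.mpii_coco_h36m import coco_h36m

coco_kpts = (np.random.rand(10, 17, 2) * 100).astype(np.float32)   # dummy detections
h36m_kpts, valid_frames = coco_h36m(coco_kpts)
assert h36m_kpts.shape == (10, 17, 2)
assert len(valid_frames) == 10      # every dummy frame has non-zero keypoints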
a/VideoToNPZ/tools/preprocess.py b/VideoToNPZ/tools/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..3502ec725dc3b08c56c4a063016be2e190b6fc49 --- /dev/null +++ b/VideoToNPZ/tools/preprocess.py @@ -0,0 +1,172 @@ +import json +import numpy as np +from tools.mpii_coco_h36m import coco_h36m +import os + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +num_person = 2 +num_joints = 17 +img_3d = 100. +ratio_2d_3d = 500. + + +def load_json(file_path): + with open(file_path, 'r') as fr: + video_info = json.load(fr) + + label = video_info['label'] + label_index = video_info['label_index'] + + num_frames = video_info['data'][-1]['frame_index'] + keypoints = np.zeros((num_person, num_frames, num_joints, 2), dtype=np.float32) + scores = np.zeros((num_person, num_frames, num_joints), dtype=np.float32) + + for frame_info in video_info['data']: + frame_index = frame_info['frame_index'] + + for index, skeleton_info in enumerate(frame_info['skeleton']): + pose = skeleton_info['pose'] + score = skeleton_info['score'] + bbox = skeleton_info['bbox'] + + if len(bbox) == 0 or index+1 > num_person: + continue + + pose = np.asarray(pose, dtype=np.float32) + score = np.asarray(score, dtype=np.float32) + score = score.reshape(-1) + + keypoints[index, frame_index-1] = pose + scores[index, frame_index-1] = score + + return keypoints, scores, label, label_index + + +def h36m_coco_format(keypoints, scores): + assert len(keypoints.shape) == 4 and len(scores.shape) == 3 + + h36m_kpts = [] + h36m_scores = [] + valid_frames = [] + + for i in range(keypoints.shape[0]): + kpts = keypoints[i] + score = scores[i] + + new_score = np.zeros_like(score, dtype=np.float32) + + if np.sum(kpts) != 0.: + kpts, valid_frame = coco_h36m(kpts) + h36m_kpts.append(kpts) + valid_frames.append(valid_frame) + + new_score[:, h36m_coco_order] = score[:, coco_order] + new_score[:, 0] = np.mean(score[:, [11, 12]], axis=1, dtype=np.float32) + new_score[:, 8] = np.mean(score[:, [5, 6]], axis=1, dtype=np.float32) + new_score[:, 7] = np.mean(new_score[:, [0, 8]], axis=1, dtype=np.float32) + new_score[:, 10] = np.mean(score[:, [1, 2, 3, 4]], axis=1, dtype=np.float32) + + h36m_scores.append(new_score) + + h36m_kpts = np.asarray(h36m_kpts, dtype=np.float32) + h36m_scores = np.asarray(h36m_scores, dtype=np.float32) + return h36m_kpts, h36m_scores, valid_frames + + +def revise_kpts(h36m_kpts, h36m_scores, valid_frames): + + new_h36m_kpts = np.zeros_like(h36m_kpts) + for index, frames in enumerate(valid_frames): + kpts = h36m_kpts[index, frames] + score = h36m_scores[index, frames] + + # threshold_score = score > 0.3 + # if threshold_score.all(): + # continue + + index_frame = np.where(np.sum(score < 0.3, axis=1) > 0)[0] + + for frame in index_frame: + less_threshold_joints = np.where(score[frame] < 0.3)[0] + + intersect = [i for i in [2, 3, 5, 6] if i in less_threshold_joints] + + if [2, 3, 5, 6] == intersect: + kpts[frame, [2, 3, 5, 6]] = kpts[frame, [1, 1, 4, 4]] + elif [2, 3, 6] == intersect: + kpts[frame, [2, 3, 6]] = kpts[frame, [1, 1, 5]] + elif [3, 5, 6] == intersect: + kpts[frame, [3, 5, 6]] = kpts[frame, [2, 4, 4]] + elif [3, 6] == intersect: + kpts[frame, [3, 6]] = kpts[frame, [2, 5]] + elif [3] == intersect: + kpts[frame, 3] = kpts[frame, 2] + elif [6] == intersect: + kpts[frame, 6] = kpts[frame, 5] + else: + continue + + new_h36m_kpts[index, frames] = kpts + return new_h36m_kpts + + +def load_kpts_json(kpts_json): + keypoints, scores, 
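# NOTE: illustrative usage sketch, not code from this repository. The usual
# post-detection flow with the helpers above: convert the detector's COCO-format
# arrays (keypoints (M, T, 17, 2), scores (M, T, 17)) to H36M ordering, then
# patch low-confidence leg joints. Import path assumes VideoToNPZ/ on sys.path.
import numpy as np
from tools.preprocess import h36m_coco_format, revise_kpts

keypoints = np.random.rand(1, 50, 17, 2).astype(np.float32)   # dummy one-person clip
scores = np.random.rand(1, 50, 17).astype(np.float32)

h36m_kpts, h36m_scores, valid_frames = h36m_coco_format(keypoints, scores)
re_kpts = revise_kpts(h36m_kpts, h36m_scores, valid_frames)
assert re_kpts.shape == h36m_kpts.shape == (1, 50, 17, 2)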
label, label_index = load_json(kpts_json) + h36m_kpts, h36m_scores, valid_frames = h36m_coco_format(keypoints, scores) + re_kpts = revise_kpts(h36m_kpts, h36m_scores, valid_frames) + + return re_kpts, valid_frames, scores, label, label_index + + +def revise_skes(prediction, re_kpts, valid_frames): + new_prediction = np.zeros((*re_kpts.shape[:-1], 3), dtype=np.float32) + for i, frames in enumerate(valid_frames): + new_prediction[i, frames] = prediction[i] + + # The origin of (x, y) is in the upper right corner, + # while the (x,y) coordinates in the image are in the upper left corner. + distance = re_kpts[i, frames[1:], :, :2] - re_kpts[i, frames[:1], :, :2] + distance = np.mean(distance[:, [1, 4, 11, 14]], axis=-2, keepdims=True) + new_prediction[i, frames[1:], :, 0] -= distance[..., 0] / ratio_2d_3d + new_prediction[i, frames[1:], :, 1] += distance[..., 1] / ratio_2d_3d + + # The origin of (x, y) is in the upper right corner, + # while the (x,y) coordinates in the image are in the upper left corner. + # Calculate the relative distance between two people + if len(valid_frames) == 2: + intersec_frames = [frame for frame in valid_frames[0] if frame in valid_frames[1]] + absolute_distance = re_kpts[0, intersec_frames[:1], :, :2] - re_kpts[1, intersec_frames[:1], :, :2] + absolute_distance = np.mean(absolute_distance[:, [1, 4, 11, 14]], axis=-2, keepdims=True) / 2. + + new_prediction[0, valid_frames[0], :, 0] -= absolute_distance[..., 0] / ratio_2d_3d + new_prediction[0, valid_frames[0], :, 1] += absolute_distance[..., 1] / ratio_2d_3d + + new_prediction[1, valid_frames[1], :, 0] += absolute_distance[..., 0] / ratio_2d_3d + new_prediction[1, valid_frames[1], :, 1] -= absolute_distance[..., 1] / ratio_2d_3d + + # Pre-processing the case where the movement of Z axis is relatively large, such as 'sitting down' + # Remove the absolute distance + # new_prediction[:, :, 1:] -= new_prediction[:, :, :1] + # new_prediction[:, :, 0] = 0 + new_prediction[:, :, :, 2] -= np.amin(new_prediction[:, :, :, 2]) + + return new_prediction + + +def revise_skes_real_time(prediction, re_kpts, width): + ratio_2d_3d_width = ratio_2d_3d * (width / 1920) + # prediction: (M, N, 3) + new_prediction = np.zeros((len(prediction), 17, 3), dtype=np.float32) + for i in range(len(prediction)): + new_prediction[i] = prediction[i] + + initial_distance = re_kpts[i] + initial_distance = np.mean(initial_distance[[1, 4, 11, 14], :], axis=0) + new_prediction[i, :, 0] -= (initial_distance[0] - 3*width/5) / ratio_2d_3d_width + new_prediction[i, :, 1] += (initial_distance[1] - width/5) / ratio_2d_3d_width + + new_prediction[:, :, 2] -= np.amin(new_prediction[:, :, 2]) + + return new_prediction diff --git a/VideoToNPZ/tools/utils.py b/VideoToNPZ/tools/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8d6260ffe4ab33e7e1a09a2360f641fd09502ec9 --- /dev/null +++ b/VideoToNPZ/tools/utils.py @@ -0,0 +1,171 @@ +import torch +import numpy as np +import hashlib +import cv2 +import os.path as osp + + +spple_keypoints = [10, 8, 0, 7] +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +joint_pairs = [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (0, 7), (7, 8), (8, 9), (9, 10), + (8, 11), (11, 12), (12, 13), (8, 14), (14, 15), (15, 16)] +colors_kps = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], + [50, 205, 50], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 
255], + [170, 0, 255], [255, 0, 255]] + + +def wrap(func, *args, unsqueeze=False): + """ + Wrap a torch function so it can be called with NumPy arrays. + Input and return types are seamlessly converted. + """ + + args = list(args) + for i, arg in enumerate(args): + if type(arg) == np.ndarray: + args[i] = torch.from_numpy(arg) + if unsqueeze: + args[i] = args[i].unsqueeze(0) + + result = func(*args) + + if isinstance(result, tuple): + result = list(result) + for i, res in enumerate(result): + if type(res) == torch.Tensor: + if unsqueeze: + res = res.squeeze(0) + result[i] = res.numpy() + return tuple(result) + elif type(result) == torch.Tensor: + if unsqueeze: + result = result.squeeze(0) + result = result.numpy() + return result + else: + return result + + +def deterministic_random(min_value, max_value, data): + """ + Encrypted, in order to generate the same size each time + """ + + digest = hashlib.sha256(data.encode()).digest() + raw_value = int.from_bytes(digest[:4], byteorder="litter", signed=False) + return int(raw_value / (2**32 - 1) * (max_value - min_value) + min_value) + + +def resize_img(frame, max_length=640): + H, W = frame.shape[:2] + if max(W, H) > max_length: + if W > H: + W_resize = max_length + H_resize = int(H * max_length / W) + else: + H_resize = max_length + W_resize = int(W * max_length / H) + frame = cv2.resize(frame, (W_resize, H_resize), interpolation=cv2.INTER_AREA) + return frame, W_resize, H_resize + + else: + return frame, W, H + + +def draw_2Dimg(img, kpts, scores, display=None): + # kpts : (M, 17, 2) scores: (M, 17) + im = img.copy() + for kpt, score in zip(kpts, scores): + for i, item in enumerate(kpt): + score_val = score[i] + if score_val > 0.3: + x, y = int(item[0]), int(item[1]) + cv2.circle(im, (x, y), 4, (255, 255, 255), 1) + for pair, color in zip(joint_pairs, colors_kps): + j, j_parent = pair + pt1 = (int(kpt[j][0]), int(kpt[j][1])) + pt2 = (int(kpt[j_parent][0]), int(kpt[j_parent][1])) + cv2.line(im, pt1, pt2, color, 2) + + if display: + cv2.imshow('frame', im) + cv2.waitKey(1) + return im + + +def get_path(cur_file): + project_root = osp.dirname(osp.realpath(cur_file)) + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + lib_root = osp.join(project_root, 'lib/') + output_root = osp.join(project_root, 'output/') + + return project_root, chk_root, data_root, lib_root, output_root + + +def coco_h36m_frame(keypoints): + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[0, 0] = np.mean(keypoints[1:5, 0], axis=0, dtype=np.float32) + htps_keypoints[0, 1] = np.sum(keypoints[1:3, 1], axis=0, dtype=np.float32) - keypoints[0, 1] + htps_keypoints[1, :] = np.mean(keypoints[5:7, :], axis=0, dtype=np.float32) + htps_keypoints[1, :] += (keypoints[0, :] - htps_keypoints[1, :]) / 3 + + htps_keypoints[2, :] = np.mean(keypoints[11:13, :], axis=0, dtype=np.float32) + htps_keypoints[3, :] = np.mean(keypoints[[5, 6, 11, 12], :], axis=0, dtype=np.float32) + + keypoints_h36m[spple_keypoints, :] = htps_keypoints + keypoints_h36m[h36m_coco_order, :] = keypoints[coco_order, :] + + keypoints_h36m[9, :] -= (keypoints_h36m[9, :] - np.mean(keypoints[5:7, :], axis=0, dtype=np.float32)) / 4 + keypoints_h36m[7, 0] += 0.3 * (keypoints_h36m[7, 0] - np.mean(keypoints_h36m[[0, 8], 0], axis=0, dtype=np.float32)) + keypoints_h36m[8, 1] -= (np.mean(keypoints[1:3, 1], axis=0, dtype=np.float32) - keypoints[0, 1]) * 2 
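# NOTE: likely bug in deterministic_random() above (not code from this
# repository). int.from_bytes() only accepts byteorder="little" or "big", so
# byteorder="litter" raises ValueError the first time the helper is called.
# Corrected sketch of the same helper:
import hashlib

def deterministic_random(min_value, max_value, data):
    # Map a string to a stable pseudo-random integer between min_value and max_value.
    digest = hashlib.sha256(data.encode()).digest()
    raw_value = int.from_bytes(digest[:4], byteorder="little", signed=False)
    return int(raw_value / (2 ** 32 - 1) * (max_value - min_value) + min_value)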
/ 3 + + return keypoints_h36m + + +def h36m_coco_kpts(keypoints, scores): + # keypoints: (M, N, C) scores:(M, N, 1) + assert len(keypoints.shape) == 3 and len(scores.shape) == 3 + scores.squeeze(axis=2) + + h36m_kpts = [] + h36m_scores = [] + for i in range(keypoints.shape[0]): + kpts = keypoints[i] + score = scores[i] + + new_score = np.zeros_like(score, dtype=np.float32) + + if np.sum(kpts) != 0.: + new_score[h36m_coco_order] = score[coco_order] + new_score[0] = np.mean(score[[11, 12]], axis=0, dtype=np.float32) + new_score[8] = np.mean(score[[5, 6]], axis=0, dtype=np.float32) + new_score[7] = np.mean(new_score[[0, 8]], axis=0, dtype=np.float32) + new_score[10] = np.mean(score[[1, 2, 3, 4]], axis=0, dtype=np.float32) + + h36m_scores.append(new_score) + + kpts = coco_h36m_frame(kpts) + less_threshold_joints = np.where(new_score < 0.3)[0] + intersect = [i for i in [2, 3, 5, 6] if i in less_threshold_joints] + + if [2, 3, 5, 6] == intersect: + kpts[[2, 3, 5, 6]] = kpts[[1, 1, 4, 4]] + elif [2, 3, 6] == intersect: + kpts[[2, 3, 6]] = kpts[[1, 1, 5]] + elif [3, 5, 6] == intersect: + kpts[[3, 5, 6]] = kpts[[2, 4, 4]] + elif [3, 6] == intersect: + kpts[[3, 6]] = kpts[[2, 5]] + elif [3] == intersect: + kpts[3] = kpts[2] + elif [6] == intersect: + kpts[6] = kpts[5] + + h36m_kpts.append(kpts) + + return h36m_kpts, h36m_scores diff --git a/VideoToNPZ/tools/vis_h36m.py b/VideoToNPZ/tools/vis_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..c3eb8103a16fdb25a23893b4dd7ed9f1fa249b90 --- /dev/null +++ b/VideoToNPZ/tools/vis_h36m.py @@ -0,0 +1,249 @@ +import matplotlib +matplotlib.use('Agg') + +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation, writers +from mpl_toolkits.mplot3d import Axes3D +import numpy as np +import subprocess as sp +from tools.color_edge import h36m_color_edge + + +def get_resolution(filename): + command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=width,height', '-of', 'csv=p=0', filename] + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + for line in pipe.stdout: + w, h = line.decode().strip().split(',') + return int(w), int(h) + + +def get_fps(filename): + command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=r_frame_rate', '-of', 'csv=p=0', filename] + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + for line in pipe.stdout: + a, b = line.decode().strip().split('/') + return int(a) / int(b) + + +def read_video(filename, skip=0, limit=-1): + w, h = get_resolution(filename) + + command = ['ffmpeg', + '-i', filename, + '-f', 'image2pipe', + '-pix_fmt', 'rgb24', + '-vsync', '0', + '-vcodec', 'rawvideo', '-'] + + i = 0 + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + while True: + data = pipe.stdout.read(w * h * 3) + if not data: + break + i += 1 + if i > limit and limit != -1: + continue + if i > skip: + yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3)) + + +def downsample_tensor(X, factor): + length = X.shape[0] // factor * factor + return np.mean(X[:length].reshape(-1, factor, *X.shape[1:]), axis=1) + + +def render_animation(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, viewport, limit=-1, + downsample=1, size=5, input_video_path=None, com_reconstrcution=False, input_video_skip=0): + """ + TODO + Render an animation. 
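# NOTE: likely oversight in h36m_coco_kpts() above (observation, not code from
# this repository). numpy's ndarray.squeeze() returns a new array and does not
# modify its input, so the bare `scores.squeeze(axis=2)` call leaves scores with
# shape (M, N, 1). If the trailing axis is meant to be dropped, the result has
# to be assigned back:
#     scores = scores.squeeze(axis=2)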
The supported output modes are: + -- 'interactive': display an interactive figure + (also works on notebooks if associated with %matplotlib inline) + -- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). + -- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). + -- 'filename.gif': render and export the animation a gif file (requires imagemagick). + """ + plt.ioff() + + num_person = keypoints.shape[1] + if num_person == 2 and com_reconstrcution: + + fig = plt.figure(figsize=(size * (1 + len(poses)), size)) + ax_in = fig.add_subplot(1, 2, 1) + else: + fig = plt.figure(figsize=(size * (1 + len(poses)), size)) + ax_in = fig.add_subplot(1, 1 + len(poses), 1) + + ax_in.get_xaxis().set_visible(False) + ax_in.get_yaxis().set_visible(False) + ax_in.set_axis_off() + # ax_in.set_title('Input') + + ax_3d = [] + lines_3d = [] + radius = 1.7 + + if num_person == 2 and com_reconstrcution: + ax = fig.add_subplot(1, 2, 2, projection='3d') + ax.view_init(elev=15., azim=azim) + ax.set_xlim3d([-radius, radius]) + ax.set_zlim3d([0, radius]) + ax.set_ylim3d([-radius, radius]) + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = 7.5 + ax_3d.append(ax) + lines_3d.append([]) + + poses = list(poses.values()) + else: + for index, (title, data) in enumerate(poses.items()): + ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') + + ax.view_init(elev=15., azim=azim) + ax.set_xlim3d([-radius / 2, radius / 2]) + ax.set_zlim3d([0, radius]) + ax.set_ylim3d([-radius / 2, radius / 2]) + ax.set_aspect('equal') + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = 7.5 + # ax.set_title(title) # , pad=35 + ax_3d.append(ax) + lines_3d.append([]) + poses = list(poses.values()) + + # Decode video + if input_video_path is None: + # Black background + all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') + else: + # Load video using ffmpeg + all_frames = [] + for f in read_video(input_video_path, skip=input_video_skip, limit=limit): + all_frames.append(f) + effective_length = min(keypoints.shape[0], len(all_frames)) + all_frames = all_frames[:effective_length] + + keypoints = keypoints[input_video_skip:] # todo remove + for idx in range(len(poses)): + poses[idx] = poses[idx][input_video_skip:] + + if fps is None: + fps = get_fps(input_video_path) + + if downsample > 1: + keypoints = downsample_tensor(keypoints, downsample) + all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') + for idx in range(len(poses)): + poses[idx] = downsample_tensor(poses[idx], downsample) + fps /= downsample + + initialized = False + image = None + lines = [] + points = None + + if limit < 1: + limit = len(all_frames) + else: + limit = min(limit, len(all_frames)) + + parents = skeleton.parents() + index = [i for i in np.arange(17)] + + def update_video(i): + nonlocal initialized, image, lines, points + + joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] + + if num_person == 2: + joints_right_2d_two = [] + joints_right_2d_two += joints_right_2d + joints_right_2d_second = [i + 17 for i in joints_right_2d] + joints_right_2d_two += joints_right_2d_second + + colors_2d = np.full(34, 'black') + colors_2d[joints_right_2d_two] = 'red' + else: + colors_2d = np.full(17, 'black') + colors_2d[joints_right_2d] = 'red' + + if not initialized: + image = ax_in.imshow(all_frames[i], aspect='equal') + + for j, j_parent in zip(index, parents): + if 
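A quick check of the downsample_tensor() behaviour used by the downsampling branch above: frames are averaged in blocks of `factor`, and any trailing remainder is dropped.

    import numpy as np

    X = np.arange(10, dtype=np.float32).reshape(10, 1)   # 10 dummy frames
    print(downsample_tensor(X, 4).ravel())               # [1.5 5.5]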
j_parent == -1: + continue + + if len(parents) == 17 and keypoints_metadata['layout_name'] != 'coco': + for m in range(num_person): + # Draw skeleton only if keypoints match (otherwise we don't have the parents definition) + lines.append(ax_in.plot([keypoints[i, m, j, 0], keypoints[i, m, j_parent, 0]], + [keypoints[i, m, j, 1], keypoints[i, m, j_parent, 1]], + color='pink')) + + # Apply different colors for each joint + col = h36m_color_edge(j) + + if com_reconstrcution: + for pose in poses: + pos = pose[i] + lines_3d[0].append(ax_3d[0].plot([pos[j, 0], pos[j_parent, 0]], + [pos[j, 1], pos[j_parent, 1]], + [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col, linewidth=3)) + else: + for n, ax in enumerate(ax_3d): + pos = poses[n][i] + lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], + [pos[j, 1], pos[j_parent, 1]], + [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col, linewidth=3)) + + points = ax_in.scatter(*keypoints[i].reshape(17*num_person, 2).T, 10, color=colors_2d, edgecolors='white', zorder=10) + initialized = True + else: + image.set_data(all_frames[i]) + + for j, j_parent in zip(index, parents): + if j_parent == -1: + continue + + if len(parents) == 17 and keypoints_metadata['layout_name'] != 'coco': + for m in range(num_person): + lines[j + 16*m - 1][0].set_data([keypoints[i, m, j, 0], keypoints[i, m, j_parent, 0]], + [keypoints[i, m, j, 1], keypoints[i, m, j_parent, 1]]) + + if com_reconstrcution: + for k, pose in enumerate(poses): + pos = pose[i] + lines_3d[0][j + k*16 - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) + lines_3d[0][j + k*16 - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) + lines_3d[0][j + k*16 - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') + else: + for n, ax in enumerate(ax_3d): + pos = poses[n][i] + lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) + lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) + lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') + + points.set_offsets(keypoints[i].reshape(17*num_person, 2)) + + print('{}/{} '.format(i, limit), end='\r') + + fig.tight_layout() + + anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) + if output.endswith('.mp4'): + Writer = writers['ffmpeg'] + writer = Writer(fps=fps, metadata={}, bitrate=bitrate) + anim.save(output, writer=writer) + elif output.endswith('.gif'): + anim.save(output, dpi=80, writer='imagemagick') + else: + raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') + plt.close() diff --git a/VideoToNPZ/tools/vis_kpts.py b/VideoToNPZ/tools/vis_kpts.py new file mode 100644 index 0000000000000000000000000000000000000000..92dfe0018566e44c3976c42c77b350edb9e59372 --- /dev/null +++ b/VideoToNPZ/tools/vis_kpts.py @@ -0,0 +1,44 @@ +import numpy as np +import cv2 + + +joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], + [0, 7], [7, 8], [8, 9], [9, 10], [8, 11], [11, 12], + [12, 13], [8, 14], [14, 15], [15, 16]] + +colors_kps = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], + [50, 205, 50], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], + [170, 0, 255], [255, 0, 255]] + + +def write(x, img): + # c1 = tuple(x[1:3].int()) + # c2 = tuple(x[3:5].int()) + c1 = (int(x[0]), int(x[1])) + c2 = (int(x[2]), int(x[3])) + + cls = int(x[-1]) + color = [0, 97, 255] + label = 'People {}'.format(x[-1]) + cv2.rectangle(img, c1, c2, color, 1) + t_size = 
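A hypothetical driver for render_animation() as defined above; the shapes follow the indexing in update_video() (keypoints: (T, M, 17, 2), each entry of poses: (T, 17, 3)). The skeleton stub and the symmetry lists are illustrative stand-ins rather than the project's real objects, and writing demo.mp4 requires ffmpeg on the PATH.

    import numpy as np

    class SkeletonStub:
        def parents(self):
            # 17-joint H3.6M-style parent table (-1 marks the root)
            return np.array([-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15])

    T, M = 30, 1
    keypoints = np.random.rand(T, M, 17, 2).astype(np.float32) * 200
    poses = {'Reconstruction': np.random.rand(T, 17, 3).astype(np.float32)}
    metadata = {'layout_name': 'h36m',
                'keypoints_symmetry': ([4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16])}

    render_animation(keypoints, metadata, poses, SkeletonStub(), fps=30, bitrate=3000,
                     azim=70, output='demo.mp4', viewport=(1280, 720))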
cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1); + return img + + +def plot_keypoint(image, coordinates): + # USE cv2 + for i in range(coordinates.shape[0]): + pts = coordinates[i] + for color_i, jp in zip(colors_kps, joint_pairs): + pt0 = pts[jp, 0] + pt1 = pts[jp, 1] + pt0_0, pt0_1, pt1_0, pt1_1 = int(pt0[0]), int(pt0[1]), int(pt1[0]), int(pt1[1]) + + cv2.line(image, (pt0_0, pt1_0), (pt0_1, pt1_1), color_i, 5) + # cv2.circle(image,(pt0_0, pt0_1), 2, color_i, thickness=-1) + # cv2.circle(image,(pt1_0, pt1_1), 2, color_i, thickness=-1) + return image + diff --git a/convertNPZtoBVH/conver_bvh.py b/convertNPZtoBVH/conver_bvh.py new file mode 100644 index 0000000000000000000000000000000000000000..27a5a1ffb803512636af65d9a5ef48416589a6cd --- /dev/null +++ b/convertNPZtoBVH/conver_bvh.py @@ -0,0 +1,185 @@ +import os +import numpy as np +from scipy.spatial.transform import Rotation +from collections import deque +from tqdm import tqdm + +print(f"Saving 3D Motion") + + +def parse_obj(filename): + vertices = [] + lines = [] + try: + with open(filename, 'r') as f: + for line in f: + if line.startswith('v '): + parts = line.split() + vertices.append([float(parts[1]), float(parts[2]), float(parts[3])]) + elif line.startswith('l '): + parts = line.split() + lines.append([int(parts[1]) - 1, int(parts[2]) - 1]) + return np.array(vertices), lines + except Exception as e: + raise ValueError(f"Error parsing OBJ file {filename}: {str(e)}") + + +def build_hierarchy(lines, root=0): + num_joints = max(max(line) for line in lines) + 1 + adj = [[] for _ in range(num_joints)] + for a, b in lines: + adj[a].append(b) + adj[b].append(a) + parent = [-1] * num_joints + queue = deque([root]) + visited = [False] * num_joints + visited[root] = True + while queue: + p = queue.popleft() + for c in adj[p]: + if not visited[c]: + parent[c] = p + queue.append(c) + visited[c] = True + if not all(visited): + raise ValueError("The skeleton has disconnected components.") + children = [[] for _ in range(num_joints)] + for c in range(num_joints): + if parent[c] != -1: + children[parent[c]].append(c) + return parent, children + + +def compute_offsets(vertices_ref, parent): + num_joints = len(vertices_ref) + offsets = np.zeros((num_joints, 3)) + for j in range(num_joints): + if parent[j] != -1: + offsets[j] = vertices_ref[j] - vertices_ref[parent[j]] + return offsets + + +def compute_R_world(joint, vertices_ref, vertices_cur, children): + if not children[joint]: + return np.eye(3) + elif len(children[joint]) == 1: + c = children[joint][0] + V_ref = vertices_ref[c] - vertices_ref[joint] + V_cur = vertices_cur[c] - vertices_cur[joint] + norm_ref = np.linalg.norm(V_ref) + norm_cur = np.linalg.norm(V_cur) + if norm_ref < 1e-6 or norm_cur < 1e-6: + return np.eye(3) + V_ref_norm = V_ref / norm_ref + V_cur_norm = V_cur / norm_cur + cos_theta = np.clip(np.dot(V_ref_norm, V_cur_norm), -1.0, 1.0) + if cos_theta > 0.99999: + return np.eye(3) + axis = np.cross(V_ref_norm, V_cur_norm) + axis_norm = np.linalg.norm(axis) + if axis_norm < 1e-6: + return np.eye(3) + axis = axis / axis_norm + angle = np.arccos(cos_theta) + R = Rotation.from_rotvec(axis * angle).as_matrix() + return R + else: + A = np.column_stack([vertices_ref[c] - vertices_ref[joint] for c in children[joint]]) + B = np.column_stack([vertices_cur[c] - vertices_cur[joint] for c in 
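A tiny sanity check for build_hierarchy() above, assuming the function is available from this script: a chain 0-1-2-3 with a branch at joint 1, given as the same 0-based edge list that parse_obj() returns.

    lines = [[0, 1], [1, 2], [2, 3], [1, 4]]
    parent, children = build_hierarchy(lines, root=0)
    print(parent)     # [-1, 0, 1, 2, 1]
    print(children)   # [[1], [2, 4], [3], [], []]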
children[joint]]) + M = B @ A.T + U, _, Vh = np.linalg.svd(M) + R = U @ Vh + if np.linalg.det(R) < 0: + Vh[-1, :] *= -1 + R = U @ Vh + return R + + +def main(): + output_dir = os.path.abspath('../outputs/') + os.makedirs(output_dir, exist_ok=True) + folder = os.path.join(output_dir, 'obj_sequence') + + try: + obj_files = sorted([f for f in os.listdir(folder) if f.endswith('.obj')]) + except Exception as e: + print(f"Error accessing folder {folder}: {e}") + return + + if not obj_files: + print("No OBJ files found.") + return + + try: + vertices_ref, lines = parse_obj(os.path.join(folder, obj_files[0])) + num_joints = len(vertices_ref) + parent, children = build_hierarchy(lines) + offsets = compute_offsets(vertices_ref, parent) + root = 0 + + hierarchy_order = [] + + def dfs(joint): + hierarchy_order.append(joint) + for child in children[joint]: + dfs(child) + + dfs(root) + + motion_data = [] + for obj_file in tqdm(obj_files): + vertices_cur = parse_obj(os.path.join(folder, obj_file))[0] + R_world = [compute_R_world(j, vertices_ref, vertices_cur, children) for j in range(num_joints)] + R_local = [R_world[j] if parent[j] == -1 else R_world[parent[j]].T @ R_world[j] for j in range(num_joints)] + euler_angles = [Rotation.from_matrix(R).as_euler('ZYX', degrees=True) for R in R_local] + root_pos = vertices_cur[root] + motion_line = list(root_pos) + list(euler_angles[root]) + for j in hierarchy_order[1:]: + motion_line.extend(euler_angles[j]) + motion_data.append(motion_line) + + # Note: Smoothing function has been removed + # Note: Elbow constraints have been removed + + bvh_dir = os.path.join(output_dir, 'bvh') + os.makedirs(bvh_dir, exist_ok=True) + bvh_file = os.path.join(bvh_dir, 'output.bvh') + + with open(bvh_file, 'w') as f: + f.write("HIERARCHY\n") + + def write_hierarchy(joint, parent, f, indent=0): + if parent == -1: + f.write("ROOT Joint{}\n".format(joint)) + else: + f.write(" " * indent + "JOINT Joint{}\n".format(joint)) + f.write(" " * indent + "{\n") + f.write(" " * (indent + 1) + "OFFSET {:.6f} {:.6f} {:.6f}\n".format(*offsets[joint])) + if parent == -1: + f.write(" " * ( + indent + 1) + "CHANNELS 6 Xposition Yposition Zposition Zrotation Yrotation Xrotation\n") + else: + f.write(" " * (indent + 1) + "CHANNELS 3 Zrotation Yrotation Xrotation\n") + for child in children[joint]: + write_hierarchy(child, joint, f, indent + 1) + if not children[joint]: + f.write(" " * (indent + 1) + "End Site\n") + f.write(" " * (indent + 1) + "{\n") + f.write(" " * (indent + 2) + "OFFSET 0.000000 0.000000 0.000000\n") + f.write(" " * (indent + 1) + "}\n") + f.write(" " * indent + "}\n") + + write_hierarchy(root, -1, f) + + f.write("MOTION\n") + f.write("Frames: {}\n".format(len(motion_data))) + f.write("Frame Time: 0.033333\n") + for motion_line in motion_data: + f.write(" ".join("{:.6f}".format(x) for x in motion_line) + "\n") + + except Exception as e: + print(f"Error during processing: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/convertNPZtoBVH/conver_obj.py b/convertNPZtoBVH/conver_obj.py new file mode 100644 index 0000000000000000000000000000000000000000..857f091d2e50cfd7c1f3dec2b536aa4fe00b066a --- /dev/null +++ b/convertNPZtoBVH/conver_obj.py @@ -0,0 +1,141 @@ +import numpy as np +import os +from datetime import datetime + +def define_human_connections(): + """ + Define connections for human stick figure with support for various poses + including crossed legs and complex movements + """ + return [ + # Core body structure + [0, 7], # Base spine to 
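A quick numerical check of compute_R_world() above (the SVD/Kabsch branch for a joint with two children): rotating a reference pose by a known rotation should be recovered exactly. The toy joint positions are placeholders.

    import numpy as np
    from scipy.spatial.transform import Rotation

    R_true = Rotation.from_euler('ZYX', [30, 10, -20], degrees=True).as_matrix()
    vertices_ref = np.array([[0., 0., 0.], [1., 0., 0.], [0., 2., 0.]])
    vertices_cur = vertices_ref @ R_true.T          # rotate every joint by R_true
    children = [[1, 2], [], []]                     # joint 0 has two children
    R_est = compute_R_world(0, vertices_ref, vertices_cur, children)
    print(np.allclose(R_est, R_true, atol=1e-6))    # True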
upper spine + [7, 8], # Upper spine to neck + [8, 9], # Neck to head base + [9, 10], # Head extension + + # Arms (with complete chains) + # Left arm + [7, 14], # Spine to left shoulder + [14, 15], # Left upper arm + [15, 16], # Left forearm/hand + + # Right arm + [7, 11], # Spine to right shoulder + [11, 12], # Right upper arm + [12, 13], # Right forearm/hand + + # Legs with crossed support + # Left leg (now crossing to right) + [0, 1], # Hip to left thigh + [1, 2], # Left thigh to knee + [2, 3], # Left knee to foot + + # Right leg + [0, 4], # Hip to right thigh + [4, 5], # Right thigh to knee + [5, 6], # Right knee to foot + + # Structural connections + [14, 11], # Shoulder cross support + [1, 4], # Hip cross support + ] + +def npz_to_obj_sequence(npz_path, output_dir): + """ + Convert NPZ motion capture data to OBJ sequence + with enhanced support for various poses and movements + """ + os.makedirs(output_dir, exist_ok=True) + data = np.load(npz_path) + reconstruction = data['reconstruction'][0] + + num_frames = reconstruction.shape[0] + connections = define_human_connections() + + # Increased scale for better visibility + scale = 150.0 # Adjusted scale factor + + for frame_idx in range(num_frames): + vertices = reconstruction[frame_idx] + output_path = os.path.join(output_dir, f"frame_{frame_idx:04d}.obj") + + with open(output_path, 'w') as f: + # Write vertices with enhanced precision + for v in vertices: + # Coordinate system transformation with improved scaling + x, y, z = v[0] * scale, v[2] * scale, v[1] * scale + f.write(f"v {x:.8f} {y:.8f} {z:.8f}\n") + + # Write connections + for conn in connections: + f.write(f"l {conn[0] + 1} {conn[1] + 1}\n") + + +def analyze_vertex_data(npz_path): + """ + Enhanced analysis function to help understand the motion data + and verify correct vertex positions + """ + data = np.load(npz_path) + reconstruction = data['reconstruction'][0] + + + # Calculate full range of motion + x_min, x_max = reconstruction[:,:,0].min(), reconstruction[:,:,0].max() + y_min, y_max = reconstruction[:,:,1].min(), reconstruction[:,:,1].max() + z_min, z_max = reconstruction[:,:,2].min(), reconstruction[:,:,2].max() + + +def process_motion_capture(npz_file): + try: + # Verify input file exists + if not os.path.exists(npz_file): + raise FileNotFoundError(f"Input file {npz_file} not found") + + # Define base output directory + base_output_dir = os.path.abspath('../outputs/') + # print(output_dir) + os.makedirs(base_output_dir, exist_ok=True) + # base_output_dir = r"C:\Users\ROGST\Programming\Python\videotobvh\convertNPZtoBVH\outputs" + + # Create a unique output directory with timestamp to avoid overwriting + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = os.path.join(base_output_dir, f"obj_sequence") + + # Analyze data first + analyze_vertex_data(npz_file) + + # Convert to OBJ sequence + npz_to_obj_sequence(npz_path=npz_file, output_dir=output_dir) + + + except Exception as e: + print(f"Error processing motion capture data: {str(e)}") + raise + +def get_npz_paths(folder_path): + if not os.path.isdir(folder_path): + raise FileNotFoundError(f"Directory not found: {folder_path}") + + # Find the first .npz file in the directory + for file in os.listdir(folder_path): + if file.endswith('.npz'): + npz_path = os.path.join(folder_path, file) + return npz_path + + # If no .npz file is found + raise FileNotFoundError(f"No NPZ files found in directory: {folder_path}") + +if __name__ == "__main__": + # Define the directory where the NPZ file is located + output_dir 
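A minimal round-trip sketch for npz_to_obj_sequence() above, using a fake reconstruction array (batch of 1, 2 frames, 17 joints, xyz); the temporary paths are placeholders.

    import os, tempfile
    import numpy as np

    tmp = tempfile.mkdtemp()
    npz_path = os.path.join(tmp, 'demo.npz')
    np.savez(npz_path, reconstruction=np.random.rand(1, 2, 17, 3).astype(np.float32))
    npz_to_obj_sequence(npz_path, os.path.join(tmp, 'obj_sequence'))
    # Each frame_XXXX.obj now holds 17 'v x y z' vertices plus 'l i j' edges
    # (1-based indices), which conver_bvh.py later parses back into a skeleton.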
= os.path.abspath('../outputs/npz/') + os.makedirs(output_dir, exist_ok=True) + input_dir = output_dir + + try: + # Get the first available NPZ file from the directory + npz_file = get_npz_paths(input_dir) + process_motion_capture(npz_file) + except FileNotFoundError as e: + print(f"Error: {str(e)}") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..35b22b5837a6aca66811152e6a89a5c9873bcef1 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,84 @@ +import subprocess +import sys +import os +import argparse +import time +from datetime import datetime +import signal + +def signal_handler(sig, frame): + print("\nInterrupted by user, shutting down...") + if 'pool' in locals() and pool is not None: + pool.terminate() + pool.join() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Run the complete video-to-BVH pipeline") + # parser.add_argument('-v', '--video', required=True, help="Path to the input video file") + return parser.parse_args() + +def run_command(command, description): + """Run a command and show its output.""" + try: + start_time = time.time() + script_dir = os.path.dirname(command[1]) + current_dir = os.getcwd() + os.chdir(script_dir) + + subprocess.run(command, check=True) + + os.chdir(current_dir) + + end_time = time.time() + execution_time = end_time - start_time + return True + + except subprocess.CalledProcessError as e: + os.chdir(current_dir) + return False + + except Exception as e: + if 'current_dir' in locals(): + os.chdir(current_dir) + return False + +def main(): + args = parse_arguments() + + base_dir = os.path.dirname(os.path.abspath(__file__)) + gen_skes_path = os.path.join(base_dir, "VideoToNPZ", "gen_skes.py") + convert_obj_path = os.path.join(base_dir, "convertNPZtoBVH", "conver_obj.py") + convert_bvh_path = os.path.join(base_dir, "convertNPZtoBVH", "conver_bvh.py") + + for script_path in [gen_skes_path, convert_obj_path, convert_bvh_path]: + if not os.path.exists(script_path): + return 1 + + pipeline_steps = [ + { + "command": [sys.executable, gen_skes_path], + }, + { + "command": [sys.executable, convert_obj_path], + }, + { + "command": [sys.executable, convert_bvh_path], + } + ] + + successful = 0 + failed = 0 + + for step in pipeline_steps: + if run_command(step["command"], ""): + successful += 1 + else: + failed += 1 + + return 0 if failed == 0 else 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0e0d8af6019e5b8408f7fc8918e17e31871a149 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,112 @@ +altair==5.4.1 +asttokens==3.0.0 +attrs==25.1.0 +backcall==0.2.0 +beautifulsoup4==4.13.3 +bleach==6.1.0 +blinker==1.8.2 +cachetools==5.5.2 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +contourpy==1.1.1 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 +docopt==0.6.2 +executing==2.2.0 +fastjsonschema==2.21.1 +filelock==3.16.1 +filterpy==1.4.5 +fonttools==4.56.0 +fsspec==2025.2.0 +gitdb==4.0.12 +GitPython==3.1.44 +h5py==3.11.0 +idna==3.10 +imageio==2.35.1 +importlib_metadata==8.5.0 +importlib_resources==6.4.5 +ipython==8.12.3 +jedi==0.19.2 +Jinja2==3.1.5 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyterlab_pygments==0.3.0 +kiwisolver==1.4.7 +lazy_loader==0.4 
+llvmlite==0.41.1 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.7.5 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.2 +mpmath==1.3.0 +narwhals==1.29.0 +nbclient==0.10.1 +nbconvert==7.16.6 +nbformat==5.10.4 +networkx==3.1 +numba==0.58.1 +numpy==1.24.4 +opencv-python==4.11.0.86 +packaging==24.2 +pandas==2.0.3 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==10.4.0 +pip-check==2.9 +pipreqs==0.5.0 +pkgutil_resolve_name==1.3.10 +platformdirs==4.3.6 +prompt_toolkit==3.0.50 +protobuf==5.29.3 +psutil==7.0.0 +pure_eval==0.2.3 +pyarrow==17.0.0 +pydeck==0.9.1 +Pygments==2.19.1 +pyparsing==3.1.4 +python-dateutil==2.9.0.post0 +pytz==2025.1 +PyWavelets==1.4.1 +pywin32==308 +PyYAML==6.0.2 +pyzmq==26.2.1 +referencing==0.35.1 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.20.1 +scikit-image==0.21.0 +scipy==1.10.1 +six==1.17.0 +smmap==5.0.2 +soupsieve==2.6 +stack-data==0.6.3 +streamlit==1.40.1 +sympy==1.13.3 +tenacity==9.0.0 +terminaltables==3.1.10 +tifffile==2023.7.10 +tinycss2==1.2.1 +toml==0.10.2 +torch==2.4.1 +torchsummary==1.5.1 +torchvision==0.19.1 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing_extensions==4.12.2 +tzdata==2025.1 +urllib3==2.2.3 +watchdog==4.0.2 +wcwidth==0.2.13 +webencodings==0.5.1 +yacs==0.1.8 +yarg==0.1.9 +zipp==3.20.2
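For reference, pipeline.py above simply runs the three stages in order (gen_skes.py, then conver_obj.py, then conver_bvh.py), switching into each script's directory so that their relative '../outputs/' paths resolve; the final skeleton is written to outputs/bvh/output.bvh. From the repository root this is just:

    python pipeline.py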