diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..f5894da75edb37f7e550a2924d850e51fbe116d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.weights filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..044da9b5783620e46b9ce1c2a0abf053ebeae759 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +venv/ +__pycache__/ +*.pyc +*.bvh +*.obj +*.npz +*.mp4 \ No newline at end of file diff --git a/VideoToNPZ/INFERENCE_EN.md b/VideoToNPZ/INFERENCE_EN.md new file mode 100644 index 0000000000000000000000000000000000000000..de704682748ff64757379026b805f48c375c64a1 --- /dev/null +++ b/VideoToNPZ/INFERENCE_EN.md @@ -0,0 +1,2 @@ + + python gen_skes.py -v baseball.mp4 diff --git a/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin b/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ab636bcc966684a8a3be7ded4ff0d9311d76575 --- /dev/null +++ b/VideoToNPZ/checkpoint/gastnet/81_frame_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3150eb3125ca66242a888fd06b4eb7d8a8b755607370225c24f0b9c794d35cc4 +size 28333160 diff --git a/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth b/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cea6c32352118068ddda01cb72a33c8450b3e31 --- /dev/null +++ b/VideoToNPZ/checkpoint/hrnet/pose_coco/pose_hrnet_w48_384x288.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e0fec3194826d5e3f806ea89be68bbb84517b114c3a32b3058c56610b5ef61 +size 255061287 diff --git a/VideoToNPZ/checkpoint/yolov3/yolov3.weights b/VideoToNPZ/checkpoint/yolov3/yolov3.weights new file mode 100644 index 0000000000000000000000000000000000000000..550ca2f10867af32a8434dd7cddb5d305a77c97f --- /dev/null +++ b/VideoToNPZ/checkpoint/yolov3/yolov3.weights @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297 +size 248007048 diff --git a/VideoToNPZ/common/arguments.py b/VideoToNPZ/common/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..f4bf40e987e5a5cf41b50bb228472d0841e551b7 --- /dev/null +++ b/VideoToNPZ/common/arguments.py @@ -0,0 +1,86 @@ +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser(description='Training script') + + # General arguments + parser.add_argument('-d', '--dataset', default='h36m', type=str, metavar='NAME', + help='target dataset') # h36m or humaneva + parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str, metavar='NAME', + help='2D detections to use') + parser.add_argument('-str', '--subjects-train', default='S1,S5,S6,S7,S8', type=str, metavar='LIST', + help='training subjects separated by comma') + parser.add_argument('-ste', '--subjects-test', default='S9,S11', type=str, metavar='LIST', + help='test subjects separated by comma') + parser.add_argument('-a', '--actions', default='*', type=str, metavar='LIST', + help='actions to train/test on, separated by comma, or * for all') + parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH',
help='checkpoint directory') + parser.add_argument('--checkpoint-frequency', default=10, type=int, metavar='N', + help='create a checkpoint every N epochs') + parser.add_argument('-r', '--resume', default='', type=str, metavar='FILENAME', + help='checkpoint to resume (file name)') + parser.add_argument('--evaluate', default='', type=str, metavar='FILENAME', + help='checkpoint to evaluate (file name)') + parser.add_argument('--render', action='store_true', help='visualize a particular video') + parser.add_argument('--by-subject', action='store_true', help='break down error by subject (on evaluation)') + parser.add_argument('--export-training-curves', action='store_true', help='save training curves as .png images') + + # Model arguments + parser.add_argument('-s', '--stride', default=1, type=int, metavar='N', help='chunk size to use during training') + parser.add_argument('-arc', '--architecture', default='3,3,3', type=str, metavar='LAYERS', + help='filter widths separated by comma') + parser.add_argument('--causal', action='store_true', help='use causal convolutions for real-time processing') + parser.add_argument('-ch', '--channels', default=128, type=int, metavar='N', + help='number of channels in convolution layers') + + # Experimental setting + parser.add_argument('-e', '--epochs', default=60, type=int, metavar='N', help='number of training epochs') + parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N', + help='batch size in terms of predicted frames') + parser.add_argument('-drop', '--dropout', default=0.05, type=float, metavar='P', help='dropout probability') + parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate') + parser.add_argument('-lrd', '--lr-decay', default=0.95, type=float, metavar='LR', + help='learning rate decay per epoch') + parser.add_argument('-no-da', '--no-data-augmentation', dest='data_augmentation', action='store_false', + help='disable train-time flipping') + parser.add_argument('-no-tta', '--no-test-time-augmentation', dest='test_time_augmentation', action='store_false', + help='disable test-time flipping') + parser.add_argument('--subset', default=1, type=float, metavar='FRACTION', help='reduce dataset size by fraction') + parser.add_argument('--downsample', default=5, type=int, metavar='FACTOR', + help='downsample frame rate by factor (semi-supervised)') + parser.add_argument('--no-eval', action='store_true', + help='disable epoch evaluation while training (small speed-up)') + parser.add_argument('--disable-optimizations', action='store_true', + help='disable optimized model for single-frame predictions') + + # Visualization + parser.add_argument('--viz-subject', type=str, metavar='STR', help='subject to render') + parser.add_argument('--viz-action', type=str, metavar='STR', help='action to render') + parser.add_argument('--viz-camera', type=int, default=0, metavar='N', help='camera to render') + parser.add_argument('--viz-video', type=str, metavar='PATH', help='path to input video') + parser.add_argument('--viz-skip', type=int, default=0, metavar='N', help='skip first N frames of input video') + parser.add_argument('--viz-output', type=str, metavar='PATH', help='output file name (.gif or .mp4)') + parser.add_argument('--viz-export', type=str, metavar='PATH', help='output file name for coordinates') + parser.add_argument('--viz-bitrate', type=int, default=3000, metavar='N', help='bitrate for mp4 videos') + parser.add_argument('--viz-no-ground-truth', action='store_true', 
help='do not show ground-truth poses') + parser.add_argument('--viz-limit', type=int, default=-1, metavar='N', help='only render first N frames') + parser.add_argument('--viz-downsample', type=int, default=1, metavar='N', help='downsample FPS by a factor N') + parser.add_argument('--viz-size', type=int, default=5, metavar='N', help='image size') + + parser.set_defaults(bone_length_term=True) + parser.set_defaults(data_augmentation=True) + parser.set_defaults(test_time_augmentation=True) + + args = parser.parse_args() + # Check invalid configuration + if args.resume and args.evaluate: + print('Invalid flags: --resume and --evaluate cannot be set at the same time') + exit() + + if args.export_training_curves and args.no_eval: + print('Invalid flags: --export-training-curves and --no-eval cannot be set at the same time') + exit() + + return args diff --git a/VideoToNPZ/common/camera.py b/VideoToNPZ/common/camera.py new file mode 100644 index 0000000000000000000000000000000000000000..5d691b5166c63e0ea348eae1d939687257ba500d --- /dev/null +++ b/VideoToNPZ/common/camera.py @@ -0,0 +1,63 @@ +import numpy as np +import torch + +from tools.utils import wrap +from common.quaternion import qort, qinverse + + +def normalize_screen_coordinates(X, w, h): + assert X.shape[-1] == 2 + + # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio + return X/w*2 - [1, h/w] + + +def image_coordinates(X, w, h): + assert X.shape[-1] == 2 + + # Reverse camera frame normalization + return (X + [1, h/w]) * w / 2 + + +def world_to_camera(X, R, t): + Rt = wrap(qinverse, R) # Invert rotation + return wrap(qort, np.tile(Rt, (*X.shape[:-1], 1)), X - t) # Rotate and translate + + +def camera_to_world(X, R, t): + return wrap(qort, np.tile(R, (*X.shape[:-1], 1)), X) + t + + +def project_to_2d(X, camera_params): + """ + Project 3D points to 2D using the Human3.6M camera projection function. + This is a differentiable and batched reimplementation of the original MATLAB script. + + Arguments: + X -- 3D points in *camera space* to transform (N, *, 3) + camera_params -- intrinsic parameteres (N, 2+2+3+2=9) + """ + assert X.shape[-1] == 3 + assert len(camera_params.shape) == 2 + assert camera_params.shape[-1] == 9 + assert X.shape[0] == camera_params.shape[0] + + while len(camera_params.shape) < len(X.shape): + camera_params = camera_params.unsqueeze(1) + + f = camera_params[..., :2] + c = camera_params[..., 2:4] + k = camera_params[..., 4:7] + p = camera_params[..., 7:] + + # XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1) + XX = X[..., :2] / X[..., 2:] + r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True) + + radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True) + tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True) + + XXX = XX*(radial + tan) + p*r2 + + return f*XXX + c + diff --git a/VideoToNPZ/common/generators.py b/VideoToNPZ/common/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..1a437693ed396455313a0cf8f848b225322b3165 --- /dev/null +++ b/VideoToNPZ/common/generators.py @@ -0,0 +1,236 @@ +from itertools import zip_longest +import numpy as np + + +class ChunkedGenerator: + """ + Batched data generator, used for training. + The sequences are split into equal-length chunks and padded as necessary. 
+ + Arguments: + batch_size -- the batch size to use for training + cameras -- list of cameras, one element for each video (optional, used for semi-supervised training) + poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training) + poses_2d -- list of input 2D keypoints, one element for each video + chunk_length -- number of output frames to predict for each training example (usually 1) + pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field) + causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad") + shuffle -- randomly shuffle the dataset before each epoch + random_seed -- initial seed to use for the random generator + augment -- augment the dataset by flipping poses horizontally + kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled + joints_left and joints_right -- list of left/right 3D joints if flipping is enabled + """ + def __init__(self, batch_size, cameras, poses_3d, poses_2d, + chunk_length, pad=0, causal_shift=0, + shuffle=True, random_seed=1234, + augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None, + endless=False): + assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d)) + assert cameras is None or len(cameras) == len(poses_2d) + + # Build lineage info + pairs = [] # (seq_idx, start_frame, end_frame, flip) tuples + for i in range(len(poses_2d)): + assert poses_3d is None or poses_3d[i].shape[0] == poses_2d[i].shape[0] + n_chunks = (poses_2d[i].shape[0] + chunk_length - 1) // chunk_length + offset = (n_chunks * chunk_length - poses_2d[i].shape[0]) // 2 + bounds = np.arange(n_chunks + 1) * chunk_length - offset + augment_vector = np.full(len(bounds)-1, False, dtype=bool) + pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], augment_vector) + if augment: + pairs += zip(np.repeat(i, len(bounds)-1), bounds[:-1], bounds[1:], ~augment_vector) + + # Initialize buffers + if cameras is not None: + self.batch_cam = np.empty((batch_size, cameras[0].shape[-1])) + if poses_3d is not None: + self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[0].shape[-2], poses_3d[0].shape[-1])) + self.batch_2d = np.empty((batch_size, chunk_length + 2*pad, poses_2d[0].shape[-2], poses_2d[0].shape[-1])) + + self.num_batches = (len(pairs) + batch_size - 1) // batch_size + self.batch_size = batch_size + self.random = np.random.RandomState(random_seed) + self.pairs = pairs + self.shuffle = shuffle + self.pad = pad + self.causal_shift = causal_shift + self.endless = endless + self.state = None + + self.cameras = cameras + self.poses_3d = poses_3d + self.poses_2d = poses_2d + + self.augment = augment + self.kps_left = kps_left + self.kps_right = kps_right + self.joints_left = joints_left + self.joints_right = joints_right + + def num_frames(self): + return self.num_batches * self.batch_size + + def random_state(self): + return self.random + + def set_random_state(self, random): + self.random = random + + def augment_enabled(self): + return self.augment + + def next_pairs(self): + if self.state is None: + if self.shuffle: + pairs = self.random.permutation(self.pairs) + else: + pairs = self.pairs + return 0, pairs + else: + return self.state + + def next_epoch(self): + enabled = True + while enabled: + start_idx, pairs = self.next_pairs() + for b_i in range(start_idx, self.num_batches): + chunks = pairs[b_i*self.batch_size : (b_i+1)*self.batch_size] + for i, 
(seq_i, start_3d, end_3d, flip) in enumerate(chunks): + start_2d = start_3d - self.pad - self.causal_shift + end_2d = end_3d + self.pad - self.causal_shift + + # 2D poses + seq_2d = self.poses_2d[seq_i] + low_2d = max(start_2d, 0) + high_2d = min(end_2d, seq_2d.shape[0]) + pad_left_2d = low_2d - start_2d + pad_right_2d = end_2d - high_2d + if pad_left_2d != 0 or pad_right_2d != 0: + self.batch_2d[i] = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), "edge") + else: + self.batch_2d[i] = seq_2d[low_2d:high_2d] + + if flip: + # Flip 2D keypoints + self.batch_2d[i, :, :, 0] *= -1 + self.batch_2d[i, :, self.kps_left + self.kps_right] = self.batch_2d[i, :, self.kps_right + self.kps_left] + + # 3D poses + if self.poses_3d is not None: + seq_3d = self.poses_3d[seq_i] + low_3d = max(start_3d, 0) + high_3d = min(end_3d, seq_3d.shape[0]) + pad_left_3d = low_3d - start_3d + pad_right_3d = end_3d - high_3d + if pad_left_3d != 0 or pad_right_3d != 0: + self.batch_3d[i] = np.pad(seq_3d[low_3d:high_3d], ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), "edge") + else: + self.batch_3d[i] = seq_3d[low_3d:high_3d] + + if flip: + # Flip 3D joints + self.batch_3d[i, :, :, 0] *= -1 + self.batch_3d[i, :, self.joints_left + self.joints_right] = \ + self.batch_3d[i, :, self.joints_right + self.joints_left] + + # Cameras + if self.cameras is not None: + self.batch_cam[i] = self.cameras[seq_i] + if flip: + # Flip horizontal distortion coefficients + self.batch_cam[i, 2] *= -1 + self.batch_cam[i, 7] *= -1 + + if self.endless: + self.state = (b_i + 1, pairs) + if self.poses_3d is None and self.cameras is None: + yield None, None, self.batch_2d[:len(chunks)] + elif self.poses_3d is not None and self.cameras is None: + yield None, self.batch_3d[:len(chunks)], self.batch_2d[:(len(chunks))] + elif self.poses_3d is None: + yield self.batch_cam, None, self.batch_2d[:len(chunks)] + else: + yield self.batch_cam[:len(chunks)], self.batch_3d[:len(chunks)], self.batch_2d[:len(chunks)] + + if self.endless: + self.state = None + else: + enabled = False + + +class UnchunkedGenerator: + """ + Non-batched data generator, used for testing. + Sequences are returned one at a time (i.e. batch size = 1), without chunking. + + If data augmentation is enabled, the batches contain two sequences (i.e. batch size = 2), + the second of which is a mirrored version of the first. 
+ + Arguments: + cameras -- list of cameras, one element for each video (optional, used for semi-supervised training) + poses_3d -- list of ground-truth 3D poses, one element for each video (optional, used for supervised training) + poses_2d -- list of input 2D keypoints, one element for each video + pad -- 2D input padding to compensate for valid convolutions, per side (depends on the receptive field) + causal_shift -- asymmetric padding offset when causal convolutions are used (usually 0 or "pad") + augment -- augment the dataset by flipping poses horizontally + kps_left and kps_right -- list of left/right 2D keypoints if flipping is enabled + joints_left and joints_right -- list of left/right 3D joints if flipping is enabled + """ + + def __init__(self, cameras, poses_3d, poses_2d, pad=0, causal_shift=0, + augment=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None): + assert poses_3d is None or len(poses_3d) == len(poses_2d) + assert cameras is None or len(cameras) == len(poses_2d) + + self.augment = augment + self.kps_left = kps_left + self.kps_right = kps_right + self.joints_left = joints_left + self.joints_right = joints_right + + self.pad = pad + self.causal_shift = causal_shift + self.cameras = [] if cameras is None else cameras + self.poses_3d = [] if poses_3d is None else poses_3d + self.poses_2d = poses_2d + + def num_frames(self): + count = 0 + for p in self.poses_2d: + count += p.shape[0] + return count + + def augment_enabled(self): + return self.augment + + def set_augment(self, augment): + self.augment = augment + + def next_epoch(self): + for seq_cam, seq_3d, seq_2d in zip_longest(self.cameras, self.poses_3d, self.poses_2d): + batch_cam = None if seq_cam is None else np.expand_dims(seq_cam, axis=0) + batch_3d = None if seq_3d is None else np.expand_dims(seq_3d, axis=0) + batch_2d = np.expand_dims(np.pad(seq_2d, + ((self.pad + self.causal_shift, self.pad - self.causal_shift), (0, 0), + (0, 0)), + 'edge'), axis=0) + if self.augment: + # Append flipped version + if batch_cam is not None: + batch_cam = np.concatenate((batch_cam, batch_cam), axis=0) + batch_cam[1, 2] *= -1 + batch_cam[1, 7] *= -1 + + if batch_3d is not None: + batch_3d = np.concatenate((batch_3d, batch_3d), axis=0) + batch_3d[1, :, :, 0] *= -1 + batch_3d[1, :, self.joints_left + self.joints_right] = batch_3d[1, :, + self.joints_right + self.joints_left] + + batch_2d = np.concatenate((batch_2d, batch_2d), axis=0) + batch_2d[1, :, :, 0] *= -1 + batch_2d[1, :, self.kps_left + self.kps_right] = batch_2d[1, :, self.kps_right + self.kps_left] + + yield batch_cam, batch_3d, batch_2d + diff --git a/VideoToNPZ/common/graph_utils.py b/VideoToNPZ/common/graph_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..77ed5da19f51542633cee8ecd292eddff235f982 --- /dev/null +++ b/VideoToNPZ/common/graph_utils.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import + +import torch +import numpy as np +import scipy.sparse as sp + + +def normalize(mx): + """Row-normalize sparse matrix""" + rowsum = np.array(mx.sum(1)) + r_inv = np.power(rowsum, -1).flatten() + r_inv[np.isinf(r_inv)] = 0. 
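+    # rowsum == 0 gives inf in r_inv above; resetting those entries to 0 leaves isolated nodes' rows all-zero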
+ r_mat_inv = sp.diags(r_inv) + mx = r_mat_inv.dot(mx) + return mx + + +def sparse_mx_to_torch_sparse_tensor(sparse_mx): + """Convert a scipy sparse matrix to a torch sparse tensor.""" + sparse_mx = sparse_mx.tocoo().astype(np.float32) + indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) + values = torch.from_numpy(sparse_mx.data) + shape = torch.Size(sparse_mx.shape) + return torch.sparse.FloatTensor(indices, values, shape) + + +def adj_mx_from_edges(num_pts, edges, sparse=True): + edges = np.array(edges, dtype=np.int32) + data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1] + adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32) + + # build symmetric adjacency matrix + adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx) + adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0])) + if sparse: + adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx) + else: + adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float) + return adj_mx + + +def adj_mx_from_skeleton(skeleton): + num_joints = skeleton.num_joints() + edges = list(filter(lambda x: x[1] >= 0, zip(list(range(0, num_joints)), skeleton.parents()))) + return adj_mx_from_edges(num_joints, edges, sparse=False) diff --git a/VideoToNPZ/common/loss.py b/VideoToNPZ/common/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..128c74881cddb48b365c8aace1f20c91507db5da --- /dev/null +++ b/VideoToNPZ/common/loss.py @@ -0,0 +1,90 @@ +import torch +import numpy as np + + +def mpjpe(predicted, target): + """ + Mean per-joint position error (i.e. mean Euclidean distance), + often referred to as "Protocol #1" in many papers. + """ + assert predicted.shape == target.shape + return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1)) + + +def p_mpjpe(predicted, target): + """ + Pose error: MPJPE after rigid alignment (scale, rotation, and translation), + often referred to as "Protocol #2" in many papers. + """ + assert predicted.shape == target.shape + + muX = np.mean(target, axis=1, keepdims=True) + muY = np.mean(predicted, axis=1, keepdims=True) + + X0 = target - muX + Y0 = predicted - muY + + normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True)) + normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True)) + + X0 /= normX + Y0 /= normY + + H = np.matmul(X0.transpose(0, 2, 1), Y0) + U, s, Vt = np.linalg.svd(H) + V = Vt.transpose(0, 2, 1) + R = np.matmul(V, U.transpose(0, 2, 1)) + + # Avoid improper rotations (reflections), i.e. rotations with det(R) = -1 + sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1)) + V[:, :, -1] *= sign_detR + s[:, -1] *= sign_detR.flatten() + R = np.matmul(V, U.transpose(0, 2, 1)) # Rotation + + tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2) + + a = tr * normX / normY # Scale + t = muX - a * np.matmul(muY, R) # Translation + + # Perform rigid transformation on the input + predicted_aligned = a * np.matmul(predicted, R) + t + + # Return MPJPE + return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1)) + + +def euclidean_losses(actual, target): + """Calculate the average Euclidean loss for multi-point samples. + + Each sample must contain `n` points, each with `d` dimensions. For example, + in the MPII human pose estimation task n=16 (16 joint locations) and + d=2 (locations are 2D). 
+ + Args: + actual (Tensor): Predictions (B x L x D) + target (Tensor): Ground truth target (B x L x D) + """ + + assert actual.size() == target.size(), 'input tensors must have the same size' + + # Calculate Euclidean distances between actual and target locations + diff = actual - target + dist_sq = diff.pow(2).sum(-1, keepdim=False) + dist = dist_sq.sqrt() + return dist + + +def pck(actual, expected, threshold=150): + dists = euclidean_losses(actual, expected) + return (dists < threshold).double().mean().item() + + +def auc(actual, expected): + # This range of thresholds mimics `mpii_compute_3d_pck.m`, which is provided as part of the + # MPI-INF-3DHP test data release. + thresholds = torch.linspace(0, 150, 31).tolist() + + pck_values = torch.DoubleTensor(len(thresholds)) + for i, threshold in enumerate(thresholds): + pck_values[i] = pck(actual, expected, threshold=threshold) + return pck_values.mean().item() diff --git a/VideoToNPZ/common/quaternion.py b/VideoToNPZ/common/quaternion.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9070eebd176a648f29d565bfa260ede08fa32f --- /dev/null +++ b/VideoToNPZ/common/quaternion.py @@ -0,0 +1,36 @@ +import torch + + +def qort(q, v): + """ + Rotate vector(s) v about the rotation described by quaternion(s) q. + Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v, + where * denotes any number of dimensions. + Returns a tensor of shape (*, 3). + """ + assert q.shape[-1] == 4 + assert v.shape[-1] == 3 + assert q.shape[:-1] == v.shape[:-1] + + qvec = q[..., 1:] + uv = torch.cross(qvec, v, dim=len(q.shape)-1) + uuv = torch.cross(qvec, uv, dim=len(q.shape)-1) + return v + 2 * (q[..., :1] * uv + uuv) + + +def qinverse(q, inplace=False): + # We assume the quaternion to be normalized + """ + The quaternions provided in the code are from the camera coordinate to the world coordinate. + Therefore, the quaternions from the world coordinate to the camera coordinate is the transpose of quaternions from + the camera coordinates to the world coordinate.The precondition is that the quaternion is a unit quaternion. + So the inverse of the quaternions is equal to the transposition of the quaternions. + """ + if inplace: + q[..., 1:] *= -1 + return q + else: + w = q[..., :1] + xyz = q[..., 1:] + return torch.cat((w, -xyz), dim=len(q.shape)-1) + diff --git a/VideoToNPZ/common/skeleton.py b/VideoToNPZ/common/skeleton.py new file mode 100644 index 0000000000000000000000000000000000000000..795bb62763fc4e6a8ab30085777d3b8c1105b88c --- /dev/null +++ b/VideoToNPZ/common/skeleton.py @@ -0,0 +1,81 @@ +import numpy as np + + +class Skeleton: + def __init__(self, parents, joints_left, joints_right): + assert len(joints_left) == len(joints_right) + + self._parents = parents + self._joints_left = joints_left + self._joints_right = joints_right + + def num_joints(self): + return len(self._parents) + + def parents(self): + return self._parents + + def has_children(self): + return self._has_children + + def children(self): + return self._children + + def remove_joints(self, joints_to_remove): + """ + Remove the joints specified in 'joints_to_remove'. 
+ """ + valid_joints = [] + for joint in range(len(self._parents)): + if joint not in joints_to_remove: + valid_joints.append(joint) + + for i in range(len(self._parents)): + while self._parents[i] in joints_to_remove: + self._parents[i] = self._parents[self._parents[i]] + + index_offsets = np.zeros(len(self._parents), dtype=int) + new_parents = [] + for i, parent in enumerate(self._parents): + if i not in joints_to_remove: + new_parents.append(parent - index_offsets[parent]) + else: + index_offsets[i:] += 1 + self._parents = np.array(new_parents) + + if self._joints_left is not None: + new_joints_left = [] + for joint in self._joints_left: + if joint in valid_joints: + new_joints_left.append(joint - index_offsets[joint]) + self._joints_left = new_joints_left + + if self._joints_right is not None: + new_joints_right = [] + for joint in self._joints_right: + if joint in valid_joints: + new_joints_right.append(joint - index_offsets[joint]) + self._joints_right = new_joints_right + + self._compute_metadata() + + return valid_joints + + def joints_left(self): + return self._joints_left + + def joints_right(self): + return self._joints_right + + def _compute_metadata(self): + self._has_children = np.zeros(len(self._parents)).astype(bool) + for i, parent in enumerate(self._parents): + if parent != -1: + self._has_children[parent] = True + + self._children = [] + for parents in enumerate(self._parents): + self._children.append([]) + for i, parent in enumerate(self._parents): + if parent != -1: + self._children[parent].append(i) diff --git a/VideoToNPZ/data/data_utils.py b/VideoToNPZ/data/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..034824105ad4cbe2eb7befb0eb5e05a9b6dc6c9d --- /dev/null +++ b/VideoToNPZ/data/data_utils.py @@ -0,0 +1,95 @@ +import numpy as np +import h5py + +mpii_metadata = { + 'layout_name': 'mpii', + 'num_joints': 16, + 'keypoints_symmetry': [ + [3, 4, 5, 13, 14, 15], + [0, 1, 2, 10, 11, 12], + ] +} + +coco_metadata = { + 'layout_name': 'coco', + 'num_joints': 17, + 'keypoints_symmetry': [ + [1, 3, 5, 7, 9, 11, 13, 15], + [2, 4, 6, 8, 10, 12, 14, 16], + ] +} + +h36m_metadata = { + 'layout_name': 'h36m', + 'num_joints': 17, + 'keypoints_symmetry': [ + [4, 5, 6, 11, 12, 13], + [1, 2, 3, 14, 15, 16], + ] +} + +humaneva15_metadata = { + 'layout_name': 'humaneva15', + 'num_joints': 15, + 'keypoints_symmetry': [ + [2, 3, 4, 8, 9, 10], + [5, 6, 7, 11, 12, 13] + ] +} + +humaneva20_metadata = { + 'layout_name': 'humaneva20', + 'num_joints': 20, + 'keypoints_symmetry': [ + [3, 4, 5, 6, 11, 12, 13, 14], + [7, 8, 9, 10, 15, 16, 17, 18] + ] +} + +def suggest_metadata(name): + names = [] + for metadata in [mpii_metadata, coco_metadata, h36m_metadata, humaneva15_metadata, humaneva20_metadata]: + if metadata['layout_name'] in name: + return metadata + names.append(metadata['layout_name']) + raise KeyError('Cannot infer keypoint layout from name "{}". 
Tried {}.'.format(name, names)) + +def import_detectron_poses(path): + # Latin1 encoding because Detectron runs on Python 2.7 + data = np.load(path, encoding='latin1') + kp = data['keypoints'] + bb = data['boxes'] + results = [] + for i in range(len(bb)): + if len(bb[i][1]) == 0: + assert i > 0 + # Use last pose in case of detection failure + results.append(results[-1]) + continue + best_match = np.argmax(bb[i][1][:, 4]) + keypoints = kp[i][1][best_match].T.copy() + results.append(keypoints) + results = np.array(results) + return results[:, :, 4:6] # Soft-argmax + #return results[:, :, [0, 1, 3]] # Argmax + score + + +def import_cpn_poses(path): + data = np.load(path) + kp = data['keypoints'] + return kp[:, :, :2] + + +def import_sh_poses(path): + with h5py.File(path) as hf: + positions = hf['poses'].value + return positions.astype('float32') + +def suggest_pose_importer(name): + if 'detectron' in name: + return import_detectron_poses + if 'cpn' in name: + return import_cpn_poses + if 'sh' in name: + return import_sh_poses + raise KeyError('Cannot infer keypoint format from name "{}". Tried detectron, cpn, sh.'.format(name)) diff --git a/VideoToNPZ/gen_skes.py b/VideoToNPZ/gen_skes.py new file mode 100644 index 0000000000000000000000000000000000000000..0229cc05c9d0d2a9a5c633eb7598f6b8249e7224 --- /dev/null +++ b/VideoToNPZ/gen_skes.py @@ -0,0 +1,116 @@ +import torch +import sys +import os.path as osp +import os +import argparse +import cv2 +import time +import h5py +from tqdm import tqdm +import numpy as np +import warnings +import signal + +warnings.filterwarnings('ignore') + +sys.path.insert(0, osp.dirname(osp.realpath(__file__))) +from tools.utils import get_path +from model.gast_net import SpatioTemporalModel, SpatioTemporalModelOptimized1f +from common.skeleton import Skeleton +from common.graph_utils import adj_mx_from_skeleton +from common.generators import * +from tools.preprocess import load_kpts_json, h36m_coco_format, revise_kpts, revise_skes +from tools.inference import gen_pose +from tools.vis_kpts import plot_keypoint + +cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__) +model_dir = chk_root + 'gastnet/' +sys.path.insert(1, lib_root) +from lib.pose import gen_video_kpts as hrnet_pose +sys.path.pop(1) +sys.path.pop(0) + +skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + joints_left=[4, 5, 6, 11, 12, 13], joints_right=[1, 2, 3, 14, 15, 16]) +adj = adj_mx_from_skeleton(skeleton) + +joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +kps_left, kps_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] + +# Set up signal handler for keyboard interrupt +def signal_handler(sig, frame): + print("\nInterrupted by user, shutting down...") + if 'pool' in locals() and pool is not None: + pool.terminate() + pool.join() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +def load_model_layer(): + chk = model_dir + '81_frame_model.bin' + filters_width = [3, 3, 3, 3] + channels = 64 + + model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filters_width, channels=channels, dropout=0.05) + + checkpoint = torch.load(chk) + model_pos.load_state_dict(checkpoint['model_pos']) + + if torch.cuda.is_available(): + model_pos = model_pos.cuda() + model_pos = model_pos.eval() + + return model_pos + +def generate_skeletons(video=''): + cap = cv2.VideoCapture(video) + width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) + height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + + keypoints, scores = hrnet_pose(video, 
det_dim=416, gen_output=True) + keypoints, scores, valid_frames = h36m_coco_format(keypoints, scores) + re_kpts = revise_kpts(keypoints, scores, valid_frames) + num_person = len(re_kpts) + + model_pos = load_model_layer() + + pad = (81 - 1) // 2 + causal_shift = 0 + + prediction = gen_pose(re_kpts, valid_frames, width, height, model_pos, pad, causal_shift) + + print('Recording 3D Pose:') + + # Add a loading bar + for i in tqdm(range(100)): + time.sleep(0.01) + + # Create output directory with absolute path + output_dir = os.path.abspath('../outputs/') + print(f"Creating output directory: {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + npz_dir = os.path.join(output_dir, 'npz') + print(f"Creating NPZ directory: {npz_dir}") + os.makedirs(npz_dir, exist_ok=True) + + output_npz = os.path.join(npz_dir, os.path.basename(video).split('.')[0] + '.npz') + print(f"Saving NPZ to: {output_npz}") + np.savez_compressed(output_npz, reconstruction=prediction) + print(f"NPZ saved successfully: {output_npz}") + +def arg_parse(): + parser = argparse.ArgumentParser('Generating skeleton demo.') + parser.add_argument('-v', '--video', type=str) + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = arg_parse() + # Use the video path as-is if absolute, otherwise prepend data_root + if os.path.isabs(args.video): + video_path = args.video + else: + video_path = os.path.join(data_root, 'video', args.video) + generate_skeletons(video=video_path) \ No newline at end of file diff --git a/VideoToNPZ/lib/detector/__init__.py b/VideoToNPZ/lib/detector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..33a2b4d8be482786b3169e2e0b77d5ca6ec2daed --- /dev/null +++ b/VideoToNPZ/lib/detector/__init__.py @@ -0,0 +1,6 @@ +import sys +import os.path as osp + +sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), 'yolov3')) +from human_detector import yolo_human_det, load_model +sys.path.pop(0) \ No newline at end of file diff --git a/VideoToNPZ/lib/detector/yolov3/__init__.py b/VideoToNPZ/lib/detector/yolov3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/detector/yolov3/bbox.py b/VideoToNPZ/lib/detector/yolov3/bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..96818bc485a0ae55c0a4e771a2a2af2f026a2221 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/bbox.py @@ -0,0 +1,111 @@ +from __future__ import division + +import torch +import random +import numpy as np +import cv2 + + +def confidence_filter(result, confidence): + conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) + result = result*conf_mask + + return result + + +def confidence_filter_cls(result, confidence): + max_scores = torch.max(result[:,:,5:25], 2)[0] + res = torch.cat((result, max_scores),2) + print(res.shape) + + + cond_1 = (res[:,:,4] > confidence).float() + cond_2 = (res[:,:,25] > 0.995).float() + + conf = cond_1 + cond_2 + conf = torch.clamp(conf, 0.0, 1.0) + conf = conf.unsqueeze(2) + result = result*conf + return result + + +def get_abs_coord(box): + box[2], box[3] = abs(box[2]), abs(box[3]) + x1 = (box[0] - box[2]/2) - 1 + y1 = (box[1] - box[3]/2) - 1 + x2 = (box[0] + box[2]/2) - 1 + y2 = (box[1] + box[3]/2) - 1 + return x1, y1, x2, y2 + + +def sanity_fix(box): + if (box[0] > box[2]): + box[0], box[2] = box[2], box[0] + + if (box[1] > box[3]): + box[1], box[3] = box[3], box[1] + + return box + + +def bbox_iou(box1, box2): + """ + Returns the IoU 
of two bounding boxes + + """ + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + + # Intersection area + if torch.cuda.is_available(): + inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) + else: + inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1, torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) + + # Union Area + b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area) + + return iou + + +def pred_corner_coord(prediction): + #Get indices of non-zero confidence bboxes + ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() + + box = prediction[ind_nz[0], ind_nz[1]] + + box_a = box.new(box.shape) + box_a[:,0] = (box[:,0] - box[:,2]/2) + box_a[:,1] = (box[:,1] - box[:,3]/2) + box_a[:,2] = (box[:,0] + box[:,2]/2) + box_a[:,3] = (box[:,1] + box[:,3]/2) + box[:,:4] = box_a[:,:4] + + prediction[ind_nz[0], ind_nz[1]] = box + + return prediction + + +def write(x, batches, results, colors, classes): + c1 = tuple(x[1:3].int()) + c2 = tuple(x[3:5].int()) + img = results[int(x[0])] + cls = int(x[-1]) + label = "{0}".format(classes[cls]) + color = random.choice(colors) + cv2.rectangle(img, c1, c2,color, 1) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2,color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); + return img diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..ab2c066a216eacbee86e78c28f4d236e5d6b351a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/tiny-yolo-voc.cfg @@ -0,0 +1,134 @@ +[net] +batch=64 +subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +max_batches = 40200 +policy=steps +steps=-1,100,20000,30000 +scales=.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 
+size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + +[region] +anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 +bias_match=1 +classes=20 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d5bdfc1c5bf2d34885d7614d76d980c90373f89a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolo-voc.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=64 +subdivisions=8 +# Training +# batch=64 +# subdivisions=8 +height=416 +width=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 80200 +policy=steps +steps=-1,500,40000,60000 +scales=0.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + + +[region] +anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 +bias_match=1 +classes=20 +coords=4 
+num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg new file mode 100644 index 0000000000000000000000000000000000000000..2a0cd98fbd07c94aa0840c528a12b1b60a004928 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolo.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=425 +activation=linear + + +[region] +anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 +bias_match=1 +classes=80 +coords=4 +num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg b/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..e94193b0e82e56b3b457f3d8c049ffb9ac7ed1f8 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/cfg/yolov3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=16 +width= 320 +height = 320 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + 
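+# residual stage: repeated 1x1 (256-filter) bottleneck and 3x3 (512-filter) convolutions, each pair closed by a shortcut connection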
+[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + 
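+# 255 output filters = 3 anchors per scale x (5 box values + 80 COCO classes); this 1x1 conv feeds the first [yolo] detection layer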
+ +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/VideoToNPZ/lib/detector/yolov3/darknet.py b/VideoToNPZ/lib/detector/yolov3/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..7167784f9e070f42bf2c5fb253f38133903b3b4a --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/darknet.py @@ -0,0 +1,433 @@ +from __future__ import division + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import cv2 +import os +import sys + +from util import convert2cpu as cpu +from util import predict_transform + + +class test_net(nn.Module): + def __init__(self, num_layers, input_size): + super(test_net, self).__init__() + self.num_layers= num_layers + self.linear_1 = nn.Linear(input_size, 5) + self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)]) + self.output = nn.Linear(5,2) + + def forward(self, x): + x = x.view(-1) + fwd = nn.Sequential(self.linear_1, *self.middle, self.output) + return fwd(x) + + +def get_test_input(): + img = cv2.imread("dog-cycle-car.png") + img = cv2.resize(img, (416, 416)) + img_ = img[:, :, ::-1].transpose((2, 0, 1)) + img_ = img_[np.newaxis, :, :, :]/255.0 + img_ = torch.from_numpy(img_).float() + return img_ + + +def parse_cfg(cfgfile): + """ + Takes a configuration file + + Returns a list of blocks. Each blocks describes a block in the neural + network to be built. 
Block is represented as a dictionary in the list + + """ + # cfgfile = os.path.join(sys.path[-1], cfgfile) + file = open(cfgfile, 'r') + lines = file.read().split('\n') # store the lines in a list + lines = [x for x in lines if len(x) > 0] # get read of the empty lines + lines = [x for x in lines if x[0] != '#'] + lines = [x.rstrip().lstrip() for x in lines] + + block = {} + blocks = [] + + for line in lines: + if line[0] == "[": # This marks the start of a new block + if len(block) != 0: + blocks.append(block) + block = {} + block["type"] = line[1:-1].rstrip() + else: + key,value = line.split("=") + block[key.rstrip()] = value.lstrip() + blocks.append(block) + + return blocks + + +class MaxPoolStride1(nn.Module): + def __init__(self, kernel_size): + super(MaxPoolStride1, self).__init__() + self.kernel_size = kernel_size + self.pad = kernel_size - 1 + + def forward(self, x): + padded_x = F.pad(x, (0, self.pad, 0, self.pad), mode="replicate") + pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x) + return pooled_x + + +class EmptyLayer(nn.Module): + def __init__(self): + super(EmptyLayer, self).__init__() + + +class DetectionLayer(nn.Module): + def __init__(self, anchors): + super(DetectionLayer, self).__init__() + self.anchors = anchors + + def forward(self, x, inp_dim, num_classes, confidence): + x = x.data + global CUDA + prediction = x + prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA) + return prediction + + +class Upsample(nn.Module): + def __init__(self, stride=2): + super(Upsample, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert(x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + ws = stride + hs = stride + x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride) + return x + + +class ReOrgLayer(nn.Module): + def __init__(self, stride=2): + super(ReOrgLayer, self).__init__() + self.stride= stride + + def forward(self, x): + assert(x.data.dim() == 4) + B, C, H, W = x.data.shape + hs = self.stride + ws = self.stride + assert(H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H) + assert(W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(W) + x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(-2, -3).contiguous() + x = x.view(B, C, H // hs * W // ws, hs, ws) + x = x.view(B, C, H // hs * W // ws, hs*ws).transpose(-1, -2).contiguous() + x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1, 2).contiguous() + x = x.view(B, C*ws*hs, H // ws, W // ws) + return x + + +def create_modules(blocks): + net_info = blocks[0] # Captures the information about the input and pre-processing + + module_list = nn.ModuleList() + + index = 0 # indexing blocks helps with implementing route layers (skip connections) + prev_filters = 3 + output_filters = [] + + for x in blocks: + module = nn.Sequential() + if x["type"] == "net": + continue + + # If it's a convolutional layer + if x["type"] == "convolutional": + # Get the info about the layer + activation = x["activation"] + try: + batch_normalize = int(x["batch_normalize"]) + bias = False + except: + batch_normalize = 0 + bias = True + + filters= int(x["filters"]) + padding = int(x["pad"]) + kernel_size = int(x["size"]) + stride = int(x["stride"]) + + if padding: + pad = (kernel_size - 1) // 2 + else: + pad = 0 + + # Add the convolutional layer + conv = 
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias) + module.add_module("conv_{0}".format(index), conv) + + # Add the Batch Norm Layer + if batch_normalize: + bn = nn.BatchNorm2d(filters) + module.add_module("batch_norm_{0}".format(index), bn) + + # Check the activation. + # It is either Linear or a Leaky ReLU for YOLO + if activation == "leaky": + activn = nn.LeakyReLU(0.1, inplace = True) + module.add_module("leaky_{0}".format(index), activn) + + # If it's an upsampling layer + # We use Bilinear2dUpsampling + + elif x["type"] == "upsample": + stride = int(x["stride"]) +# upsample = Upsample(stride) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + module.add_module("upsample_{}".format(index), upsample) + + # If it is a route layer + elif (x["type"] == "route"): + x["layers"] = x["layers"].split(',') + + # Start of a route + start = int(x["layers"][0]) + + # end, if there exists one. + try: + end = int(x["layers"][1]) + except: + end = 0 + + # Positive anotation + if start > 0: + start = start - index + + if end > 0: + end = end - index + + route = EmptyLayer() + module.add_module("route_{0}".format(index), route) + + if end < 0: + filters = output_filters[index + start] + output_filters[index + end] + else: + filters = output_filters[index + start] + + # shortcut corresponds to skip connection + elif x["type"] == "shortcut": + from_ = int(x["from"]) + shortcut = EmptyLayer() + module.add_module("shortcut_{}".format(index), shortcut) + + elif x["type"] == "maxpool": + stride = int(x["stride"]) + size = int(x["size"]) + if stride != 1: + maxpool = nn.MaxPool2d(size, stride) + else: + maxpool = MaxPoolStride1(size) + + module.add_module("maxpool_{}".format(index), maxpool) + + # Yolo is the detection layer + elif x["type"] == "yolo": + mask = x["mask"].split(",") + mask = [int(x) for x in mask] + + anchors = x["anchors"].split(",") + anchors = [int(a) for a in anchors] + anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)] + anchors = [anchors[i] for i in mask] + + detection = DetectionLayer(anchors) + module.add_module("Detection_{}".format(index), detection) + + else: + print("Something I dunno") + assert False + + module_list.append(module) + prev_filters = filters + output_filters.append(filters) + index += 1 + + return (net_info, module_list) + + +class Darknet(nn.Module): + def __init__(self, cfgfile): + super(Darknet, self).__init__() + self.blocks = parse_cfg(cfgfile) + self.net_info, self.module_list = create_modules(self.blocks) + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def get_blocks(self): + return self.blocks + + def get_module_list(self): + return self.module_list + + def forward(self, x, CUDA): + detections = [] + modules = self.blocks[1:] + outputs = {} # We cache the outputs for the route layer + + write = 0 + for i in range(len(modules)): + + module_type = (modules[i]["type"]) + if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool": + + x = self.module_list[i](x) + outputs[i] = x + + elif module_type == "route": + layers = modules[i]["layers"] + layers = [int(a) for a in layers] + + if (layers[0]) > 0: + layers[0] = layers[0] - i + + if len(layers) == 1: + x = outputs[i + (layers[0])] + + else: + if (layers[1]) > 0: + layers[1] = layers[1] - i + + map1 = outputs[i + layers[0]] + map2 = outputs[i + layers[1]] + + x = torch.cat((map1, map2), 1) + outputs[i] = x + + elif module_type == "shortcut": + from_ = int(modules[i]["from"]) + x = outputs[i-1] + 
outputs[i+from_] + outputs[i] = x + + elif module_type == 'yolo': + + anchors = self.module_list[i][0].anchors + # Get the input dimensions + inp_dim = int(self.net_info["height"]) + + # Get the number of classes + num_classes = int(modules[i]["classes"]) + + # Output the result + x = x.data + x = predict_transform(x, inp_dim, anchors, num_classes, CUDA) + + if type(x) == int: + continue + + if not write: + detections = x + write = 1 + else: + detections = torch.cat((detections, x), 1) + + outputs[i] = outputs[i-1] + + try: + return detections + except: + return 0 + + def load_weights(self, weightfile): + # Introduction: https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/ + # Open the weights file + # weightfile = os.path.join(sys.path[-1], weightfile) + fp = open(weightfile, "rb") + + # The first 5 values are header information + # 1. Major version number + # 2. Minor Version Number + # 3. Subversion number + # 4.5 Images seen by the network (during training) + header = np.fromfile(fp, dtype = np.int32, count = 5) + self.header = torch.from_numpy(header) + self.seen = self.header[3] + + # The rest of the values are the weights + # Let's load them up + weights = np.fromfile(fp, dtype = np.float32) + + ptr = 0 + for i in range(len(self.module_list)): + module_type = self.blocks[i + 1]["type"] + + if module_type == "convolutional": + model = self.module_list[i] + try: + batch_normalize = int(self.blocks[i+1]["batch_normalize"]) + except: + batch_normalize = 0 + + conv = model[0] + + if (batch_normalize): + bn = model[1] + + # Get the number of weights of Batch Norm Layer + num_bn_biases = bn.bias.numel() + + # Load the weights + bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) + ptr += num_bn_biases + + # Cast the loaded weights into dims of model weights. 
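+                    # (The Darknet weight file stores the four batch-norm tensors back to back as
+                    # biases, weights, running_mean and running_var, each num_bn_biases long;
+                    # the view_as calls below reshape each flat slice to the matching module tensor.)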
+ bn_biases = bn_biases.view_as(bn.bias.data) + bn_weights = bn_weights.view_as(bn.weight.data) + bn_running_mean = bn_running_mean.view_as(bn.running_mean) + bn_running_var = bn_running_var.view_as(bn.running_var) + + # Copy the data to model + bn.bias.data.copy_(bn_biases) + bn.weight.data.copy_(bn_weights) + bn.running_mean.copy_(bn_running_mean) + bn.running_var.copy_(bn_running_var) + + else: + # Number of biases + num_biases = conv.bias.numel() + + # Load the weights + conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases]) + ptr = ptr + num_biases + + # reshape the loaded weights according to the dims of the model weights + conv_biases = conv_biases.view_as(conv.bias.data) + + # Finally copy the data + conv.bias.data.copy_(conv_biases) + + # Let us load the weights for the Convolutional layers + num_weights = conv.weight.numel() + + # Do the same as above for weights + conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights]) + ptr = ptr + num_weights + + conv_weights = conv_weights.view_as(conv.weight.data) + conv.weight.data.copy_(conv_weights) diff --git a/VideoToNPZ/lib/detector/yolov3/data/coco.names b/VideoToNPZ/lib/detector/yolov3/data/coco.names new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/data/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/VideoToNPZ/lib/detector/yolov3/data/pallete b/VideoToNPZ/lib/detector/yolov3/data/pallete new file mode 100644 index 0000000000000000000000000000000000000000..25f0143e9c80c98923dac550f6cd52e20a9dbbe6 Binary files /dev/null and b/VideoToNPZ/lib/detector/yolov3/data/pallete differ diff --git a/VideoToNPZ/lib/detector/yolov3/data/voc.names b/VideoToNPZ/lib/detector/yolov3/data/voc.names new file mode 100644 index 0000000000000000000000000000000000000000..8420ab35ede7400974f25836a6bb543024686a0e --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/data/voc.names @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/VideoToNPZ/lib/detector/yolov3/human_detector.py b/VideoToNPZ/lib/detector/yolov3/human_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..9067051bc4bbb309befe82b69f105e6d3895a997 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/human_detector.py @@ -0,0 +1,155 @@ +from __future__ import division +import time +import torch +import numpy as np +import cv2 +import os +import sys +import random +import pickle as pkl +import argparse + +from util import * +from darknet import Darknet +from preprocess import letterbox_image +import preprocess + + +cur_dir = os.path.dirname(os.path.realpath(__file__)) +project_root = os.path.join(cur_dir, '../../../') +chk_root = 
os.path.join(project_root, 'checkpoint/') +data_root = os.path.join(project_root, 'data/') + + +sys.path.insert(0, project_root) +sys.path.pop(0) + + +def prep_image(img, inp_dim): + """ + Prepare image for inputting to the neural network. + + Returns a Variable + """ + ori_img = img + dim = ori_img.shape[1], ori_img.shape[0] + img = cv2.resize(ori_img, (inp_dim, inp_dim)) + img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() + img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) + return img_, ori_img, dim + + +def write(x, img, colors): + x = [int(i) for i in x] + c1 = tuple(x[0:2]) + c2 = tuple(x[2:4]) + + label = 'People {}'.format(0) + color = (0, 0, 255) + cv2.rectangle(img, c1, c2, color, 2) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1) + return img + + +def arg_parse(): + """" + Parse arguements to the detect module + + """ + parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') + parser.add_argument('--confidence', dest='confidence', type=float, default=0.70, + help='Object Confidence to filter predictions') + parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold') + parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. ' + 'Increase to increase accuracy. Decrease to increase speed. (160, 416)') + parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights', help='The path' + 'of model weight file') + parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='weight file') + parser.add_argument('-a', '--animation', action='store_true', help='output animation') + parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path') + parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png', + help='The input video path') + parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. 
[1, 2]') + return parser.parse_args() + + +def load_model(args=None, CUDA=None, inp_dim=416): + if args is None: + args = arg_parse() + + if CUDA is None: + CUDA = torch.cuda.is_available() + + # Set up the neural network + model = Darknet(args.cfg_file) + model.load_weights(args.weight_file) + + model.net_info["height"] = inp_dim + assert inp_dim % 32 == 0 + assert inp_dim > 32 + + # If there's a GPU availible, put the model on GPU + if CUDA: + model.cuda() + + # Set the model in evaluation mode + model.eval() + + return model + + +def yolo_human_det(img, model=None, reso=416, confidence=0.70): + args = arg_parse() + # args.reso = reso + inp_dim = reso + num_classes = 80 + + CUDA = torch.cuda.is_available() + if model is None: + model = load_model(args, CUDA, inp_dim) + + if type(img) == str: + assert os.path.isfile(img), 'The image path does not exist' + img = cv2.imread(img) + + img, ori_img, img_dim = preprocess.prep_image(img, inp_dim) + img_dim = torch.FloatTensor(img_dim).repeat(1, 2) + + with torch.no_grad(): + if CUDA: + img_dim = img_dim.cuda() + img = img.cuda() + output = model(img, CUDA) + output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True) + + if len(output) == 0: + return None, None + + img_dim = img_dim.repeat(output.size(0), 1) + scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1) + + output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2 + output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2 + output[:, 1:5] /= scaling_factor + + for i in range(output.shape[0]): + output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0]) + output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1]) + + bboxs = [] + scores = [] + for i in range(len(output)): + item = output[i] + bbox = item[1:5].cpu().numpy() + # conver float32 to .2f data + bbox = [round(i, 2) for i in list(bbox)] + score = item[5].cpu().numpy() + bboxs.append(bbox) + scores.append(score) + scores = np.expand_dims(np.array(scores), 1) + bboxs = np.array(bboxs) + + return bboxs, scores diff --git a/VideoToNPZ/lib/detector/yolov3/preprocess.py b/VideoToNPZ/lib/detector/yolov3/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..77041f1c85090e0171a080054e4925a66617a49d --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/preprocess.py @@ -0,0 +1,63 @@ +from __future__ import division + +import torch +import numpy as np +import cv2 +from PIL import Image + + +def letterbox_image(img, inp_dim): + '''resize image with unchanged aspect ratio using padding''' + img_w, img_h = img.shape[1], img.shape[0] + w, h = inp_dim + new_w = int(img_w * min(w/img_w, h/img_h)) + new_h = int(img_h * min(w/img_w, h/img_h)) + resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + + canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) + + canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image + + return canvas + + +def prep_image(img, inp_dim): + """ + Prepare image for inputting to the neural network. 
+ + Returns a Variable + """ + if type(img) == str: + orig_im = cv2.imread(img) + else: + orig_im = img + dim = orig_im.shape[1], orig_im.shape[0] + img = (letterbox_image(orig_im, (inp_dim, inp_dim))) + img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() + img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) + return img_, orig_im, dim + + +def prep_image_pil(img, network_dim): + orig_im = Image.open(img) + img = orig_im.convert('RGB') + dim = img.size + img = img.resize(network_dim) + img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) + img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous() + img = img.view(1, 3, *network_dim) + img = img.float().div(255.0) + return img, orig_im, dim + + +def inp_to_image(inp): + inp = inp.cpu().squeeze() + inp = inp * 255 + try: + inp = inp.data.numpy() + except RuntimeError: + inp = inp.numpy() + inp = inp.transpose(1, 2, 0) + + inp = inp[:, :, ::-1] + return inp diff --git a/VideoToNPZ/lib/detector/yolov3/util.py b/VideoToNPZ/lib/detector/yolov3/util.py new file mode 100644 index 0000000000000000000000000000000000000000..18b79ab4cc88d90afc9c40d4aeeadee2f5b2a1b5 --- /dev/null +++ b/VideoToNPZ/lib/detector/yolov3/util.py @@ -0,0 +1,225 @@ +from __future__ import division + +import torch +import numpy as np +import cv2 +import os.path as osp +from bbox import bbox_iou + + +def get_path(cur_file): + cur_dir = osp.dirname(osp.realpath(cur_file)) + project_root = osp.join(cur_dir, '../../../') + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + + return project_root, chk_root, data_root, cur_dir + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters()) + + +def count_learnable_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def convert2cpu(matrix): + if matrix.is_cuda: + return torch.FloatTensor(matrix.size()).copy_(matrix) + else: + return matrix + + +def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): + batch_size = prediction.size(0) + stride = inp_dim // prediction.size(2) + grid_size = inp_dim // stride + bbox_attrs = 5 + num_classes + num_anchors = len(anchors) + + anchors = [(a[0]/stride, a[1]/stride) for a in anchors] + + prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) + prediction = prediction.transpose(1, 2).contiguous() + prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) + + # Sigmoid the centre_X, centre_Y. 
and object confidencce + prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0]) + prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1]) + prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4]) + + # Add the center offsets + grid_len = np.arange(grid_size) + a, b = np.meshgrid(grid_len, grid_len) + + x_offset = torch.FloatTensor(a).view(-1, 1) + y_offset = torch.FloatTensor(b).view(-1, 1) + + if CUDA: + x_offset = x_offset.cuda() + y_offset = y_offset.cuda() + + x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0) + + prediction[:, :, :2] += x_y_offset + + # log space transform height and the width + anchors = torch.FloatTensor(anchors) + + if CUDA: + anchors = anchors.cuda() + + anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) + prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4])*anchors + + # Softmax the class scores + prediction[:, :, 5: 5 + num_classes] = torch.sigmoid((prediction[:, :, 5: 5 + num_classes])) + + prediction[:, :, :4] *= stride + + return prediction + + +def load_classes(namesfile): + fp = open(namesfile, "r") + names = fp.read().split("\n")[:-1] + return names + + +def get_im_dim(im): + im = cv2.imread(im) + w, h = im.shape[1], im.shape[0] + return w, h + + +def unique(tensor): + tensor_np = tensor.cpu().numpy() + unique_np = np.unique(tensor_np) + unique_tensor = torch.from_numpy(unique_np) + + tensor_res = tensor.new(unique_tensor.shape) + tensor_res.copy_(unique_tensor) + return tensor_res + + +# ADD SOFT NMS +def write_results(prediction, confidence, num_classes, nms=True, nms_conf=0.4, det_hm=False): + """ + https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-4/ + prediction: (B x 10647 x 85) + B: the number of images in a batch, + 10647: the number of bounding boxes predicted per image. (52×52+26×26+13×13)×3=10647 + 85: the number of bounding box attributes. 
(c_x, c_y, w, h, object confidence, and 80 class scores) + + output: Num_obj × [img_index, x_1, y_1, x_2, y_2, object confidence, class_score, label_index] + """ + + conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2) + prediction = prediction*conf_mask + + box_a = prediction.new(prediction.shape) + box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2]/2) + box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3]/2) + box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2]/2) + box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3]/2) + prediction[:, :, :4] = box_a[:, :, :4] + + batch_size = prediction.size(0) + + output = prediction.new(1, prediction.size(2) + 1) + write = False + + for ind in range(batch_size): + # select the image from the batch + image_pred = prediction[ind] + + # Get the class having maximum score, and the index of that class + # Get rid of num_classes softmax scores + # Add the class index and the class score of class having maximum score + max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1) + max_conf = max_conf.float().unsqueeze(1) + max_conf_index = max_conf_index.float().unsqueeze(1) + seq = (image_pred[:, :5], max_conf, max_conf_index) + image_pred = torch.cat(seq, 1) # image_pred:(10647, 7) 7:[x1, y1, x2, y2, obj_score, max_conf, max_conf_index] + + # Get rid of the zero entries + non_zero_ind = (torch.nonzero(image_pred[:, 4])) + image_pred__ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7) + + # filters out people id + if det_hm: + cls_mask = (image_pred__[:, -1] == 0).float() + class_mask_ind = torch.nonzero(cls_mask).squeeze() + image_pred_ = image_pred__[class_mask_ind].view(-1, 7) + + if torch.sum(cls_mask) == 0: + return image_pred_ + else: + image_pred_ = image_pred__ + + # Get the various classes detected in the image + try: + # img_classes = unique(image_pred_[:, -1]) + img_classes = torch.unique(image_pred_[:, -1], sorted=True).float() + except: + continue + + # We will do NMS classwise + # import ipdb;ipdb.set_trace() + for cls in img_classes: + # get the detections with one particular class + cls_mask = image_pred_*(image_pred_[:, -1] == cls).float().unsqueeze(1) + class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze() + image_pred_class = image_pred_[class_mask_ind].view(-1, 7) + + # sort the detections such that the entry with the maximum objectness + # confidence is at the top + conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1] + image_pred_class = image_pred_class[conf_sort_index] + idx = image_pred_class.size(0) + + # from soft_NMS import soft_nms + # boxes = image_pred_class[:,:4] + # scores = image_pred_class[:, 4] + # k, N = soft_nms(boxes, scores, method=2) + # image_pred_class = image_pred_class[k] + + # if nms has to be done + if nms: + # For each detection + for i in range(idx): + # Get the IOUs of all boxes that come after the one we are looking at + # in the loop + try: + ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) + except ValueError: + break + + except IndexError: + break + + # Zero out all the detections that have IoU > threshold + iou_mask = (ious < nms_conf).float().unsqueeze(1) + image_pred_class[i+1:] *= iou_mask + + # Remove the zero entries + non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze() + image_pred_class = image_pred_class[non_zero_ind].view(-1, 7) + + # Concatenate the batch_id of the image to the detection + # this helps us identify which image does the detection correspond to + # We use a 
linear structure to hold ALL the detections from the batch + # the batch_dim is flattened + # batch is identified by extra batch column + + batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) + seq = batch_ind, image_pred_class + if not write: + output = torch.cat(seq, 1) + write = True + else: + out = torch.cat(seq, 1) + output = torch.cat((output, out)) + + return output diff --git a/VideoToNPZ/lib/pose/__init__.py b/VideoToNPZ/lib/pose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..118a33cf7359ce8025294f12f8d83923c2270540 --- /dev/null +++ b/VideoToNPZ/lib/pose/__init__.py @@ -0,0 +1,10 @@ +import sys +import os.path as osp + +sys.path.insert(1, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/pose_estimation')) +from gen_kpts import gen_img_kpts, gen_video_kpts, load_default_model +sys.path.insert(2, osp.join(osp.dirname(osp.realpath(__file__)), 'hrnet/lib/utils')) +from utilitys import plot_keypoint, write, PreProcess, box_to_center_scale, load_json + +sys.path.pop(1) +sys.path.pop(2) \ No newline at end of file diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16854cf2c48afde13cbf5847a202ee8640b3c982 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git 
a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57101e9069350d171760cb936a19c082165ece03 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w32_384x288_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c7011c0f1d3f441840e9693e6923c78fe3eab5 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_256x192_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 
'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2844ff61338e40b774656f884e8a370a104f19f7 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/hrnet/w48_384x288_adam_lr1e-3.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 24 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 24 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + 
SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61e8f78a031f104f90e3b6c9a7388de289391fee --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1523c69190c1a496476219577805f153f30310b --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res101_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 
'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..580c09fbe890c57b4b8683bbb934724ed7ee1cc7 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..156c576478591edddfae0a0849c80e3d1f2e0420 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res152_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 
0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c00e86872fe49a7371c22ad9ef2859bcd6d769e --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_256x192_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caf7726c3cfcfb4a3a7a65029b0ee64f0194d0dc --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/coco/resnet/res50_384x288_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,83 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: 'coco' + ROOT: 'data/coco/' + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + NUM_JOINTS: 17 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + 
NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..894a84457da1e38020aea150b7dd47e2ec49e1bc --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w32_256x256_adam_lr1e-3.yaml @@ -0,0 +1,120 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 16 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + MODEL_FILE: '' + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f621dca2e032f19f996ada5a236bb01aebc26e0 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/hrnet/w48_256x256_adam_lr1e-3.yaml @@ -0,0 +1,120 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 
'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 16 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w48-8ef0771d.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 48 + - 96 + - 192 + - 384 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + MODEL_FILE: '' + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a48291b00e70771e837f5b50a930ba8018a6b78b --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res101_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet101-5d3b4d8f.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 101 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + 
SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6002ac32675beb3b9b753a110d55382a7f9da7ac --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res152_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet152-b121ed2d.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 152 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33db6fbcf6c2e6a33190e221d07e1bcf0735714f --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/experiments/mpii/resnet/res50_256x256_d256x3_adam_lr1e-3.yaml @@ -0,0 +1,86 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: false + DATASET: mpii + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1.0 + ROOT: 'data/mpii/' + ROT_FACTOR: 30 + SCALE_FACTOR: 0.25 + TEST_SET: valid + TRAIN_SET: train +MODEL: + NAME: 'pose_resnet' + PRETRAINED: 'models/pytorch/imagenet/resnet50-19c8e357.pth' + IMAGE_SIZE: + - 256 + - 256 + HEATMAP_SIZE: + - 64 + - 64 + SIGMA: 2 + NUM_JOINTS: 16 + TARGET_TYPE: 'gaussian' + EXTRA: + FINAL_CONV_KERNEL: 1 + DECONV_WITH_BIAS: false + NUM_DECONV_LAYERS: 3 + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_DECONV_KERNELS: + - 4 + - 4 + - 4 + NUM_LAYERS: 50 +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 140 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 
'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true + USE_GT_BBOX: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/VideoToNPZ/lib/pose/hrnet/lib/Makefile b/VideoToNPZ/lib/pose/hrnet/lib/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..38cd2978c6fbb09364b579cb62e4d5abc33f80a2 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/Makefile @@ -0,0 +1,4 @@ +all: + cd nms; python setup_linux.py build_ext --inplace; rm -rf build; cd ../../ +clean: + cd nms; rm *.so; cd ../../ diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a44e926b9b71389cb32a727d33d904bfdbcaaffb --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/__init__.py @@ -0,0 +1,9 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from .default import _C as cfg +from .default import update_config +from .models import MODEL_EXTRAS diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/default.py b/VideoToNPZ/lib/pose/hrnet/lib/config/default.py new file mode 100644 index 0000000000000000000000000000000000000000..030f468ffba91d7e5886783f0971e5e88fc14000 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/default.py @@ -0,0 +1,160 @@ + +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from yacs.config import CfgNode as CN + + +_C = CN() + +_C.OUTPUT_DIR = '' +_C.LOG_DIR = '' +_C.DATA_DIR = '' +_C.GPUS = (0,) +_C.WORKERS = 4 +_C.PRINT_FREQ = 20 +_C.AUTO_RESUME = False +_C.PIN_MEMORY = True +_C.RANK = 0 + +# Cudnn related params +_C.CUDNN = CN() +_C.CUDNN.BENCHMARK = True +_C.CUDNN.DETERMINISTIC = False +_C.CUDNN.ENABLED = True + +# common params for NETWORK +_C.MODEL = CN() +_C.MODEL.NAME = 'pose_hrnet' +_C.MODEL.INIT_WEIGHTS = True +_C.MODEL.PRETRAINED = '' +_C.MODEL.NUM_JOINTS = 17 +_C.MODEL.TAG_PER_JOINT = True +_C.MODEL.TARGET_TYPE = 'gaussian' +_C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256 +_C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32 +_C.MODEL.SIGMA = 2 +_C.MODEL.EXTRA = CN(new_allowed=True) + +_C.LOSS = CN() +_C.LOSS.USE_OHKM = False +_C.LOSS.TOPK = 8 +_C.LOSS.USE_TARGET_WEIGHT = True +_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False + +# DATASET related params +_C.DATASET = CN() +_C.DATASET.ROOT = '' +_C.DATASET.DATASET = 'mpii' +_C.DATASET.TRAIN_SET = 'train' +_C.DATASET.TEST_SET = 'valid' +_C.DATASET.DATA_FORMAT = 'jpg' +_C.DATASET.HYBRID_JOINTS_TYPE = '' +_C.DATASET.SELECT_DATA = False + +# training data augmentation +_C.DATASET.FLIP = True +_C.DATASET.SCALE_FACTOR = 0.25 +_C.DATASET.ROT_FACTOR = 30 +_C.DATASET.PROB_HALF_BODY = 0.0 +_C.DATASET.NUM_JOINTS_HALF_BODY = 8 +_C.DATASET.COLOR_RGB = False + +# train +_C.TRAIN = CN() + +_C.TRAIN.LR_FACTOR = 0.1 +_C.TRAIN.LR_STEP = [90, 110] +_C.TRAIN.LR = 0.001 + +_C.TRAIN.OPTIMIZER = 'adam' +_C.TRAIN.MOMENTUM = 0.9 +_C.TRAIN.WD = 0.0001 +_C.TRAIN.NESTEROV = False +_C.TRAIN.GAMMA1 = 0.99 +_C.TRAIN.GAMMA2 = 0.0 + +_C.TRAIN.BEGIN_EPOCH = 0 +_C.TRAIN.END_EPOCH = 140 + +_C.TRAIN.RESUME = False +_C.TRAIN.CHECKPOINT = '' + +_C.TRAIN.BATCH_SIZE_PER_GPU = 32 +_C.TRAIN.SHUFFLE = True + +# testing +_C.TEST = CN() + +# size of images for each device +_C.TEST.BATCH_SIZE_PER_GPU = 32 +# Test Model Epoch +_C.TEST.FLIP_TEST = False +_C.TEST.POST_PROCESS = False +_C.TEST.SHIFT_HEATMAP = False + +_C.TEST.USE_GT_BBOX = False + +# nms +_C.TEST.IMAGE_THRE = 0.1 +_C.TEST.NMS_THRE = 0.6 +_C.TEST.SOFT_NMS = False +_C.TEST.OKS_THRE = 0.5 +_C.TEST.IN_VIS_THRE = 0.0 +_C.TEST.COCO_BBOX_FILE = '' +_C.TEST.BBOX_THRE = 1.0 +_C.TEST.MODEL_FILE = '' + +# debug +_C.DEBUG = CN() +_C.DEBUG.DEBUG = False +_C.DEBUG.SAVE_BATCH_IMAGES_GT = False +_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False +_C.DEBUG.SAVE_HEATMAPS_GT = False +_C.DEBUG.SAVE_HEATMAPS_PRED = False + + +def update_config(cfg, args): + cfg.defrost() + cfg.merge_from_file(args.cfg) + cfg.merge_from_list(args.opts) + + if args.modelDir: + cfg.OUTPUT_DIR = args.modelDir + + # if args.logDir: + # cfg.LOG_DIR = args.logDir + # + # if args.dataDir: + # cfg.DATA_DIR = args.dataDir + # + # cfg.DATASET.ROOT = os.path.join( + # cfg.DATA_DIR, cfg.DATASET.ROOT + # ) + # + # cfg.MODEL.PRETRAINED = os.path.join( + # cfg.DATA_DIR, cfg.MODEL.PRETRAINED + # ) + # + # if cfg.TEST.MODEL_FILE: + # cfg.TEST.MODEL_FILE = os.path.join( + # cfg.DATA_DIR, cfg.TEST.MODEL_FILE + # ) + + cfg.freeze() + + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'w') as f: + print(_C, file=f) + diff --git a/VideoToNPZ/lib/pose/hrnet/lib/config/models.py b/VideoToNPZ/lib/pose/hrnet/lib/config/models.py new file mode 100644 index 
0000000000000000000000000000000000000000..8e04c4f75a42429142131e1fe4cbbd67fbf4acb8 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/config/models.py @@ -0,0 +1,58 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from yacs.config import CfgNode as CN + + +# pose_resnet related params +POSE_RESNET = CN() +POSE_RESNET.NUM_LAYERS = 50 +POSE_RESNET.DECONV_WITH_BIAS = False +POSE_RESNET.NUM_DECONV_LAYERS = 3 +POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256] +POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4] +POSE_RESNET.FINAL_CONV_KERNEL = 1 +POSE_RESNET.PRETRAINED_LAYERS = ['*'] + +# pose_multi_resoluton_net related params +POSE_HIGH_RESOLUTION_NET = CN() +POSE_HIGH_RESOLUTION_NET.PRETRAINED_LAYERS = ['*'] +POSE_HIGH_RESOLUTION_NET.STEM_INPLANES = 64 +POSE_HIGH_RESOLUTION_NET.FINAL_CONV_KERNEL = 1 + +POSE_HIGH_RESOLUTION_NET.STAGE2 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BRANCHES = 2 +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_BLOCKS = [4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE2.NUM_CHANNELS = [32, 64] +POSE_HIGH_RESOLUTION_NET.STAGE2.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE2.FUSE_METHOD = 'SUM' + +POSE_HIGH_RESOLUTION_NET.STAGE3 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BRANCHES = 3 +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_BLOCKS = [4, 4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE3.NUM_CHANNELS = [32, 64, 128] +POSE_HIGH_RESOLUTION_NET.STAGE3.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE3.FUSE_METHOD = 'SUM' + +POSE_HIGH_RESOLUTION_NET.STAGE4 = CN() +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_MODULES = 1 +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BRANCHES = 4 +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] +POSE_HIGH_RESOLUTION_NET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] +POSE_HIGH_RESOLUTION_NET.STAGE4.BLOCK = 'BASIC' +POSE_HIGH_RESOLUTION_NET.STAGE4.FUSE_METHOD = 'SUM' + + +MODEL_EXTRAS = { + 'pose_resnet': POSE_RESNET, + 'pose_high_resolution_net': POSE_HIGH_RESOLUTION_NET, +} diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b7f1a709982d59002f07fbb9f42d919d9bee17 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/__init__.py @@ -0,0 +1,16 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import models.pose_resnet +import models.pose_hrnet diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..09ff346a1b20ca9e9078714132c01123d1b0b4b1 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_hrnet.py @@ -0,0 +1,501 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = 
num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = 
self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg, **kwargs): + self.inplanes = 64 + extra = cfg['MODEL']['EXTRA'] + super(PoseHighResolutionNet, self).__init__() + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=False) + + self.final_layer = nn.Conv2d( + in_channels=pre_stage_channels[0], + out_channels=cfg['MODEL']['NUM_JOINTS'], + kernel_size=extra['FINAL_CONV_KERNEL'], + stride=1, + padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0 + ) + + self.pretrained_layers = extra['PRETRAINED_LAYERS'] + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + 
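+        # Stacks `blocks` residual units of type `block` (used here only for the stem's layer1,
+        # built from 4 Bottleneck units). A 1x1 conv + BatchNorm projection is created as
+        # `downsample` whenever the stride or channel count changes, so the residual addition
+        # below stays shape-compatible.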
downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multi_scale_output and i == num_modules - 1: + reset_multi_scale_output = False + else: + reset_multi_scale_output = True + + modules.append( + HighResolutionModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + fuse_method, + reset_multi_scale_output + ) + ) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['NUM_BRANCHES']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['NUM_BRANCHES']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['NUM_BRANCHES']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + x = self.final_layer(y_list[0]) + + return x + + def init_weights(self, pretrained=''): + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + + if os.path.isfile(pretrained): + pretrained_state_dict = torch.load(pretrained) + logger.info('=> loading pretrained model {}'.format(pretrained)) + + need_init_state_dict = {} + for name, m in pretrained_state_dict.items(): + if name.split('.')[0] in self.pretrained_layers \ + or self.pretrained_layers[0] is '*': + need_init_state_dict[name] = m + self.load_state_dict(need_init_state_dict, strict=False) + elif pretrained: + logger.error('=> please download pre-trained models first!') + raise ValueError('{} is not exist!'.format(pretrained)) + + +def get_pose_net(cfg, is_train, **kwargs): + model = PoseHighResolutionNet(cfg, **kwargs) + + if is_train and 
cfg['MODEL']['INIT_WEIGHTS']: + model.init_weights(cfg['MODEL']['PRETRAINED']) + + return model diff --git a/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f264dee95cf32ea57e9e1a97952eedd723117249 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/models/pose_resnet.py @@ -0,0 +1,271 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, cfg, **kwargs): + self.inplanes = 64 + extra = cfg.MODEL.EXTRA + self.deconv_with_bias = extra.DECONV_WITH_BIAS + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + 
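+        # layer4 completes the ResNet backbone; with Bottleneck blocks (ResNet-50/101/152) its
+        # output has 512 * block.expansion = 2048 channels, which the deconvolutional head
+        # defined just below upsamples back towards heatmap resolution.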
self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layers = self._make_deconv_layer( + extra.NUM_DECONV_LAYERS, + extra.NUM_DECONV_FILTERS, + extra.NUM_DECONV_KERNELS, + ) + + self.final_layer = nn.Conv2d( + in_channels=extra.NUM_DECONV_FILTERS[-1], + out_channels=cfg.MODEL.NUM_JOINTS, + kernel_size=extra.FINAL_CONV_KERNEL, + stride=1, + padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0 + ) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.deconv_layers(x) + x = self.final_layer(x) + + return x + + def init_weights(self, pretrained=''): + if os.path.isfile(pretrained): + logger.info('=> init deconv weights from normal distribution') + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + logger.info('=> init {}.weight as 1'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + logger.info('=> init final conv weights from normal distribution') + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) + logger.info('=> init {}.bias as 0'.format(name)) + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + + pretrained_state_dict = torch.load(pretrained) + logger.info('=> loading pretrained model 
{}'.format(pretrained)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + # nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + + +resnet_spec = { + 18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3]) +} + + +def get_pose_net(cfg, is_train, **kwargs): + num_layers = cfg.MODEL.EXTRA.NUM_LAYERS + + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, cfg, **kwargs) + + if is_train and cfg.MODEL.INIT_WEIGHTS: + model.init_weights(cfg.MODEL.PRETRAINED) + + return model diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/__init__.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..44fe35fa6f06e02f902129e6465a62b480653c94 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/coco_h36m.py @@ -0,0 +1,51 @@ +import numpy as np + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +spple_keypoints = [10, 8, 0, 7] + + +def coco_h36m(keypoints): + # keypoints: (T, N, 2) or (M, N, 2) + + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32) + htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1] + htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32) + htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3 + + htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32) + htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32) + + keypoints_h36m[:, spple_keypoints, :] = htps_keypoints + keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :] + + keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4 + keypoints_h36m[:, 7, 0] += 0.3*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32)) + keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3 + + # half body: the joint of ankle and knee equal to hip + # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]] + # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]] + return keypoints_h36m + + +h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13] +mpii_order = [i for i in range(16)] +lr_hip_shouler = [2, 3, 12, 13] + + +def mpii_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = 
np.zeros((temporal, 17, 2), dtype=np.float32) + keypoints_h36m[:, h36m_mpii_order] = keypoints + # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32) + keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32) + return keypoints_h36m + + diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..77942ad22ae1ca2b541a0cfd65619f34c8a32bfb --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/inference.py @@ -0,0 +1,82 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import sys +import os.path as osp +import numpy as np + +sys.path.insert(0, osp.join(osp.dirname(osp.realpath(__file__)), '..')) +from utils.transforms import transform_preds +sys.path.pop(0) + + +def get_max_preds(batch_heatmaps): + ''' + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + ''' + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + + +def get_final_preds(config, batch_heatmaps, center, scale): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if config.TEST.POST_PROCESS: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds( + coords[i], center[i], scale[i], [heatmap_width, heatmap_height] + ) + + return preds, maxvals diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..98ccbd5cde7f98daa87581d183228f0e3b4a4318 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/transforms.py @@ -0,0 +1,122 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 + + +def flip_back(output_flipped, matched_parts): + ''' + ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width) + ''' + assert output_flipped.ndim == 4,\ + 'output_flipped should be [batch_size, num_joints, height, width]' + + output_flipped = output_flipped[:, :, :, ::-1] + + # 因为你输入的是翻转后的图像,所以输出的热图他们对应的左右关节也是相反的(训练的时候,输入的是翻转后的图像,target对应的左右关节也是对调过来的)。 + for pair in matched_parts: + tmp = output_flipped[:, pair[0], :, :].copy() + output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] + output_flipped[:, pair[1], :, :] = tmp + + return output_flipped + + +def fliplr_joints(joints, joints_vis, width, matched_parts): + """ + flip coords + """ + # Flip horizontal + joints[:, 0] = width - joints[:, 0] - 1 + + # Change left-right parts + for pair in matched_parts: + joints[pair[0], :], joints[pair[1], :] = \ + joints[pair[1], :], joints[pair[0], :].copy() + joints_vis[pair[0], :], joints_vis[pair[1], :] = \ + joints_vis[pair[1], :], joints_vis[pair[0], :].copy() + + return joints*joints_vis, joints_vis + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def crop(img, center, scale, output_size, rot=0): + trans = get_affine_transform(center, scale, rot, output_size) + + dst_img = cv2.warpAffine( + img, trans, (int(output_size[0]), int(output_size[1])), + flags=cv2.INTER_LINEAR + ) + + return dst_img diff --git a/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py b/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py new file mode 100644 index 
0000000000000000000000000000000000000000..fb33b2e09ae4953cd048532bdab28a380eeecbb6 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/lib/utils/utilitys.py @@ -0,0 +1,170 @@ +import cv2 +import sys +import torch +import json +import torchvision.transforms as transforms +import _init_paths +from utils.transforms import * + +from utils.coco_h36m import coco_h36m +import numpy as np + +joint_pairs = [[0, 1], [1, 3], [0, 2], [2, 4], + [5, 6], [5, 7], [7, 9], [6, 8], [8, 10], + [5, 11], [6, 12], [11, 12], + [11, 13], [12, 14], [13, 15], [14, 16]] + +h36m_pairs = [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), + (12, 13), (8, 14), (14, 15), (15, 16)] + +colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ + [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ + [170, 0, 255], [255, 0, 255]] + + +def plot_keypoint(image, coordinates, confidence, keypoint_thresh=0.3): + # USE cv2 + joint_visible = confidence[:, :, 0] > keypoint_thresh + coordinates = coco_h36m(coordinates) + for i in range(coordinates.shape[0]): + pts = coordinates[i] + + for joint in pts: + cv2.circle(image, (int(joint[0]), int(joint[1])), 8, (255, 255, 255), 1) + + for color_i, jp in zip(colors, h36m_pairs): + if joint_visible[i, jp[0]] and joint_visible[i, jp[1]]: + pt0 = pts[jp, 0] + pt1 = pts[jp, 1] + pt0_0, pt0_1, pt1_0, pt1_1 = int(pt0[0]), int(pt0[1]), int(pt1[0]), int(pt1[1]) + + cv2.line(image, (pt0_0, pt1_0), (pt0_1, pt1_1), color_i, 6) + # cv2.circle(image,(pt0_0, pt0_1), 2, color_i, thickness=-1) + # cv2.circle(image,(pt1_0, pt1_1), 2, color_i, thickness=-1) + return image + + +def write(x, img): + x = [int(i) for i in x] + c1 = tuple(x[0:2]) + c2 = tuple(x[2:4]) + + color = [0, 97, 255] + label = 'People {}'.format(x[-1]) + cv2.rectangle(img, c1, c2, color, 2) + t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, [0, 128, 255], -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1) + return img + + +def load_json(file_path): + with open(file_path, 'r') as fr: + video_info = json.load(fr) + + label = video_info['label'] + label_index = video_info['label_index'] + + num_frames = video_info['data'][-1]['frame_index'] + keypoints = np.zeros((2, num_frames, 17, 2), dtype=np.float32) # (M, T, N, 2) + scores = np.zeros((2, num_frames, 17), dtype=np.float32) # (M, T, N) + + for frame_info in video_info['data']: + frame_index = frame_info['frame_index'] + + for index, skeleton_info in enumerate(frame_info['skeleton']): + pose = skeleton_info['pose'] + score = skeleton_info['score'] + bbox = skeleton_info['bbox'] + + if len(bbox) == 0 or index+1 > 2: + continue + + pose = np.asarray(pose, dtype=np.float32) + score = np.asarray(score, dtype=np.float32) + score = score.reshape(-1) + + keypoints[index, frame_index-1] = pose + scores[index, frame_index-1] = score + + new_kpts = [] + for i in range(keypoints.shape[0]): + kps = keypoints[i] + if np.sum(kps) != 0.: + new_kpts.append(kps) + + new_kpts = np.asarray(new_kpts, dtype=np.float32) + scores = np.asarray(scores, dtype=np.float32) + scores = scores[:, :, :, np.newaxis] + return new_kpts, scores, label, label_index + + +def box_to_center_scale(box, model_image_width, model_image_height): + """convert a box to center,scale information required for pose transformation + Parameters + 
---------- + box : (x1, y1, x2, y2) + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros((2), dtype=np.float32) + x1, y1, x2, y2 = box[:4] + box_width, box_height = x2 - x1, y2 - y1 + + center[0] = x1 + box_width * 0.5 + center[1] = y1 + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + if center[0] != -1: + scale = scale * 1.25 + + return center, scale + + +# Pre-process +def PreProcess(image, bboxs, cfg, num_pos=2): + if type(image) == str: + data_numpy = cv2.imread(image, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) + # data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) + else: + data_numpy = image + + inputs = [] + centers = [] + scales = [] + + for bbox in bboxs[:num_pos]: + c, s = box_to_center_scale(bbox, data_numpy.shape[0], data_numpy.shape[1]) + centers.append(c) + scales.append(s) + r = 0 + + trans = get_affine_transform(c, s, r, cfg.MODEL.IMAGE_SIZE) + input = cv2.warpAffine( + data_numpy, + trans, + (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])), + flags=cv2.INTER_LINEAR) + + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + input = transform(input).unsqueeze(0) + inputs.append(input) + + inputs = torch.cat(inputs) + return inputs, data_numpy, centers, scales diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/__init__.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..f6c9c1200949417dbf48495ae9355d258ef693e4 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/pose_estimation/_init_paths.py @@ -0,0 +1,35 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path as osp +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +def get_path(cur_file): + cur_dir = osp.dirname(osp.realpath(cur_file)) + pre_dir = osp.join(cur_dir, '..') + project_root = osp.abspath(osp.join(cur_dir, '../../../../')) + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + lib_root = osp.join(project_root, 'lib/') + output_root = osp.join(project_root, 'output/') + + return pre_dir, cur_dir, chk_root, data_root, lib_root, output_root + + +this_dir = osp.dirname(osp.realpath(__file__)) + +lib_path = osp.join(this_dir, '..', 'lib') +add_path(lib_path) diff --git a/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py b/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py new file mode 100644 index 0000000000000000000000000000000000000000..eb227aa2c41bbe1d5513cb2ca5b979609b35d703 --- /dev/null +++ b/VideoToNPZ/lib/pose/hrnet/pose_estimation/gen_kpts.py @@ -0,0 +1,474 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os +import os.path as osp +import argparse +import numpy as np +from tqdm import tqdm +import torch +import torch.backends.cudnn as cudnn +import cv2 +from threading import Thread +from queue import Queue +import json +import torch.multiprocessing as mp +from functools import partial +from io import StringIO + +import _init_paths +from _init_paths import get_path +from utils.utilitys import PreProcess, load_json, plot_keypoint, write +from config import cfg, update_config +from utils.transforms import * +from utils.inference import get_final_preds +import models +sys.path.pop(0) + +pre_dir, cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__) +cfg_dir = pre_dir + '/experiments/coco/hrnet/' +model_dir = chk_root + 'hrnet/pose_coco/' + +sys.path.insert(0, lib_root) +from detector import load_model as yolo_model +from detector import yolo_human_det as yolo_det +from track.sort import Sort +sys.path.pop(0) + +# Set multiprocessing start method +mp.set_start_method('spawn', force=True) + +def parse_args(): + parser = argparse.ArgumentParser(description='Train keypoints network') + parser.add_argument('--cfg', type=str, default=cfg_dir + 'w48_384x288_adam_lr1e-3.yaml') + parser.add_argument('opts', nargs=argparse.REMAINDER, default=None) + parser.add_argument('--modelDir', type=str, default=model_dir + 'pose_hrnet_w48_384x288.pth') + parser.add_argument('--det-dim', type=int, default=416) + parser.add_argument('--thred-score', type=float, default=0.70) + parser.add_argument('-a', '--animation', action='store_true', help='output animation') + parser.add_argument('-np', '--num-person', type=int, default=1) + parser.add_argument("-v", "--video", type=str, default='camera') + parser.add_argument('--batch-size', type=int, default=16) + args = parser.parse_args() + return args + +def reset_config(args): + update_config(cfg, args) + cudnn.benchmark = cfg.CUDNN.BENCHMARK + torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC + torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED + +def model_load(config, use_fp16=False): + model = eval('models.' 
+ config.MODEL.NAME + '.get_pose_net')(config, is_train=False) + state_dict = torch.load(config.OUTPUT_DIR, map_location=torch.device('cpu')) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + if torch.cuda.is_available() and use_fp16: + model = model.half().cuda() + elif torch.cuda.is_available(): + model = model.cuda() + model.eval() + return model + +def load_default_model(): + args = parse_args() + reset_config(args) + model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(cfg, is_train=False) + if torch.cuda.is_available(): + model = model.cuda() + state_dict = torch.load(cfg.OUTPUT_DIR) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + model.eval() + return model + +def frame_loader(video, queue, video_length): + cap = cv2.VideoCapture(video) + for _ in range(video_length): + ret, frame = cap.read() + if not ret: + break + queue.put(frame) + queue.put(None) + cap.release() + +def process_batch(frames, human_model, pose_model, det_dim, num_person, thred_score, use_fp16, device): + if not frames: + return [], [] + + batch_bboxs = [] + batch_centers = [] + batch_scales = [] + batch_inputs = [] + + for frame in frames: + bboxs, _ = yolo_det(frame, human_model, reso=det_dim, confidence=thred_score) + if bboxs is None or not bboxs.any(): + continue + + people_track = Sort().update(bboxs) + if people_track.shape[0] == 0: + continue + people_track_ = people_track[-min(num_person, people_track.shape[0]):, :-1] + track_bboxs = [[round(i, 2) for i in list(bbox)] for bbox in people_track_] + + inputs, _, center, scale = PreProcess(frame, track_bboxs, cfg, len(track_bboxs)) + inputs = inputs[:, [2, 1, 0]] + batch_bboxs.append(track_bboxs) + batch_centers.append(center) + batch_scales.append(scale) + batch_inputs.append(inputs) + + if not batch_inputs: + return [], [] + + inputs = torch.cat(batch_inputs, dim=0).to(device) + if use_fp16: + inputs = inputs.half() + + with torch.no_grad(): + outputs = pose_model(inputs) + outputs = outputs.cpu().float() + + kpts_result = [] + scores_result = [] + offset = 0 + for i, (center, scale) in enumerate(zip(batch_centers, batch_scales)): + batch_size = len(batch_bboxs[i]) + preds, maxvals = get_final_preds(cfg, outputs[offset:offset + batch_size].numpy(), + np.asarray(center).flatten(), np.asarray(scale).flatten()) + offset += batch_size + + kpts = np.zeros((len(preds), 17, 2), dtype=np.float32) + scores = np.zeros((len(preds), 17), dtype=np.float32) + for j, (kpt, score) in enumerate(zip(preds, maxvals)): + kpts[j] = kpt + scores[j] = score.squeeze() + kpts_result.append(kpts) + scores_result.append(scores) + + return kpts_result, scores_result + +def gen_video_kpts(video, det_dim=416, num_person=1, gen_output=False, batch_size=16, animation=False): + args = parse_args() + reset_config(args) + + cap = cv2.VideoCapture(video) + assert cap.isOpened(), 'Cannot capture source' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7 + batch_size = min(batch_size, torch.cuda.get_device_properties(0).total_memory // (1024**3) if device.type == 'cuda' else mp.cpu_count()) + + human_model = yolo_model(inp_dim=det_dim) + pose_model = model_load(cfg, use_fp16=use_fp16).to(device) + people_sort = Sort() + + video_length = 
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + print('Recording 2D pose ...') + import sys + from io import StringIO + + if animation: + # Animation mode uses frame-by-frame processing like in the backup code + kpts_result = [] + scores_result = [] + + for i in tqdm(range(video_length)): + ret, frame = cap.read() + if not ret: + break + + # Detect humans + bboxs, scores = yolo_det(frame, human_model, reso=det_dim, confidence=args.thred_score) + + if bboxs is None or not bboxs.any(): + print('No person detected!') + continue + + # Track people + people_track = people_sort.update(bboxs) + + # Select people to track + if people_track.shape[0] == 1: + people_track_ = people_track[-1, :-1].reshape(1, 4) + elif people_track.shape[0] >= 2: + people_track_ = people_track[-num_person:, :-1].reshape(num_person, 4) + people_track_ = people_track_[::-1] + else: + continue + + track_bboxs = [] + for bbox in people_track_: + bbox = [round(i, 2) for i in list(bbox)] + track_bboxs.append(bbox) + + with torch.no_grad(): + # Preprocess and get pose predictions + inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, num_person) + inputs = inputs[:, [2, 1, 0]] + + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + + # Compute coordinates + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + if gen_output: + # Store results for later processing + kpts = np.zeros((num_person, 17, 2), dtype=np.float32) + scores = np.zeros((num_person, 17), dtype=np.float32) + + for j, kpt in enumerate(preds): + kpts[j] = kpt + + for j, score in enumerate(maxvals): + scores[j] = score.squeeze() + + kpts_result.append(kpts) + scores_result.append(scores) + + else: + # Visualize results in real-time + index_bboxs = [bbox + [j] for j, bbox in enumerate(track_bboxs)] + list(map(lambda x: write(x, frame), index_bboxs)) + plot_keypoint(frame, preds, maxvals, 0.3) + + cv2.imshow('frame', frame) + key = cv2.waitKey(1) + if key & 0xFF == ord('q'): + break + else: + # Optimized batch processing with Queue + old_stdout = sys.stdout + sys.stdout = StringIO() + + frame_queue = mp.Queue(maxsize=batch_size * 2) + loader_thread = Thread(target=frame_loader, args=(video, frame_queue, video_length)) + loader_thread.start() + + # Pre-allocate result arrays + max_frames = video_length + kpts_result = np.zeros((max_frames, num_person, 17, 2), dtype=np.float32) + scores_result = np.zeros((max_frames, num_person, 17), dtype=np.float32) + frame_idx = 0 + + pool = None # Initialize pool outside try block for cleanup + try: + if device.type == 'cuda': + # GPU batch processing + batch_frames = [] + with torch.no_grad(): + for _ in tqdm(range(video_length)): + frame = frame_queue.get() + if frame is None: + break + batch_frames.append(frame) + + if len(batch_frames) >= batch_size: + kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model, + det_dim, num_person, args.thred_score, + use_fp16, device) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + batch_frames = [] + + # Process remaining frames + if batch_frames: + kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model, + det_dim, num_person, args.thred_score, + use_fp16, device) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + 
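+                                # confidences are written with the same (frame, person, joint)
+                                # layout as the keypoints, so both arrays can be trimmed to
+                                # frame_idx and transposed together once the loop finishes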
scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + else: + # CPU batch processing with multiprocessing + pool = mp.Pool(processes=mp.cpu_count()) + process_func = partial(process_batch, human_model=human_model, pose_model=pose_model, + det_dim=det_dim, num_person=num_person, thred_score=args.thred_score, + use_fp16=use_fp16, device=device) + + batch_frames = [] + with torch.no_grad(): + for _ in tqdm(range(video_length)): + frame = frame_queue.get() + if frame is None: + break + batch_frames.append(frame) + if len(batch_frames) >= batch_size: + kpts_batch, scores_batch = process_func(batch_frames) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + batch_frames = [] + + # Process remaining frames + if batch_frames: + kpts_batch, scores_batch = process_func(batch_frames) + for kpts, scores in zip(kpts_batch, scores_batch): + kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person] + scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person] + frame_idx += 1 + + pool.close() + pool.join() + except KeyboardInterrupt: + print("\nInterrupted by user, shutting down...") + if pool is not None: + pool.terminate() + pool.join() + loader_thread.join() + sys.stdout = old_stdout + sys.exit(0) + + loader_thread.join() + sys.stdout = old_stdout + + if gen_output and kpts_result.any(): + keypoints = kpts_result[:frame_idx].transpose(1, 0, 2, 3) + scores = scores_result[:frame_idx].transpose(1, 0, 2) + return keypoints, scores + return None, None + +def gen_img_kpts(image, human_model, pose_model, human_sort, det_dim=416, num_person=2): + args = parse_args() + reset_config(args) + thred_score = args.thred_score + + bboxs, bbox_scores = yolo_det(image, human_model, reso=det_dim, confidence=thred_score) + if bboxs is None or not bboxs.any(): + return None, None, None + + people_track = human_sort.update(bboxs) + if people_track.shape[0] == 1: + bboxs_track = people_track[-1].reshape(1, 5) + else: + people_track_ = people_track[-num_person:].reshape(num_person, 5) + bboxs_track = people_track_[::-1] + + with torch.no_grad(): + inputs, origin_img, center, scale = PreProcess(image, bboxs_track, cfg, num_person) + inputs = inputs[:, [2, 1, 0]] + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + kpts = np.zeros((num_person, 17, 2), dtype=np.float32) + scores = np.zeros((num_person, 17, 1), dtype=np.float32) + for i, kpt in enumerate(preds): + kpts[i] = kpt + for i, score in enumerate(maxvals): + scores[i] = score + + human_indexes = [bboxs_track[i, -1] for i in range(len(bboxs_track))] + return kpts, scores, human_indexes + +def generate_ntu_kpts_json(video_path, kpts_file): + args = parse_args() + reset_config(args) + + human_model = yolo_model() + pose_model = model_load(cfg) + people_sort = Sort() + + with torch.no_grad(): + cap = cv2.VideoCapture(video_path) + video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + kpts_info = dict() + data = [] + + for i in tqdm(range(video_length), unit="%", ncols=100): + frame_info = {'frame_index': i + 1} + ret, frame = cap.read() + if not ret: + continue + + try: + bboxs, scores = yolo_det(frame, human_model, confidence=args.thred_score) + if bboxs is None or not bboxs.any(): + print('No person detected!') + continue + + 
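+                # SORT links the current detections to existing tracks so person identities stay
+                # consistent across frames; at most the two most recent tracks are kept below,
+                # since the NTU-style output assumes up to two performers per clip.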
people_track = people_sort.update(bboxs) + if people_track.shape[0] == 1: + people_track_ = people_track[-1, :-1].reshape(1, 4) + elif people_track.shape[0] >= 2: + people_track_ = people_track[-2:, :-1].reshape(2, 4) + people_track_ = people_track_[::-1] + else: + skeleton = {'skeleton': [{'pose': [], 'score': [], 'bbox': []}]} + frame_info.update(skeleton) + data.append(frame_info) + continue + + track_bboxs = [] + for bbox in people_track_: + bbox = [round(i, 3) for i in list(bbox)] + track_bboxs.append(bbox) + + except Exception as e: + print(e) + continue + + inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, args.num_person) + inputs = inputs[:, [2, 1, 0]] + if torch.cuda.is_available(): + inputs = inputs.cuda() + output = pose_model(inputs) + preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(), np.asarray(center), np.asarray(scale)) + + skeleton = [] + for num, bbox in enumerate(track_bboxs): + pose = preds[num].tolist() + score = maxvals[num].tolist() + pose = round_list(pose) + score = round_list(score) + one_skeleton = {'pose': pose, 'score': score, 'bbox': bbox} + skeleton.append(one_skeleton) + + frame_info.update({'skeleton': skeleton}) + data.append(frame_info) + + kpts_info.update({'data': data}) + with open(kpts_file, 'w') as fw: + json.dump(kpts_info, fw) + +def round_list(input_list, decimals=3): + dim = len(input_list) + for i in range(dim): + for j in range(len(input_list[i])): + input_list[i][j] = round(input_list[i][j], decimals) + return input_list + +if __name__ == "__main__": + args = parse_args() + video_path = args.video + + if args.animation: + # Real-time animation mode + gen_video_kpts(video_path, det_dim=args.det_dim, num_person=args.num_person, + gen_output=False, animation=True) + else: + # Process and save keypoints + keypoints, scores = gen_video_kpts(video_path, det_dim=args.det_dim, + num_person=args.num_person, + gen_output=True, + batch_size=args.batch_size) + if keypoints is not None: + output_file = "output.npz" + np.savez(output_file, keypoints=keypoints, scores=scores) + print(f"Saved to {output_file}") \ No newline at end of file diff --git a/VideoToNPZ/lib/track/sort.py b/VideoToNPZ/lib/track/sort.py new file mode 100644 index 0000000000000000000000000000000000000000..1f12f8fd56fa409f885099044199e642d00690aa --- /dev/null +++ b/VideoToNPZ/lib/track/sort.py @@ -0,0 +1,231 @@ +""" + https://arxiv.org/abs/1602.00763 +""" +from __future__ import print_function + +from numba import jit +import os.path +import numpy as np +from skimage import io +from scipy.optimize import linear_sum_assignment +import argparse +from filterpy.kalman import KalmanFilter + + +@jit +def iou(bb_test, bb_gt): + """ + Computes IUO between two bboxes in the form [x1,y1,x2,y2] + """ + xx1 = np.maximum(bb_test[0], bb_gt[0]) + yy1 = np.maximum(bb_test[1], bb_gt[1]) + xx2 = np.minimum(bb_test[2], bb_gt[2]) + yy2 = np.minimum(bb_test[3], bb_gt[3]) + w = np.maximum(0., xx2 - xx1) + h = np.maximum(0., yy2 - yy1) + wh = w * h + o = wh / ((bb_test[2] - bb_test[0]) * (bb_test[3] - bb_test[1]) + + (bb_gt[2] - bb_gt[0]) * (bb_gt[3] - bb_gt[1]) - wh) + + return o + + +def convert_bbox_to_z(bbox): + """ + Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form + [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is + the aspect ratio + """ + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = bbox[0] + w / 2. + y = bbox[1] + h / 2. 
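# NOTE: worked example for the iou() helper above (corner format [x1, y1, x2, y2]),
# not code from this repository. Boxes [0, 0, 10, 10] and [5, 5, 15, 15] overlap
# in a 5 x 5 patch; each has area 100, so IoU = 25 / (100 + 100 - 25) ~= 0.143.
def iou_xyxy(a, b):
    xx1, yy1 = max(a[0], b[0]), max(a[1], b[1])
    xx2, yy2 = min(a[2], b[2]), min(a[3], b[3])
    w, h = max(0.0, xx2 - xx1), max(0.0, yy2 - yy1)
    inter = w * h
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

assert abs(iou_xyxy([0, 0, 10, 10], [5, 5, 15, 15]) - 25 / 175) < 1e-9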
+ s = w * h # scale is just area + r = w / float(h) + return np.array([x, y, s, r]).reshape((4, 1)) + + +def convert_x_to_bbox(x, score=None): + """ + Takes a bounding box in the centre form [x,y,s,r] and returns it in the form + [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right + """ + w = np.sqrt(x[2] * x[3]) + h = x[2] / w + if (score == None): + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) + else: + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score]).reshape((1, 5)) + + +class KalmanBoxTracker(object): + """ + This class represents the internel state of individual tracked objects observed as bbox. + """ + count = 0 + + def __init__(self, bbox): + """ + Initialises a tracker using initial bounding box. + """ + # define constant velocity model + self.kf = KalmanFilter(dim_x=7, dim_z=4) + self.kf.F = np.array( + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]]) + self.kf.H = np.array( + [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]]) + + self.kf.R[2:, 2:] *= 10. + self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities + self.kf.P *= 10. + self.kf.Q[-1, -1] *= 0.01 + self.kf.Q[4:, 4:] *= 0.01 + + self.kf.x[:4] = convert_bbox_to_z(bbox) + self.time_since_update = 0 + self.id = KalmanBoxTracker.count + KalmanBoxTracker.count += 1 + self.history = [] + self.hits = 0 + self.hit_streak = 0 + self.age = 0 + + def update(self, bbox): + """ + Updates the state vector with observed bbox. + """ + self.time_since_update = 0 + self.history = [] + self.hits += 1 + self.hit_streak += 1 + self.kf.update(convert_bbox_to_z(bbox)) + + def predict(self): + """ + Advances the state vector and returns the predicted bounding box estimate. + """ + if ((self.kf.x[6] + self.kf.x[2]) <= 0): + self.kf.x[6] *= 0.0 + self.kf.predict() + self.age += 1 + if (self.time_since_update > 0): + self.hit_streak = 0 + self.time_since_update += 1 + self.history.append(convert_x_to_bbox(self.kf.x)) + return self.history[-1] + + def get_state(self): + """ + Returns the current bounding box estimate. 
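# NOTE: illustrative sketch, not code from this repository. convert_bbox_to_z()
# and convert_x_to_bbox() above are inverses: [x1, y1, x2, y2] <-> [cx, cy, s, r]
# with s = w * h (area) and r = w / h (aspect ratio). For [0, 0, 10, 20]:
# cx=5, cy=10, s=200, r=0.5, and w = sqrt(s * r) = 10, h = s / w = 20 recovers it.
import numpy as np

def to_z(b):
    w, h = b[2] - b[0], b[3] - b[1]
    return np.array([b[0] + w / 2., b[1] + h / 2., w * h, w / float(h)])

def to_bbox(z):
    w = np.sqrt(z[2] * z[3])
    h = z[2] / w
    return np.array([z[0] - w / 2., z[1] - h / 2., z[0] + w / 2., z[1] + h / 2.])

assert np.allclose(to_bbox(to_z([0., 0., 10., 20.])), [0., 0., 10., 20.])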
+ """ + return convert_x_to_bbox(self.kf.x) + + +def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3): + """ + Assigns detections to tracked object (both represented as bounding boxes) + + Returns 3 lists of matches, unmatched_detections and unmatched_trackers + """ + if (len(trackers) == 0): + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + iou_matrix = np.zeros((len(detections), len(trackers)), dtype=np.float32) + + for d, det in enumerate(detections): + for t, trk in enumerate(trackers): + iou_matrix[d, t] = iou(det, trk) + matched_indices = linear_sum_assignment(-iou_matrix) + matched_indices = np.asarray(matched_indices) + matched_indices = matched_indices.transpose() + + unmatched_detections = [] + for d, det in enumerate(detections): + if (d not in matched_indices[:, 0]): + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if (t not in matched_indices[:, 1]): + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if (iou_matrix[m[0], m[1]] < iou_threshold): + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if (len(matches) == 0): + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) + + +class Sort(object): + def __init__(self, max_age=1, min_hits=3): + """ + Sets key parameters for SORT + """ + self.max_age = max_age + self.min_hits = min_hits + self.trackers = [] + self.frame_count = 0 + + def update(self, dets): + """ + Params: + dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...] + Requires: this method must be called once for each frame even with empty detections. + Returns the a similar array, where the last column is the object ID. + + NOTE: The number of objects returned may differ from the number of detections provided. + """ + self.frame_count += 1 + # get predicted locations from existing trackers. 
+ trks = np.zeros((len(self.trackers), 5)) + to_del = [] + ret = [] + for t, trk in enumerate(trks): + pos = self.trackers[t].predict()[0] + trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] + if np.any(np.isnan(pos)): + to_del.append(t) + trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) + for t in reversed(to_del): + self.trackers.pop(t) + matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks) + + # update matched trackers with assigned detections + for t, trk in enumerate(self.trackers): + if t not in unmatched_trks: + d = matched[np.where(matched[:, 1] == t)[0], 0] # d: [n] + trk.update(dets[d, :][0]) + + # create and initialise new trackers for unmatched detections + for i in unmatched_dets: + trk = KalmanBoxTracker(dets[i, :]) + self.trackers.append(trk) + i = len(self.trackers) + for trk in reversed(self.trackers): + d = trk.get_state()[0] + if ((trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits)): + ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) # +1 as MOT benchmark requires positive + i -= 1 + # remove dead tracklet + if (trk.time_since_update > self.max_age): + self.trackers.pop(i) + if (len(ret) > 0): + return np.concatenate(ret) + return np.empty((0, 5)) + + +def parse_args(): + """Parse input arguments.""" + parser = argparse.ArgumentParser(description='SORT demo') + parser.add_argument('--display', dest='display', help='Display online tracker output (slow) [False]', + action='store_true') + args = parser.parse_args() + return args diff --git a/VideoToNPZ/model/gast_net.py b/VideoToNPZ/model/gast_net.py new file mode 100644 index 0000000000000000000000000000000000000000..c066088e9f34d87a656f8dc54e17510aae94af15 --- /dev/null +++ b/VideoToNPZ/model/gast_net.py @@ -0,0 +1,285 @@ +import torch +from torchsummary import summary +import torch.nn as nn +from model.local_attention import LocalGraph +from model.global_attention import MultiGlobalGraph, SingleGlobalGraph + + +class GraphAttentionBlock(nn.Module): + def __init__(self, adj, input_dim, output_dim, p_dropout): + super(GraphAttentionBlock, self).__init__() + + hid_dim = output_dim + self.relu = nn.ReLU(inplace=True) + + self.local_graph_layer = LocalGraph(adj, input_dim, hid_dim, p_dropout) + self.global_graph_layer = MultiGlobalGraph(adj, input_dim, input_dim//4, dropout=p_dropout) + # self.global_graph_layer = SingleGlobalGraph(adj, input_dim, output_dim) + + self.cat_conv = nn.Conv2d(3*output_dim, 2*output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(2*output_dim, momentum=0.1) + + def forward(self, x): + # x: (B, C, T, N) --> (B, T, N, C) + x = x.permute(0, 2, 3, 1) + residual = x + x_ = self.local_graph_layer(x) + y_ = self.global_graph_layer(x) + x = torch.cat((residual, x_, y_), dim=-1) + + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.cat_bn(self.cat_conv(x))) + return x + + +class SpatioTemporalModelBase(nn.Module): + """ + Do not instantiate this class. 
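# NOTE: illustrative usage sketch, not code from this repository. Sort.update()
# must be called once per frame with an (N, 5) array of [x1, y1, x2, y2, score]
# detections and returns a (K, 5) array whose last column is the track ID.
# The import path is an assumption (VideoToNPZ/lib/track on sys.path); the module
# itself needs filterpy, numba and scikit-image installed.
import numpy as np
from sort import Sort

tracker = Sort(max_age=1, min_hits=3)
for t in range(5):
    # one detection drifting to the right; in the pipeline this comes from yolo_det()
    dets = np.array([[10. + 2 * t, 10., 60. + 2 * t, 120., 0.9]])
    tracks = tracker.update(dets)    # (K, 5): [x1, y1, x2, y2, track_id]
    print(t, tracks)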
+ """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal, dropout, channels): + super().__init__() + + # Validate input + for fw in filter_widths: + assert fw % 2 != 0, 'Only odd filter widths are supported' + + self.num_joints_in = num_joints_in + self.in_features = in_features + self.num_joints_out = num_joints_out + self.filter_widths = filter_widths + + self.drop = nn.Dropout(dropout) + self.relu = nn.ReLU(inplace=True) + + self.pad = [filter_widths[0] // 2] + self.init_bn = nn.BatchNorm2d(in_features, momentum=0.1) + self.expand_bn = nn.BatchNorm2d(channels, momentum=0.1) + self.shrink = nn.Conv2d(2**len(self.filter_widths)*channels, 3, 1, bias=False) + + def receptive_field(self): + """ + Return the total receptive field of this model as # of frames. + """ + frames = 0 + for f in self.pad: + frames += f + return 1 + 2 * frames + + def total_causal_shift(self): + """ + Return the asymmetric offset for sequence padding. + The returned value is typically 0 if causal convolutions are disabled, + otherwise it is half the receptive field. + """ + frames = self.causal_shift[0] + next_dilation = self.filter_widths[0] + for i in range(1, len(self.filter_widths)): + frames += self.causal_shift[i] * next_dilation + next_dilation *= self.filter_widths[i] + return frames + + def forward(self, x): + """ + X: (B, C, T, N) + B: batchsize + T: Temporal + N: The number of keypoints + C: The feature dimension of keypoints + """ + + assert len(x.shape) == 4 + assert x.shape[-2] == self.num_joints_in + assert x.shape[-1] == self.in_features + + # X: (B, T, N, C) + x = self._forward_blocks(x) + x = self.shrink(x) + + # x: (B, C, T, N) --> (B, T, N, C) + x = x.permute(0, 2, 3, 1) + + return x + + +class SpatioTemporalModel(SpatioTemporalModelBase): + """ + Reference 3D pose estimation model with temporal convolutions. + This implementation can be used for all use-cases. + """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal=False, dropout=0.25, channels=64, dense=False): + """ + Initialize this model. + + Arguments: + num_joints_in -- number of input joints (e.g. 
17 for Human3.6M) + in_features -- number of input features for each joint (typically 2 for 2D input) + num_joints_out -- number of output joints (can be different than input) + filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field + causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) + dropout -- dropout probability + channels -- number of convolution channels + dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment) + """ + super().__init__(adj, num_joints_in, in_features, num_joints_out, filter_widths, causal, dropout, channels) + + self.expand_conv = nn.Conv2d(in_features, channels, (filter_widths[0], 1), bias=False) + nn.init.kaiming_normal_(self.expand_conv.weight) + + layers_conv = [] + layers_graph_conv = [] + layers_bn = [] + + layers_graph_conv.append(GraphAttentionBlock(adj, channels, channels, p_dropout=dropout)) + + self.causal_shift = [(filter_widths[0]) // 2 if causal else 0] + next_dilation = filter_widths[0] + for i in range(1, len(filter_widths)): + self.pad.append((filter_widths[i] - 1) * next_dilation // 2) + self.causal_shift.append((filter_widths[i] // 2 * next_dilation) if causal else 0) + + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, (filter_widths[i], 1) if not dense else (2*self.pad[-1]+1, 1), + dilation=(next_dilation, 1) if not dense else (1, 1), bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, 1, dilation=1, bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + + layers_graph_conv.append(GraphAttentionBlock(adj, 2**i*channels, 2**i*channels, p_dropout=dropout)) + + next_dilation *= filter_widths[i] + + self.layers_conv = nn.ModuleList(layers_conv) + self.layers_bn = nn.ModuleList(layers_bn) + self.layers_graph_conv = nn.ModuleList(layers_graph_conv) + + def _forward_blocks(self, x): + + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.init_bn(x) + x = self.relu(self.expand_bn(self.expand_conv(x))) + x = self.layers_graph_conv[0](x) + + for i in range(len(self.pad) - 1): + pad = self.pad[i + 1] + shift = self.causal_shift[i + 1] + res = x[:, :, pad + shift: x.shape[2] - pad + shift] + + # x: (B, C, T, N) + x = self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))) + x = res + self.drop(self.relu(self.layers_bn[2 * i + 1](self.layers_conv[2 * i + 1](x)))) + + x = self.layers_graph_conv[i + 1](x) + return x + + +class SpatioTemporalModelOptimized1f(SpatioTemporalModelBase): + """ + 3D pose estimation model optimized for single-frame batching, i.e. + where batches have input length = receptive field, and output length = 1. + This scenario is only used for training when stride == 1. + + This implementation replaces dilated convolutions with strided convolutions + to avoid generating unused intermediate results. The weights are interchangeable + with the reference implementation. + """ + + def __init__(self, adj, num_joints_in, in_features, num_joints_out, + filter_widths, causal=False, dropout=0.25, channels=64): + """ + Initialize this model. + + Arguments: + num_joints_in -- number of input joints (e.g. 
17 for Human3.6M) + in_features -- number of input features for each joint (typically 2 for 2D input) + num_joints_out -- number of output joints (can be different than input) + filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field + causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) + dropout -- dropout probability + channels -- number of convolution channels + """ + super().__init__(adj, num_joints_in, in_features, num_joints_out, filter_widths, causal, dropout, channels) + + self.expand_conv = nn.Conv2d(in_features, channels, (filter_widths[0], 1), stride=(filter_widths[0], 1), bias=False) + nn.init.kaiming_normal_(self.expand_conv.weight) + + layers_conv = [] + layers_graph_conv = [] + layers_bn = [] + + layers_graph_conv.append(GraphAttentionBlock(adj, channels, channels, p_dropout=dropout)) + + self.causal_shift = [(filter_widths[0] // 2) if causal else 0] + next_dilation = filter_widths[0] + for i in range(1, len(filter_widths)): + self.pad.append((filter_widths[i] - 1) * next_dilation // 2) + self.causal_shift.append((filter_widths[i] // 2) if causal else 0) + + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, (filter_widths[i], 1), stride=(filter_widths[i], 1), bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + layers_conv.append(nn.Conv2d(2**i*channels, 2**i*channels, 1, dilation=1, bias=False)) + layers_bn.append(nn.BatchNorm2d(2**i*channels, momentum=0.1)) + + layers_graph_conv.append(GraphAttentionBlock(adj, 2**i*channels, 2**i*channels, p_dropout=dropout)) + + next_dilation *= filter_widths[i] + + self.layers_conv = nn.ModuleList(layers_conv) + self.layers_bn = nn.ModuleList(layers_bn) + self.layers_graph_conv = nn.ModuleList(layers_graph_conv) + + def _forward_blocks(self, x): + # x: (B, T, N, C) --> (B, C, T, N) + x = x.permute(0, 3, 1, 2) + x = self.init_bn(x) + x = self.relu(self.expand_bn(self.expand_conv(x))) + x = self.layers_graph_conv[0](x) + + for i in range(len(self.pad) - 1): + res = x[:, :, self.causal_shift[i+1] + self.filter_widths[i+1]//2 :: self.filter_widths[i+1]] + + # x: (B, C, T, N) + x = self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))) + x = res + self.drop(self.relu(self.layers_bn[2 * i + 1](self.layers_conv[2 * i + 1](x)))) + + x = self.layers_graph_conv[i+1](x) + + return x + + +if __name__ == "__main__": + import torch + import numpy as np + import torchsummary + from common.skeleton import Skeleton + from common.graph_utils import adj_mx_from_skeleton + + h36m_skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23], + joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31]) + + humaneva_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 1, 5, 6, 0, 8, 9, 0, 11, 12, 1], + joints_left=[2, 3, 4, 8, 9, 10], + joints_right=[5, 6, 7, 11, 12, 13]) + + adj = adj_mx_from_skeleton(h36m_skeleton) + model = SpatioTemporalModel(adj, num_joints_in=17, in_features=2, num_joints_out=17, + filter_widths=[3, 3, 3], channels=128) + model = model.cuda() + + model_params = 0 + + for parameter in model.parameters(): + model_params += parameter.numel() + + print('INFO: Trainable parameter count:', model_params) + input = torch.randn(2, 27, 17, 2) + input = input.cuda() + + # summary(model, (27, 15, 2)) + output = model(input) + print(output.shape) diff --git a/VideoToNPZ/model/global_attention.py b/VideoToNPZ/model/global_attention.py new 
file mode 100644 index 0000000000000000000000000000000000000000..f188bb6cfec4bd0a93d5fd58220471b6e5ee53cc --- /dev/null +++ b/VideoToNPZ/model/global_attention.py @@ -0,0 +1,173 @@ +from __future__ import absolute_import, division + +import torch +from torch import nn + + +class GlobalGraph(nn.Module): + """" + Global graph attention layer + """ + + def __init__(self, adj, in_channels, inter_channels=None): + super(GlobalGraph, self).__init__() + + self.adj = adj + self.in_channels = in_channels + self.inter_channels = inter_channels + + self.softmax = nn.Softmax(dim=-1) + self.relu = nn.ReLU(inplace=True) + self.leakyrelu = nn.LeakyReLU(0.2) + + if self.inter_channels == self.in_channels // 2: + self.g_channels = self.in_channels + else: + self.g_channels = self.inter_channels + + assert self.inter_channels > 0 + + self.g = nn.Conv1d(in_channels=self.in_channels, out_channels=self.g_channels, + kernel_size=1, stride=1, padding=0) + self.theta = nn.Conv1d(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + self.phi = nn.Conv1d(in_channels=self.in_channels, out_channels=self.inter_channels, + kernel_size=1, stride=1, padding=0) + + adj_shape = self.adj.shape + self.C_k = nn.Parameter(torch.zeros(adj_shape, dtype=torch.float)) + + self.concat_project = nn.Sequential( + nn.Conv2d(self.inter_channels * 2, 1, 1, 1, 0, bias=False), + ) + + nn.init.kaiming_normal_(self.concat_project[0].weight) + nn.init.kaiming_normal_(self.g.weight) + nn.init.constant_(self.g.bias, 0) + nn.init.kaiming_normal_(self.theta.weight) + nn.init.constant_(self.theta.bias, 0) + nn.init.kaiming_normal_(self.phi.weight) + nn.init.constant_(self.phi.bias, 0) + + def forward(self, x): + batch_size = x.size(0) # x: (B*T, C, N) + + # g_x: (B*T, N, C/k) + g_x = self.g(x).view(batch_size, self.g_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # (B*T, C/k, N, 1) + theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1) + # (B*T, C/k, 1, N) + phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1) + + # h: N, w: N + h = theta_x.size(2) + w = phi_x.size(3) + theta_x = theta_x.expand(-1, -1, -1, w) # (B*T, C/k, N, N) + phi_x = phi_x.expand(-1, -1, h, -1) + + # concat_feature: (B*T, C/k, N, N) + concat_feature = torch.cat([theta_x, phi_x], dim=1) + f = self.concat_project(concat_feature) # (B*T, 1, N, N) + b, _, h, w = f.size() + attention = self.leakyrelu(f.view(b, h, w)) # (B*T, N, N) attention:B_k + + attention = torch.add(self.softmax(attention), self.C_k) + # y: (B*T, C/k, N) + y = torch.matmul(attention, g_x) + y = y.permute(0, 2, 1).contiguous() + y = y.view(batch_size, self.g_channels, *x.size()[2:]) + + return y + + +class MultiGlobalGraph(nn.Module): + def __init__(self, adj, in_channels, inter_channels, dropout=None): + super(MultiGlobalGraph, self).__init__() + + self.num_non_local = in_channels // inter_channels + + attentions = [GlobalGraph(adj, in_channels, inter_channels) for _ in range(self.num_non_local)] + self.attentions = nn.ModuleList(attentions) + + self.cat_conv = nn.Conv2d(in_channels, in_channels, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(in_channels, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, x): + # x: (B, T, K, C) --> (B*T, K, C) + x_size = x.shape + x = x.contiguous() + x = x.view(-1, *x_size[2:]) + # x: (B*T, C, K) + x = x.permute(0, 2, 1) + + x = torch.cat([self.attentions[i](x) for i in 
range(len(self.attentions))], dim=1) + + # x: (B*T, C, K) --> (B*T, K, C) + x = x.permute(0, 2, 1).contiguous() + + # x = torch.matmul(x, self.W) + # x: (B*T, K, C) --> (B, T, K, C) + x = x.view(*x_size) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.cat_bn(self.cat_conv(x))) + + if self.dropout is not None: + x = self.dropout(x) + + # x: (B, C, T, K) --> (B, T, K, C) + x = x.permute(0, 2, 3, 1) + + return x + + +class SingleGlobalGraph(nn.Module): + def __init__(self, adj, in_channels, output_channels, dropout=None): + super(SingleGlobalGraph, self).__init__() + + self.attentions = GlobalGraph(adj, in_channels, output_channels//2) + self.bn = nn.BatchNorm2d(in_channels, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, x): + # x: (B, T, K, C) --> (B*T, K, C) + x_size = x.shape + x = x.contiguous() + x = x.view(-1, *x_size[2:]) + # x: (B*T, C, K) + x = x.permute(0, 2, 1) + + x = self.attentions(x) + + # x: (B*T, C, K) --> (B*T, K, C) + x = x.permute(0, 2, 1).contiguous() + + # x = torch.matmul(x, self.W) + # x: (B*T, K, C) --> (B, T, K, C) + x = x.view(*x_size) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + x = self.relu(self.bn(x)) + + if self.dropout is not None: + x = self.dropout(x) + + # x: (B, C, T, K) --> (B, T, K, C) + x = x.permute(0, 2, 3, 1) + + return x diff --git a/VideoToNPZ/model/local_attention.py b/VideoToNPZ/model/local_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..ceba45dac4b8c89e7117f3bf13761392c0869f32 --- /dev/null +++ b/VideoToNPZ/model/local_attention.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import, division + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class SemCHGraphConv(nn.Module): + """ + Semantic channel-wise graph convolution layer + """ + + def __init__(self, in_features, out_features, adj, bias=False): + super(SemCHGraphConv, self).__init__() + self.in_features = in_features + self.out_features = out_features + + self.W = nn.Parameter(torch.zeros(size=(2, in_features, out_features), dtype=torch.float)) + nn.init.xavier_uniform_(self.W.data, gain=1.414) + + self.adj = adj.unsqueeze(0).repeat(out_features, 1, 1) + self.m = (self.adj > 0) + self.e = nn.Parameter(torch.zeros(out_features, len(self.m[0].nonzero()), dtype=torch.float)) + nn.init.constant_(self.e.data, 1) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.float)) + stdv = 1. 
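# NOTE: illustrative shape check, not code from this repository. With
# in_channels=64 and inter_channels=16 the MultiGlobalGraph above builds four
# GlobalGraph heads and preserves the (B, T, K, C) layout. The import assumes
# VideoToNPZ/ is on sys.path; only the adjacency's shape is used by GlobalGraph,
# so an identity matrix serves as a placeholder.
import torch
from model.global_attention import MultiGlobalGraph

adj = torch.eye(17)                              # placeholder adjacency
layer = MultiGlobalGraph(adj, in_channels=64, inter_channels=16)
x = torch.randn(2, 4, 17, 64)                    # (B, T, K, C)
y = layer(x)
assert y.shape == (2, 4, 17, 64)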
/ math.sqrt(self.W.size(1)) + self.bias.data.uniform_(-stdv, stdv) + else: + self.register_parameter('bias', None) + + def forward(self, input): + # input: (B, T, J, C) + h0 = torch.matmul(input, self.W[0]).unsqueeze(2).transpose(2, 4) # B * T * C * J * 1 + h1 = torch.matmul(input, self.W[1]).unsqueeze(2).transpose(2, 4) # B * T * C * J * 1 + + adj = -9e15 * torch.ones_like(self.adj).to(input.device) # C * J * J + adj[self.m] = self.e.view(-1) + adj = F.softmax(adj, dim=2) + + E = torch.eye(adj.size(1), dtype=torch.float).to(input.device) + E = E.unsqueeze(0).repeat(self.out_features, 1, 1) # C * J * J + + output = torch.matmul(adj * E, h0) + torch.matmul(adj * (1 - E), h1) + output = output.transpose(2, 4).squeeze(2) + + if self.bias is not None: + return output + self.bias.view(1, 1, -1) + else: + return output + + def __repr__(self): + return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' + + +class LocalGraph(nn.Module): + def __init__(self, adj, input_dim, output_dim, dropout=None): + super(LocalGraph, self).__init__() + + num_joints = adj.shape[0] + + # Human3.6M + if num_joints == 17: + distal_joints = [3, 6, 10, 13, 16] + joints_left = [4, 5, 6, 11, 12, 13] + joints_right = [1, 2, 3, 14, 15, 16] + + # Human3.6M detected from Stacked Hourglass + elif num_joints == 16: + distal_joints = [3, 6, 9, 12, 15] + joints_left = [4, 5, 6, 10, 11, 12] + joints_right = [1, 2, 3, 13, 14, 15] + + # HumanEva + elif num_joints == 15: + distal_joints = [4, 7, 10, 13] + joints_left = [2, 3, 4, 8, 9, 10] + joints_right = [5, 6, 7, 11, 12, 13] + + # Human3.6M including toe keypoints + elif num_joints == 19: + distal_joints = [3, 4, 7, 8, 12, 15, 18] + joints_left = [5, 6, 7, 8, 13, 14, 15] + joints_right = [1, 2, 3, 4, 16, 17, 18] + + else: + raise KeyError("The dimension of adj matrix is wrong!") + + adj_sym = torch.zeros_like(adj) + for i in range(num_joints): + for j in range(num_joints): + if i == j: + adj_sym[i][j] = 1 + if i in joints_left: + index = joints_left.index(i) + adj_sym[i][joints_right[index]] = 1.0 + if i in joints_right: + index = joints_right.index(i) + adj_sym[i][joints_left[index]] = 1.0 + + adj_1st_order = adj.matrix_power(1) + for i in np.arange(num_joints): + if i in distal_joints: + adj_1st_order[i] = 0 + + adj_2nd_order = adj.matrix_power(2) + for i in np.arange(num_joints): + if i not in distal_joints: + adj_2nd_order[i] = 0 + + adj_con = adj_1st_order + adj_2nd_order + + self.gcn_sym = SemCHGraphConv(input_dim, output_dim, adj_sym) + self.bn_1 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.gcn_con = SemCHGraphConv(input_dim, output_dim, adj_con) + self.bn_2 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.relu = nn.ReLU() + + self.cat_conv = nn.Conv2d(2*output_dim, output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(output_dim, momentum=0.1) + + if dropout is not None: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward(self, input): + # x: (B, T, K, C) + x = self.gcn_sym(input) + y = self.gcn_con(input) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + y = y.permute(0, 3, 1, 2) + + x = self.relu(self.bn_1(x)) + y = self.relu(self.bn_2(y)) + + output = torch.cat((x, y), dim=1) + output = self.cat_bn(self.cat_conv(output)) + + if self.dropout is not None: + output = self.dropout(self.relu(output)) + else: + output = self.relu(output) + output = output.permute(0, 2, 3, 1) + + return output diff --git a/VideoToNPZ/model/sem_graph_conv.py 
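# NOTE: illustrative sketch of the adjacency construction above, not code from
# this repository. LocalGraph keeps first-order neighbours for ordinary joints
# and second-order neighbours for distal joints (wrists, ankles, head). A toy
# 4-joint chain 0-1-2-3 (with self-loops) and joint 3 treated as distal:
import torch

adj = torch.tensor([[1., 1., 0., 0.],
                    [1., 1., 1., 0.],
                    [0., 1., 1., 1.],
                    [0., 0., 1., 1.]])
distal = [3]
non_distal = [i for i in range(adj.shape[0]) if i not in distal]

adj_1st = adj.matrix_power(1).clone()    # .clone() so adj itself is left untouched
adj_2nd = adj.matrix_power(2)
adj_1st[distal] = 0                      # distal joints drop their one-hop row
adj_2nd[non_distal] = 0                  # all other joints drop the two-hop row
adj_con = adj_1st + adj_2nd
# Row 3 of adj_con now reaches joints 1, 2 and 3 (two hops); rows 0-2 stay one-hop.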
b/VideoToNPZ/model/sem_graph_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..f5ca3dde2d36c07f6edf4908c7fa0a768d10e9e5 --- /dev/null +++ b/VideoToNPZ/model/sem_graph_conv.py @@ -0,0 +1,154 @@ +from __future__ import absolute_import, division + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class SemGraphConv(nn.Module): + """ + Semantic graph convolution layer + """ + + def __init__(self, in_features, out_features, adj, bias=True): + super(SemGraphConv, self).__init__() + self.in_features = in_features + self.out_features = out_features + + self.W = nn.Parameter(torch.zeros(size=(2, in_features, out_features), dtype=torch.float)) + nn.init.xavier_uniform_(self.W.data, gain=1.414) + + self.adj = adj + self.m = (self.adj > 0) + self.e = nn.Parameter(torch.zeros(1, len(self.m.nonzero()), dtype=torch.float)) + nn.init.constant_(self.e.data, 1) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.float)) + stdv = 1. / math.sqrt(self.W.size(2)) + self.bias.data.uniform_(-stdv, stdv) + else: + self.register_parameter('bias', None) + + def forward(self, input): + # X: (B, T, K, C) + + h0 = torch.matmul(input, self.W[0]) + h1 = torch.matmul(input, self.W[1]) + + adj = -9e15 * torch.ones_like(self.adj).to(input.device) + adj[self.m] = self.e + adj = F.softmax(adj, dim=1) + + M = torch.eye(adj.size(0), dtype=torch.float).to(input.device) + + output = torch.matmul(adj * M, h0) + torch.matmul(adj * (1 - M), h1) + + if self.bias is not None: + return output + self.bias.view(1, 1, -1) + else: + return output + + def __repr__(self): + return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' + + +class LocalGraph(nn.Module): + def __init__(self, adj, input_dim, output_dim, dropout=None): + super(LocalGraph, self).__init__() + + num_joints = adj.shape[0] + + # Human3.6M + if num_joints == 17: + distal_joints = [3, 6, 10, 13, 16] + joints_left = [4, 5, 6, 11, 12, 13] + joints_right = [1, 2, 3, 14, 15, 16] + + # Human3.6m with toe keypoitns + elif num_joints == 19: + distal_joints = [3, 4, 7, 8, 12, 15, 18] + joints_left = [5, 6, 7, 8, 13, 14, 15] + joints_right = [1, 2, 3, 4, 16, 17, 18] + + # Human3.6M detected from Stacked Hourglass + elif num_joints == 16: + distal_joints = [3, 6, 9, 12, 15] + joints_left = [4, 5, 6, 10, 11, 12] + joints_right = [1, 2, 3, 13, 14, 15] + + # HumanEva + elif num_joints == 15: + distal_joints = [4, 7, 10, 13] + joints_left = [2, 3, 4, 8, 9, 10] + joints_right = [5, 6, 7, 11, 12, 13] + + else: + print('num_joints: %d' % num_joints) + raise KeyError("The dimension of adj matrix is wrong!") + + adj_sym = torch.zeros_like(adj) + for i in range(num_joints): + for j in range(num_joints): + if i == j: + adj_sym[i][j] = 1 + if i in joints_left: + index = joints_left.index(i) + adj_sym[i][joints_right[index]] = 1.0 + if i in joints_right: + index = joints_right.index(i) + adj_sym[i][joints_left[index]] = 1.0 + + adj_1st_order = adj.matrix_power(1) + # distal_joints = [3, 6, 10, 13, 16] + for i in np.arange(num_joints): + if i in distal_joints: + adj_1st_order[i] = 0 + + adj_2nd_order = adj.matrix_power(2) + # distal_joints = [3, 6, 10, 13, 16] + for i in np.arange(num_joints): + if i not in distal_joints: + adj_2nd_order[i] = 0 + + adj_con = adj_1st_order + adj_2nd_order + + self.gcn_sym = SemGraphConv(input_dim, output_dim, adj_sym) + self.bn_1 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.gcn_con = SemGraphConv(input_dim, 
output_dim, adj_con) + self.bn_2 = nn.BatchNorm2d(output_dim, momentum=0.1) + self.relu = nn.ReLU() + + self.cat_conv = nn.Conv2d(2 * output_dim, output_dim, 1, bias=False) + self.cat_bn = nn.BatchNorm2d(output_dim, momentum=0.1) + + if dropout is not None: + self.dropout = nn.Dropout2d(dropout) + else: + self.dropout = None + + def forward(self, input): + # x: (B, T, K, C) + x = self.gcn_sym(input) + y = self.gcn_con(input) + + # x: (B, T, K, C) --> (B, C, T, K) + x = x.permute(0, 3, 1, 2) + y = y.permute(0, 3, 1, 2) + + x = self.relu(self.bn_1(x)) + y = self.relu(self.bn_2(y)) + + output = torch.cat((x, y), dim=1) + output = self.cat_bn(self.cat_conv(output)) + + if self.dropout is not None: + output = self.dropout(self.relu(output)) + else: + output = self.relu(output) + output = output.permute(0, 2, 3, 1) + + return output + diff --git a/VideoToNPZ/tools/color_edge.py b/VideoToNPZ/tools/color_edge.py new file mode 100644 index 0000000000000000000000000000000000000000..40475ab2311d2d85a595e67bb2558b2faaf0f5fb --- /dev/null +++ b/VideoToNPZ/tools/color_edge.py @@ -0,0 +1,68 @@ +# For better visualization, give different colors to different bones + +h36m_elbow_knee_v1 = [5, 15] +h36m_elbow_knee_v2 = [2, 12] +h36m_wrist_ankle_v1 = [6, 16] +h36m_wrist_ankle_v2 = [3, 13] +h36m_hip_shoulder = [1, 4, 11, 14] +h36m_spine_neck = [7, 9] +h36m_thorax_head = [8, 10] + + +def h36m_color_edge(joint_num): + if joint_num in h36m_elbow_knee_v1: + color = 'peru' # (205, 133, 63) + elif joint_num in h36m_elbow_knee_v2: + color = 'indianred' # (205, 92, 92) + elif joint_num in h36m_wrist_ankle_v1: + color = 'coral' # (255, 127, 80) + elif joint_num in h36m_wrist_ankle_v2: + # color = 'deepskyblue' + color = 'brown' # (165, 42, 42) + elif joint_num in h36m_hip_shoulder: + # color = 'dodgerblue' + color = 'tan' # (210, 180, 140) + elif joint_num in h36m_spine_neck: + color = 'olive' # (128, 128, 0) + else: + color = 'purple' # (128, 0, 128) + return color + + +ntu_elbow_knee_v1 = [6, 18] +ntu_elbow_knee_v2 = [10, 14] +ntu_wrist_ankle_v1 = [8, 19] +ntu_wrist_ankle_v2 = [12, 15] +ntu_hip_shoulder = [13, 17, 5, 9] +ntu_spine_neck = [2, 3] +ntu_thorax_head = [21, 4] +ntu_foot = [16, 20] +ntu_middle_wrist = [7, 11] +ntu_thumbs = [23, 25] +ntu_middle_finger = [22, 24] + + +def ntu_color_edge(joint_num): + if joint_num in ntu_elbow_knee_v1: + color = 'peru' + elif joint_num in ntu_elbow_knee_v2: + color = 'indianred' + elif joint_num in ntu_wrist_ankle_v1: + color = 'coral' + elif joint_num in ntu_wrist_ankle_v2: + color = 'brown' + elif joint_num in ntu_hip_shoulder: + color = 'tan' + elif joint_num in ntu_spine_neck: + color = 'olive' + elif ntu_thorax_head: + color = 'purple' + elif joint_num in ntu_foot: + color = 'deepskyblue' + elif joint_num in ntu_middle_wrist: + color = 'dodgerblue' + elif joint_num in ntu_thumbs: + color = 'red' + else: + color = 'yellow' + return color diff --git a/VideoToNPZ/tools/inference.py b/VideoToNPZ/tools/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3f6ffed4d8318f67b37483e350bae40723ca3d82 --- /dev/null +++ b/VideoToNPZ/tools/inference.py @@ -0,0 +1,110 @@ +import torch +import numpy as np +import sys +import os.path as osp + + +pre_dir = osp.join(osp.dirname(osp.realpath(__file__)), '..') +sys.path.insert(0, pre_dir) +from common.camera import normalize_screen_coordinates, camera_to_world +from common.generators import * +sys.path.pop(0) + + +joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +kps_left, kps_right = [4, 
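# NOTE: likely bug in ntu_color_edge() above (observation, not code from this
# repository). `elif ntu_thorax_head:` tests the non-empty list itself, so the
# branch is always taken and the foot / middle-wrist / thumb / default colours
# below it are unreachable. The surrounding branches suggest a membership test
# was intended:
#     elif joint_num in ntu_thorax_head:
#         color = 'purple'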
5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16] +rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32) + + +def evaluate(test_generator, model_pos): + prediction = [] + + with torch.no_grad(): + for _, _, batch_2d in test_generator.next_epoch(): + + inputs_2d = torch.from_numpy(batch_2d.astype('float32')) + if torch.cuda.is_available(): + inputs_2d = inputs_2d.cuda() + + # Positional model + predicted_3d_pos = model_pos(inputs_2d) + + # Test-time augmentation (if enabled) + if test_generator.augment_enabled(): + # Undo flipping and take average with non-flipped version + predicted_3d_pos[1, :, :, 0] *= -1 + predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left] + predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True) + + prediction.append(predicted_3d_pos.squeeze(0).cpu().numpy()) + + return prediction + + +def gen_pose(kpts, valid_frames, width, height, model_pos, pad, causal_shift=0): + assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape) + assert kpts.shape[0] == len(valid_frames) + + norm_seqs = [] + for index, frames in enumerate(valid_frames): + seq_kps = kpts[index, frames] + norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height) + norm_seqs.append(norm_seq_kps) + + gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True, + kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + + prediction_to_world = [] + for i in range(len(prediction)): + sub_prediction = prediction[i] + + sub_prediction = camera_to_world(sub_prediction, R=rot, t=0) + + # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1) + # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2]) + + prediction_to_world.append(sub_prediction) + + # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32) + return prediction_to_world + + +def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0): + # kpts: (M, T, N, 2) + norm_seqs = [] + for kpt in kpts: + norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height) + norm_seqs.append(norm_kpt) + + gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True, + kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + + prediction_to_world = [] + for i in range(len(prediction)): + sub_prediction = prediction[i][0] + sub_prediction = camera_to_world(sub_prediction, R=rot, t=0) + sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2]) + prediction_to_world.append(sub_prediction) + + return prediction_to_world + + +def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0): + # input (N, 17, 2) return (N, 17, 3) + if not isinstance(kpts, np.ndarray): + kpts = np.array(kpts) + + keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height) + + input_keypoints = keypoints.copy() + # test_time_augmentation True + from common.generators import UnchunkedGenerator + gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift, + augment=True, kps_left=kps_left, kps_right=kps_right, + joints_left=joints_left, joints_right=joints_right) + prediction = evaluate(gen, model_pos) + prediction = camera_to_world(prediction[0], R=rot, t=0) + prediction[:, :, 2] -= np.min(prediction[:, :, 2]) + return 
prediction diff --git a/VideoToNPZ/tools/mpii_coco_h36m.py b/VideoToNPZ/tools/mpii_coco_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9a934ed6c9430bec828934ed376dc8a4ee48fb --- /dev/null +++ b/VideoToNPZ/tools/mpii_coco_h36m.py @@ -0,0 +1,75 @@ +''' +Project: https://github.com/fabro66/GAST-Net-3DPoseEstimation +''' +import numpy as np + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +spple_keypoints = [10, 8, 0, 7] + +scores_h36m_toe_oeder = [1, 2, 3, 5, 6, 7, 11, 13, 14, 15, 16, 17, 18] +kpts_h36m_toe_order = [0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] +scores_coco_order = [12, 14, 16, 11, 13, 15, 0, 5, 7, 9, 6, 8, 10] + +h36m_mpii_order = [3, 2, 1, 4, 5, 6, 0, 8, 9, 10, 16, 15, 14, 11, 12, 13] +mpii_order = [i for i in range(16)] +lr_hip_shouler = [2, 3, 12, 13] + + +def coco_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((temporal, 4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[:, 0, 0] = np.mean(keypoints[:, 1:5, 0], axis=1, dtype=np.float32) + htps_keypoints[:, 0, 1] = np.sum(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1] + htps_keypoints[:, 1, :] = np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32) + htps_keypoints[:, 1, :] += (keypoints[:, 0, :] - htps_keypoints[:, 1, :]) / 3 + + htps_keypoints[:, 2, :] = np.mean(keypoints[:, 11:13, :], axis=1, dtype=np.float32) + htps_keypoints[:, 3, :] = np.mean(keypoints[:, [5, 6, 11, 12], :], axis=1, dtype=np.float32) + + keypoints_h36m[:, spple_keypoints, :] = htps_keypoints + keypoints_h36m[:, h36m_coco_order, :] = keypoints[:, coco_order, :] + + keypoints_h36m[:, 9, :] -= (keypoints_h36m[:, 9, :] - np.mean(keypoints[:, 5:7, :], axis=1, dtype=np.float32)) / 4 + keypoints_h36m[:, 7, 0] += 2*(keypoints_h36m[:, 7, 0] - np.mean(keypoints_h36m[:, [0, 8], 0], axis=1, dtype=np.float32)) + keypoints_h36m[:, 8, 1] -= (np.mean(keypoints[:, 1:3, 1], axis=1, dtype=np.float32) - keypoints[:, 0, 1])*2/3 + + # half body: the joint of ankle and knee equal to hip + # keypoints_h36m[:, [2, 3]] = keypoints_h36m[:, [1, 1]] + # keypoints_h36m[:, [5, 6]] = keypoints_h36m[:, [4, 4]] + + valid_frames = np.where(np.sum(keypoints_h36m.reshape(-1, 34), axis=1) != 0)[0] + return keypoints_h36m, valid_frames + + +def mpii_h36m(keypoints): + temporal = keypoints.shape[0] + keypoints_h36m = np.zeros((temporal, 17, 2), dtype=np.float32) + keypoints_h36m[:, h36m_mpii_order] = keypoints + # keypoints_h36m[:, 7] = np.mean(keypoints[:, 6:8], axis=1, dtype=np.float32) + keypoints_h36m[:, 7] = np.mean(keypoints[:, lr_hip_shouler], axis=1, dtype=np.float32) + + valid_frames = np.where(np.sum(keypoints_h36m.reshape(-1, 34), axis=1) != 0)[0] + return keypoints_h36m, valid_frames + + +def coco_h36m_toe_format(keypoints): + assert len(keypoints.shape) == 3 + temporal = keypoints.shape[0] + + new_kpts = np.zeros((temporal, 19, 2), dtype=np.float32) + + # convert body+foot keypoints + coco_body_kpts = keypoints[:, :17].copy() + h36m_body_kpts, _ = coco_h36m(coco_body_kpts) + new_kpts[:, kpts_h36m_toe_order] = h36m_body_kpts + new_kpts[:, 4] = np.mean(keypoints[:, [20, 21]], axis=1, dtype=np.float32) + new_kpts[:, 8] = np.mean(keypoints[:, [17, 18]], axis=1, dtype=np.float32) + + valid_frames = np.where(np.sum(new_kpts.reshape(-1, 38), axis=-1) != 0)[0] + + return new_kpts, valid_frames diff --git 
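# NOTE: illustrative usage sketch, not code from this repository. coco_h36m()
# above takes COCO-ordered keypoints of shape (T, 17, 2), synthesises the
# head / thorax / pelvis / spine joints, and returns the indices of non-empty
# frames. The import path matches how preprocess.py imports this module
# (VideoToNPZ/ on sys.path).
import numpy as np
from tools.mpii_coco_h36m import coco_h36m

coco_kpts = (np.random.rand(10, 17, 2) * 100).astype(np.float32)   # dummy detections
h36m_kpts, valid_frames = coco_h36m(coco_kpts)
assert h36m_kpts.shape == (10, 17, 2)
assert len(valid_frames) == 10      # every dummy frame has non-zero keypoints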
a/VideoToNPZ/tools/preprocess.py b/VideoToNPZ/tools/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..3502ec725dc3b08c56c4a063016be2e190b6fc49 --- /dev/null +++ b/VideoToNPZ/tools/preprocess.py @@ -0,0 +1,172 @@ +import json +import numpy as np +from tools.mpii_coco_h36m import coco_h36m +import os + + +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +num_person = 2 +num_joints = 17 +img_3d = 100. +ratio_2d_3d = 500. + + +def load_json(file_path): + with open(file_path, 'r') as fr: + video_info = json.load(fr) + + label = video_info['label'] + label_index = video_info['label_index'] + + num_frames = video_info['data'][-1]['frame_index'] + keypoints = np.zeros((num_person, num_frames, num_joints, 2), dtype=np.float32) + scores = np.zeros((num_person, num_frames, num_joints), dtype=np.float32) + + for frame_info in video_info['data']: + frame_index = frame_info['frame_index'] + + for index, skeleton_info in enumerate(frame_info['skeleton']): + pose = skeleton_info['pose'] + score = skeleton_info['score'] + bbox = skeleton_info['bbox'] + + if len(bbox) == 0 or index+1 > num_person: + continue + + pose = np.asarray(pose, dtype=np.float32) + score = np.asarray(score, dtype=np.float32) + score = score.reshape(-1) + + keypoints[index, frame_index-1] = pose + scores[index, frame_index-1] = score + + return keypoints, scores, label, label_index + + +def h36m_coco_format(keypoints, scores): + assert len(keypoints.shape) == 4 and len(scores.shape) == 3 + + h36m_kpts = [] + h36m_scores = [] + valid_frames = [] + + for i in range(keypoints.shape[0]): + kpts = keypoints[i] + score = scores[i] + + new_score = np.zeros_like(score, dtype=np.float32) + + if np.sum(kpts) != 0.: + kpts, valid_frame = coco_h36m(kpts) + h36m_kpts.append(kpts) + valid_frames.append(valid_frame) + + new_score[:, h36m_coco_order] = score[:, coco_order] + new_score[:, 0] = np.mean(score[:, [11, 12]], axis=1, dtype=np.float32) + new_score[:, 8] = np.mean(score[:, [5, 6]], axis=1, dtype=np.float32) + new_score[:, 7] = np.mean(new_score[:, [0, 8]], axis=1, dtype=np.float32) + new_score[:, 10] = np.mean(score[:, [1, 2, 3, 4]], axis=1, dtype=np.float32) + + h36m_scores.append(new_score) + + h36m_kpts = np.asarray(h36m_kpts, dtype=np.float32) + h36m_scores = np.asarray(h36m_scores, dtype=np.float32) + return h36m_kpts, h36m_scores, valid_frames + + +def revise_kpts(h36m_kpts, h36m_scores, valid_frames): + + new_h36m_kpts = np.zeros_like(h36m_kpts) + for index, frames in enumerate(valid_frames): + kpts = h36m_kpts[index, frames] + score = h36m_scores[index, frames] + + # threshold_score = score > 0.3 + # if threshold_score.all(): + # continue + + index_frame = np.where(np.sum(score < 0.3, axis=1) > 0)[0] + + for frame in index_frame: + less_threshold_joints = np.where(score[frame] < 0.3)[0] + + intersect = [i for i in [2, 3, 5, 6] if i in less_threshold_joints] + + if [2, 3, 5, 6] == intersect: + kpts[frame, [2, 3, 5, 6]] = kpts[frame, [1, 1, 4, 4]] + elif [2, 3, 6] == intersect: + kpts[frame, [2, 3, 6]] = kpts[frame, [1, 1, 5]] + elif [3, 5, 6] == intersect: + kpts[frame, [3, 5, 6]] = kpts[frame, [2, 4, 4]] + elif [3, 6] == intersect: + kpts[frame, [3, 6]] = kpts[frame, [2, 5]] + elif [3] == intersect: + kpts[frame, 3] = kpts[frame, 2] + elif [6] == intersect: + kpts[frame, 6] = kpts[frame, 5] + else: + continue + + new_h36m_kpts[index, frames] = kpts + return new_h36m_kpts + + +def load_kpts_json(kpts_json): + keypoints, scores, 
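# NOTE: illustrative usage sketch, not code from this repository. The usual
# post-detection flow with the helpers above: convert the detector's COCO-format
# arrays (keypoints (M, T, 17, 2), scores (M, T, 17)) to H36M ordering, then
# patch low-confidence leg joints. Import path assumes VideoToNPZ/ on sys.path.
import numpy as np
from tools.preprocess import h36m_coco_format, revise_kpts

keypoints = np.random.rand(1, 50, 17, 2).astype(np.float32)   # dummy one-person clip
scores = np.random.rand(1, 50, 17).astype(np.float32)

h36m_kpts, h36m_scores, valid_frames = h36m_coco_format(keypoints, scores)
re_kpts = revise_kpts(h36m_kpts, h36m_scores, valid_frames)
assert re_kpts.shape == h36m_kpts.shape == (1, 50, 17, 2)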
label, label_index = load_json(kpts_json) + h36m_kpts, h36m_scores, valid_frames = h36m_coco_format(keypoints, scores) + re_kpts = revise_kpts(h36m_kpts, h36m_scores, valid_frames) + + return re_kpts, valid_frames, scores, label, label_index + + +def revise_skes(prediction, re_kpts, valid_frames): + new_prediction = np.zeros((*re_kpts.shape[:-1], 3), dtype=np.float32) + for i, frames in enumerate(valid_frames): + new_prediction[i, frames] = prediction[i] + + # The origin of (x, y) is in the upper right corner, + # while the (x,y) coordinates in the image are in the upper left corner. + distance = re_kpts[i, frames[1:], :, :2] - re_kpts[i, frames[:1], :, :2] + distance = np.mean(distance[:, [1, 4, 11, 14]], axis=-2, keepdims=True) + new_prediction[i, frames[1:], :, 0] -= distance[..., 0] / ratio_2d_3d + new_prediction[i, frames[1:], :, 1] += distance[..., 1] / ratio_2d_3d + + # The origin of (x, y) is in the upper right corner, + # while the (x,y) coordinates in the image are in the upper left corner. + # Calculate the relative distance between two people + if len(valid_frames) == 2: + intersec_frames = [frame for frame in valid_frames[0] if frame in valid_frames[1]] + absolute_distance = re_kpts[0, intersec_frames[:1], :, :2] - re_kpts[1, intersec_frames[:1], :, :2] + absolute_distance = np.mean(absolute_distance[:, [1, 4, 11, 14]], axis=-2, keepdims=True) / 2. + + new_prediction[0, valid_frames[0], :, 0] -= absolute_distance[..., 0] / ratio_2d_3d + new_prediction[0, valid_frames[0], :, 1] += absolute_distance[..., 1] / ratio_2d_3d + + new_prediction[1, valid_frames[1], :, 0] += absolute_distance[..., 0] / ratio_2d_3d + new_prediction[1, valid_frames[1], :, 1] -= absolute_distance[..., 1] / ratio_2d_3d + + # Pre-processing the case where the movement of Z axis is relatively large, such as 'sitting down' + # Remove the absolute distance + # new_prediction[:, :, 1:] -= new_prediction[:, :, :1] + # new_prediction[:, :, 0] = 0 + new_prediction[:, :, :, 2] -= np.amin(new_prediction[:, :, :, 2]) + + return new_prediction + + +def revise_skes_real_time(prediction, re_kpts, width): + ratio_2d_3d_width = ratio_2d_3d * (width / 1920) + # prediction: (M, N, 3) + new_prediction = np.zeros((len(prediction), 17, 3), dtype=np.float32) + for i in range(len(prediction)): + new_prediction[i] = prediction[i] + + initial_distance = re_kpts[i] + initial_distance = np.mean(initial_distance[[1, 4, 11, 14], :], axis=0) + new_prediction[i, :, 0] -= (initial_distance[0] - 3*width/5) / ratio_2d_3d_width + new_prediction[i, :, 1] += (initial_distance[1] - width/5) / ratio_2d_3d_width + + new_prediction[:, :, 2] -= np.amin(new_prediction[:, :, 2]) + + return new_prediction diff --git a/VideoToNPZ/tools/utils.py b/VideoToNPZ/tools/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8d6260ffe4ab33e7e1a09a2360f641fd09502ec9 --- /dev/null +++ b/VideoToNPZ/tools/utils.py @@ -0,0 +1,171 @@ +import torch +import numpy as np +import hashlib +import cv2 +import os.path as osp + + +spple_keypoints = [10, 8, 0, 7] +h36m_coco_order = [9, 11, 14, 12, 15, 13, 16, 4, 1, 5, 2, 6, 3] +coco_order = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] +joint_pairs = [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (0, 7), (7, 8), (8, 9), (9, 10), + (8, 11), (11, 12), (12, 13), (8, 14), (14, 15), (15, 16)] +colors_kps = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], + [50, 205, 50], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 
255], + [170, 0, 255], [255, 0, 255]] + + +def wrap(func, *args, unsqueeze=False): + """ + Wrap a torch function so it can be called with NumPy arrays. + Input and return types are seamlessly converted. + """ + + args = list(args) + for i, arg in enumerate(args): + if type(arg) == np.ndarray: + args[i] = torch.from_numpy(arg) + if unsqueeze: + args[i] = args[i].unsqueeze(0) + + result = func(*args) + + if isinstance(result, tuple): + result = list(result) + for i, res in enumerate(result): + if type(res) == torch.Tensor: + if unsqueeze: + res = res.squeeze(0) + result[i] = res.numpy() + return tuple(result) + elif type(result) == torch.Tensor: + if unsqueeze: + result = result.squeeze(0) + result = result.numpy() + return result + else: + return result + + +def deterministic_random(min_value, max_value, data): + """ + Encrypted, in order to generate the same size each time + """ + + digest = hashlib.sha256(data.encode()).digest() + raw_value = int.from_bytes(digest[:4], byteorder="litter", signed=False) + return int(raw_value / (2**32 - 1) * (max_value - min_value) + min_value) + + +def resize_img(frame, max_length=640): + H, W = frame.shape[:2] + if max(W, H) > max_length: + if W > H: + W_resize = max_length + H_resize = int(H * max_length / W) + else: + H_resize = max_length + W_resize = int(W * max_length / H) + frame = cv2.resize(frame, (W_resize, H_resize), interpolation=cv2.INTER_AREA) + return frame, W_resize, H_resize + + else: + return frame, W, H + + +def draw_2Dimg(img, kpts, scores, display=None): + # kpts : (M, 17, 2) scores: (M, 17) + im = img.copy() + for kpt, score in zip(kpts, scores): + for i, item in enumerate(kpt): + score_val = score[i] + if score_val > 0.3: + x, y = int(item[0]), int(item[1]) + cv2.circle(im, (x, y), 4, (255, 255, 255), 1) + for pair, color in zip(joint_pairs, colors_kps): + j, j_parent = pair + pt1 = (int(kpt[j][0]), int(kpt[j][1])) + pt2 = (int(kpt[j_parent][0]), int(kpt[j_parent][1])) + cv2.line(im, pt1, pt2, color, 2) + + if display: + cv2.imshow('frame', im) + cv2.waitKey(1) + return im + + +def get_path(cur_file): + project_root = osp.dirname(osp.realpath(cur_file)) + chk_root = osp.join(project_root, 'checkpoint/') + data_root = osp.join(project_root, 'data/') + lib_root = osp.join(project_root, 'lib/') + output_root = osp.join(project_root, 'output/') + + return project_root, chk_root, data_root, lib_root, output_root + + +def coco_h36m_frame(keypoints): + keypoints_h36m = np.zeros_like(keypoints, dtype=np.float32) + htps_keypoints = np.zeros((4, 2), dtype=np.float32) + + # htps_keypoints: head, thorax, pelvis, spine + htps_keypoints[0, 0] = np.mean(keypoints[1:5, 0], axis=0, dtype=np.float32) + htps_keypoints[0, 1] = np.sum(keypoints[1:3, 1], axis=0, dtype=np.float32) - keypoints[0, 1] + htps_keypoints[1, :] = np.mean(keypoints[5:7, :], axis=0, dtype=np.float32) + htps_keypoints[1, :] += (keypoints[0, :] - htps_keypoints[1, :]) / 3 + + htps_keypoints[2, :] = np.mean(keypoints[11:13, :], axis=0, dtype=np.float32) + htps_keypoints[3, :] = np.mean(keypoints[[5, 6, 11, 12], :], axis=0, dtype=np.float32) + + keypoints_h36m[spple_keypoints, :] = htps_keypoints + keypoints_h36m[h36m_coco_order, :] = keypoints[coco_order, :] + + keypoints_h36m[9, :] -= (keypoints_h36m[9, :] - np.mean(keypoints[5:7, :], axis=0, dtype=np.float32)) / 4 + keypoints_h36m[7, 0] += 0.3 * (keypoints_h36m[7, 0] - np.mean(keypoints_h36m[[0, 8], 0], axis=0, dtype=np.float32)) + keypoints_h36m[8, 1] -= (np.mean(keypoints[1:3, 1], axis=0, dtype=np.float32) - keypoints[0, 1]) * 2 
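# NOTE: likely bug in deterministic_random() above (not code from this
# repository). int.from_bytes() only accepts byteorder="little" or "big", so
# byteorder="litter" raises ValueError the first time the helper is called.
# Corrected sketch of the same helper:
import hashlib

def deterministic_random(min_value, max_value, data):
    # Map a string to a stable pseudo-random integer between min_value and max_value.
    digest = hashlib.sha256(data.encode()).digest()
    raw_value = int.from_bytes(digest[:4], byteorder="little", signed=False)
    return int(raw_value / (2 ** 32 - 1) * (max_value - min_value) + min_value)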
/ 3 + + return keypoints_h36m + + +def h36m_coco_kpts(keypoints, scores): + # keypoints: (M, N, C) scores:(M, N, 1) + assert len(keypoints.shape) == 3 and len(scores.shape) == 3 + scores.squeeze(axis=2) + + h36m_kpts = [] + h36m_scores = [] + for i in range(keypoints.shape[0]): + kpts = keypoints[i] + score = scores[i] + + new_score = np.zeros_like(score, dtype=np.float32) + + if np.sum(kpts) != 0.: + new_score[h36m_coco_order] = score[coco_order] + new_score[0] = np.mean(score[[11, 12]], axis=0, dtype=np.float32) + new_score[8] = np.mean(score[[5, 6]], axis=0, dtype=np.float32) + new_score[7] = np.mean(new_score[[0, 8]], axis=0, dtype=np.float32) + new_score[10] = np.mean(score[[1, 2, 3, 4]], axis=0, dtype=np.float32) + + h36m_scores.append(new_score) + + kpts = coco_h36m_frame(kpts) + less_threshold_joints = np.where(new_score < 0.3)[0] + intersect = [i for i in [2, 3, 5, 6] if i in less_threshold_joints] + + if [2, 3, 5, 6] == intersect: + kpts[[2, 3, 5, 6]] = kpts[[1, 1, 4, 4]] + elif [2, 3, 6] == intersect: + kpts[[2, 3, 6]] = kpts[[1, 1, 5]] + elif [3, 5, 6] == intersect: + kpts[[3, 5, 6]] = kpts[[2, 4, 4]] + elif [3, 6] == intersect: + kpts[[3, 6]] = kpts[[2, 5]] + elif [3] == intersect: + kpts[3] = kpts[2] + elif [6] == intersect: + kpts[6] = kpts[5] + + h36m_kpts.append(kpts) + + return h36m_kpts, h36m_scores diff --git a/VideoToNPZ/tools/vis_h36m.py b/VideoToNPZ/tools/vis_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..c3eb8103a16fdb25a23893b4dd7ed9f1fa249b90 --- /dev/null +++ b/VideoToNPZ/tools/vis_h36m.py @@ -0,0 +1,249 @@ +import matplotlib +matplotlib.use('Agg') + +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation, writers +from mpl_toolkits.mplot3d import Axes3D +import numpy as np +import subprocess as sp +from tools.color_edge import h36m_color_edge + + +def get_resolution(filename): + command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=width,height', '-of', 'csv=p=0', filename] + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + for line in pipe.stdout: + w, h = line.decode().strip().split(',') + return int(w), int(h) + + +def get_fps(filename): + command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=r_frame_rate', '-of', 'csv=p=0', filename] + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + for line in pipe.stdout: + a, b = line.decode().strip().split('/') + return int(a) / int(b) + + +def read_video(filename, skip=0, limit=-1): + w, h = get_resolution(filename) + + command = ['ffmpeg', + '-i', filename, + '-f', 'image2pipe', + '-pix_fmt', 'rgb24', + '-vsync', '0', + '-vcodec', 'rawvideo', '-'] + + i = 0 + with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: + while True: + data = pipe.stdout.read(w * h * 3) + if not data: + break + i += 1 + if i > limit and limit != -1: + continue + if i > skip: + yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3)) + + +def downsample_tensor(X, factor): + length = X.shape[0] // factor * factor + return np.mean(X[:length].reshape(-1, factor, *X.shape[1:]), axis=1) + + +def render_animation(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, viewport, limit=-1, + downsample=1, size=5, input_video_path=None, com_reconstrcution=False, input_video_skip=0): + """ + TODO + Render an animation. 
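# NOTE: likely oversight in h36m_coco_kpts() above (observation, not code from
# this repository). numpy's ndarray.squeeze() returns a new array and does not
# modify its input, so the bare `scores.squeeze(axis=2)` call leaves scores with
# shape (M, N, 1). If the trailing axis is meant to be dropped, the result has
# to be assigned back:
#     scores = scores.squeeze(axis=2)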
The supported output modes are: + -- 'interactive': display an interactive figure + (also works on notebooks if associated with %matplotlib inline) + -- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). + -- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). + -- 'filename.gif': render and export the animation a gif file (requires imagemagick). + """ + plt.ioff() + + num_person = keypoints.shape[1] + if num_person == 2 and com_reconstrcution: + + fig = plt.figure(figsize=(size * (1 + len(poses)), size)) + ax_in = fig.add_subplot(1, 2, 1) + else: + fig = plt.figure(figsize=(size * (1 + len(poses)), size)) + ax_in = fig.add_subplot(1, 1 + len(poses), 1) + + ax_in.get_xaxis().set_visible(False) + ax_in.get_yaxis().set_visible(False) + ax_in.set_axis_off() + # ax_in.set_title('Input') + + ax_3d = [] + lines_3d = [] + radius = 1.7 + + if num_person == 2 and com_reconstrcution: + ax = fig.add_subplot(1, 2, 2, projection='3d') + ax.view_init(elev=15., azim=azim) + ax.set_xlim3d([-radius, radius]) + ax.set_zlim3d([0, radius]) + ax.set_ylim3d([-radius, radius]) + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = 7.5 + ax_3d.append(ax) + lines_3d.append([]) + + poses = list(poses.values()) + else: + for index, (title, data) in enumerate(poses.items()): + ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') + + ax.view_init(elev=15., azim=azim) + ax.set_xlim3d([-radius / 2, radius / 2]) + ax.set_zlim3d([0, radius]) + ax.set_ylim3d([-radius / 2, radius / 2]) + ax.set_aspect('equal') + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = 7.5 + # ax.set_title(title) # , pad=35 + ax_3d.append(ax) + lines_3d.append([]) + poses = list(poses.values()) + + # Decode video + if input_video_path is None: + # Black background + all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') + else: + # Load video using ffmpeg + all_frames = [] + for f in read_video(input_video_path, skip=input_video_skip, limit=limit): + all_frames.append(f) + effective_length = min(keypoints.shape[0], len(all_frames)) + all_frames = all_frames[:effective_length] + + keypoints = keypoints[input_video_skip:] # todo remove + for idx in range(len(poses)): + poses[idx] = poses[idx][input_video_skip:] + + if fps is None: + fps = get_fps(input_video_path) + + if downsample > 1: + keypoints = downsample_tensor(keypoints, downsample) + all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') + for idx in range(len(poses)): + poses[idx] = downsample_tensor(poses[idx], downsample) + fps /= downsample + + initialized = False + image = None + lines = [] + points = None + + if limit < 1: + limit = len(all_frames) + else: + limit = min(limit, len(all_frames)) + + parents = skeleton.parents() + index = [i for i in np.arange(17)] + + def update_video(i): + nonlocal initialized, image, lines, points + + joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] + + if num_person == 2: + joints_right_2d_two = [] + joints_right_2d_two += joints_right_2d + joints_right_2d_second = [i + 17 for i in joints_right_2d] + joints_right_2d_two += joints_right_2d_second + + colors_2d = np.full(34, 'black') + colors_2d[joints_right_2d_two] = 'red' + else: + colors_2d = np.full(17, 'black') + colors_2d[joints_right_2d] = 'red' + + if not initialized: + image = ax_in.imshow(all_frames[i], aspect='equal') + + for j, j_parent in zip(index, parents): + if 
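A quick check of the downsample_tensor() behaviour used by the downsampling branch above: frames are averaged in blocks of `factor`, and any trailing remainder is dropped.

    import numpy as np

    X = np.arange(10, dtype=np.float32).reshape(10, 1)   # 10 dummy frames
    print(downsample_tensor(X, 4).ravel())               # [1.5 5.5]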
j_parent == -1: + continue + + if len(parents) == 17 and keypoints_metadata['layout_name'] != 'coco': + for m in range(num_person): + # Draw skeleton only if keypoints match (otherwise we don't have the parents definition) + lines.append(ax_in.plot([keypoints[i, m, j, 0], keypoints[i, m, j_parent, 0]], + [keypoints[i, m, j, 1], keypoints[i, m, j_parent, 1]], + color='pink')) + + # Apply different colors for each joint + col = h36m_color_edge(j) + + if com_reconstrcution: + for pose in poses: + pos = pose[i] + lines_3d[0].append(ax_3d[0].plot([pos[j, 0], pos[j_parent, 0]], + [pos[j, 1], pos[j_parent, 1]], + [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col, linewidth=3)) + else: + for n, ax in enumerate(ax_3d): + pos = poses[n][i] + lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], + [pos[j, 1], pos[j_parent, 1]], + [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col, linewidth=3)) + + points = ax_in.scatter(*keypoints[i].reshape(17*num_person, 2).T, 10, color=colors_2d, edgecolors='white', zorder=10) + initialized = True + else: + image.set_data(all_frames[i]) + + for j, j_parent in zip(index, parents): + if j_parent == -1: + continue + + if len(parents) == 17 and keypoints_metadata['layout_name'] != 'coco': + for m in range(num_person): + lines[j + 16*m - 1][0].set_data([keypoints[i, m, j, 0], keypoints[i, m, j_parent, 0]], + [keypoints[i, m, j, 1], keypoints[i, m, j_parent, 1]]) + + if com_reconstrcution: + for k, pose in enumerate(poses): + pos = pose[i] + lines_3d[0][j + k*16 - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) + lines_3d[0][j + k*16 - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) + lines_3d[0][j + k*16 - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') + else: + for n, ax in enumerate(ax_3d): + pos = poses[n][i] + lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) + lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) + lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') + + points.set_offsets(keypoints[i].reshape(17*num_person, 2)) + + print('{}/{} '.format(i, limit), end='\r') + + fig.tight_layout() + + anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) + if output.endswith('.mp4'): + Writer = writers['ffmpeg'] + writer = Writer(fps=fps, metadata={}, bitrate=bitrate) + anim.save(output, writer=writer) + elif output.endswith('.gif'): + anim.save(output, dpi=80, writer='imagemagick') + else: + raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') + plt.close() diff --git a/VideoToNPZ/tools/vis_kpts.py b/VideoToNPZ/tools/vis_kpts.py new file mode 100644 index 0000000000000000000000000000000000000000..92dfe0018566e44c3976c42c77b350edb9e59372 --- /dev/null +++ b/VideoToNPZ/tools/vis_kpts.py @@ -0,0 +1,44 @@ +import numpy as np +import cv2 + + +joint_pairs = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], + [0, 7], [7, 8], [8, 9], [9, 10], [8, 11], [11, 12], + [12, 13], [8, 14], [14, 15], [15, 16]] + +colors_kps = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], + [50, 205, 50], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], + [170, 0, 255], [255, 0, 255]] + + +def write(x, img): + # c1 = tuple(x[1:3].int()) + # c2 = tuple(x[3:5].int()) + c1 = (int(x[0]), int(x[1])) + c2 = (int(x[2]), int(x[3])) + + cls = int(x[-1]) + color = [0, 97, 255] + label = 'People {}'.format(x[-1]) + cv2.rectangle(img, c1, c2, color, 1) + t_size = 
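A hypothetical driver for render_animation() as defined above; the shapes follow the indexing in update_video() (keypoints: (T, M, 17, 2), each entry of poses: (T, 17, 3)). The skeleton stub and the symmetry lists are illustrative stand-ins rather than the project's real objects, and writing demo.mp4 requires ffmpeg on the PATH.

    import numpy as np

    class SkeletonStub:
        def parents(self):
            # 17-joint H3.6M-style parent table (-1 marks the root)
            return np.array([-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15])

    T, M = 30, 1
    keypoints = np.random.rand(T, M, 17, 2).astype(np.float32) * 200
    poses = {'Reconstruction': np.random.rand(T, 17, 3).astype(np.float32)}
    metadata = {'layout_name': 'h36m',
                'keypoints_symmetry': ([4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16])}

    render_animation(keypoints, metadata, poses, SkeletonStub(), fps=30, bitrate=3000,
                     azim=70, output='demo.mp4', viewport=(1280, 720))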
cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 + cv2.rectangle(img, c1, c2, color, -1) + cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1); + return img + + +def plot_keypoint(image, coordinates): + # USE cv2 + for i in range(coordinates.shape[0]): + pts = coordinates[i] + for color_i, jp in zip(colors_kps, joint_pairs): + pt0 = pts[jp, 0] + pt1 = pts[jp, 1] + pt0_0, pt0_1, pt1_0, pt1_1 = int(pt0[0]), int(pt0[1]), int(pt1[0]), int(pt1[1]) + + cv2.line(image, (pt0_0, pt1_0), (pt0_1, pt1_1), color_i, 5) + # cv2.circle(image,(pt0_0, pt0_1), 2, color_i, thickness=-1) + # cv2.circle(image,(pt1_0, pt1_1), 2, color_i, thickness=-1) + return image + diff --git a/convertNPZtoBVH/conver_bvh.py b/convertNPZtoBVH/conver_bvh.py new file mode 100644 index 0000000000000000000000000000000000000000..27a5a1ffb803512636af65d9a5ef48416589a6cd --- /dev/null +++ b/convertNPZtoBVH/conver_bvh.py @@ -0,0 +1,185 @@ +import os +import numpy as np +from scipy.spatial.transform import Rotation +from collections import deque +from tqdm import tqdm + +print(f"Saving 3D Motion") + + +def parse_obj(filename): + vertices = [] + lines = [] + try: + with open(filename, 'r') as f: + for line in f: + if line.startswith('v '): + parts = line.split() + vertices.append([float(parts[1]), float(parts[2]), float(parts[3])]) + elif line.startswith('l '): + parts = line.split() + lines.append([int(parts[1]) - 1, int(parts[2]) - 1]) + return np.array(vertices), lines + except Exception as e: + raise ValueError(f"Error parsing OBJ file {filename}: {str(e)}") + + +def build_hierarchy(lines, root=0): + num_joints = max(max(line) for line in lines) + 1 + adj = [[] for _ in range(num_joints)] + for a, b in lines: + adj[a].append(b) + adj[b].append(a) + parent = [-1] * num_joints + queue = deque([root]) + visited = [False] * num_joints + visited[root] = True + while queue: + p = queue.popleft() + for c in adj[p]: + if not visited[c]: + parent[c] = p + queue.append(c) + visited[c] = True + if not all(visited): + raise ValueError("The skeleton has disconnected components.") + children = [[] for _ in range(num_joints)] + for c in range(num_joints): + if parent[c] != -1: + children[parent[c]].append(c) + return parent, children + + +def compute_offsets(vertices_ref, parent): + num_joints = len(vertices_ref) + offsets = np.zeros((num_joints, 3)) + for j in range(num_joints): + if parent[j] != -1: + offsets[j] = vertices_ref[j] - vertices_ref[parent[j]] + return offsets + + +def compute_R_world(joint, vertices_ref, vertices_cur, children): + if not children[joint]: + return np.eye(3) + elif len(children[joint]) == 1: + c = children[joint][0] + V_ref = vertices_ref[c] - vertices_ref[joint] + V_cur = vertices_cur[c] - vertices_cur[joint] + norm_ref = np.linalg.norm(V_ref) + norm_cur = np.linalg.norm(V_cur) + if norm_ref < 1e-6 or norm_cur < 1e-6: + return np.eye(3) + V_ref_norm = V_ref / norm_ref + V_cur_norm = V_cur / norm_cur + cos_theta = np.clip(np.dot(V_ref_norm, V_cur_norm), -1.0, 1.0) + if cos_theta > 0.99999: + return np.eye(3) + axis = np.cross(V_ref_norm, V_cur_norm) + axis_norm = np.linalg.norm(axis) + if axis_norm < 1e-6: + return np.eye(3) + axis = axis / axis_norm + angle = np.arccos(cos_theta) + R = Rotation.from_rotvec(axis * angle).as_matrix() + return R + else: + A = np.column_stack([vertices_ref[c] - vertices_ref[joint] for c in children[joint]]) + B = np.column_stack([vertices_cur[c] - vertices_cur[joint] for c in 
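A tiny sanity check for build_hierarchy() above, assuming the function is available from this script: a chain 0-1-2-3 with a branch at joint 1, given as the same 0-based edge list that parse_obj() returns.

    lines = [[0, 1], [1, 2], [2, 3], [1, 4]]
    parent, children = build_hierarchy(lines, root=0)
    print(parent)     # [-1, 0, 1, 2, 1]
    print(children)   # [[1], [2, 4], [3], [], []]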
children[joint]]) + M = B @ A.T + U, _, Vh = np.linalg.svd(M) + R = U @ Vh + if np.linalg.det(R) < 0: + Vh[-1, :] *= -1 + R = U @ Vh + return R + + +def main(): + output_dir = os.path.abspath('../outputs/') + os.makedirs(output_dir, exist_ok=True) + folder = os.path.join(output_dir, 'obj_sequence') + + try: + obj_files = sorted([f for f in os.listdir(folder) if f.endswith('.obj')]) + except Exception as e: + print(f"Error accessing folder {folder}: {e}") + return + + if not obj_files: + print("No OBJ files found.") + return + + try: + vertices_ref, lines = parse_obj(os.path.join(folder, obj_files[0])) + num_joints = len(vertices_ref) + parent, children = build_hierarchy(lines) + offsets = compute_offsets(vertices_ref, parent) + root = 0 + + hierarchy_order = [] + + def dfs(joint): + hierarchy_order.append(joint) + for child in children[joint]: + dfs(child) + + dfs(root) + + motion_data = [] + for obj_file in tqdm(obj_files): + vertices_cur = parse_obj(os.path.join(folder, obj_file))[0] + R_world = [compute_R_world(j, vertices_ref, vertices_cur, children) for j in range(num_joints)] + R_local = [R_world[j] if parent[j] == -1 else R_world[parent[j]].T @ R_world[j] for j in range(num_joints)] + euler_angles = [Rotation.from_matrix(R).as_euler('ZYX', degrees=True) for R in R_local] + root_pos = vertices_cur[root] + motion_line = list(root_pos) + list(euler_angles[root]) + for j in hierarchy_order[1:]: + motion_line.extend(euler_angles[j]) + motion_data.append(motion_line) + + # Note: Smoothing function has been removed + # Note: Elbow constraints have been removed + + bvh_dir = os.path.join(output_dir, 'bvh') + os.makedirs(bvh_dir, exist_ok=True) + bvh_file = os.path.join(bvh_dir, 'output.bvh') + + with open(bvh_file, 'w') as f: + f.write("HIERARCHY\n") + + def write_hierarchy(joint, parent, f, indent=0): + if parent == -1: + f.write("ROOT Joint{}\n".format(joint)) + else: + f.write(" " * indent + "JOINT Joint{}\n".format(joint)) + f.write(" " * indent + "{\n") + f.write(" " * (indent + 1) + "OFFSET {:.6f} {:.6f} {:.6f}\n".format(*offsets[joint])) + if parent == -1: + f.write(" " * ( + indent + 1) + "CHANNELS 6 Xposition Yposition Zposition Zrotation Yrotation Xrotation\n") + else: + f.write(" " * (indent + 1) + "CHANNELS 3 Zrotation Yrotation Xrotation\n") + for child in children[joint]: + write_hierarchy(child, joint, f, indent + 1) + if not children[joint]: + f.write(" " * (indent + 1) + "End Site\n") + f.write(" " * (indent + 1) + "{\n") + f.write(" " * (indent + 2) + "OFFSET 0.000000 0.000000 0.000000\n") + f.write(" " * (indent + 1) + "}\n") + f.write(" " * indent + "}\n") + + write_hierarchy(root, -1, f) + + f.write("MOTION\n") + f.write("Frames: {}\n".format(len(motion_data))) + f.write("Frame Time: 0.033333\n") + for motion_line in motion_data: + f.write(" ".join("{:.6f}".format(x) for x in motion_line) + "\n") + + except Exception as e: + print(f"Error during processing: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/convertNPZtoBVH/conver_obj.py b/convertNPZtoBVH/conver_obj.py new file mode 100644 index 0000000000000000000000000000000000000000..857f091d2e50cfd7c1f3dec2b536aa4fe00b066a --- /dev/null +++ b/convertNPZtoBVH/conver_obj.py @@ -0,0 +1,141 @@ +import numpy as np +import os +from datetime import datetime + +def define_human_connections(): + """ + Define connections for human stick figure with support for various poses + including crossed legs and complex movements + """ + return [ + # Core body structure + [0, 7], # Base spine to 
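A quick numerical check of compute_R_world() above (the SVD/Kabsch branch for a joint with two children): rotating a reference pose by a known rotation should be recovered exactly. The toy joint positions are placeholders.

    import numpy as np
    from scipy.spatial.transform import Rotation

    R_true = Rotation.from_euler('ZYX', [30, 10, -20], degrees=True).as_matrix()
    vertices_ref = np.array([[0., 0., 0.], [1., 0., 0.], [0., 2., 0.]])
    vertices_cur = vertices_ref @ R_true.T          # rotate every joint by R_true
    children = [[1, 2], [], []]                     # joint 0 has two children
    R_est = compute_R_world(0, vertices_ref, vertices_cur, children)
    print(np.allclose(R_est, R_true, atol=1e-6))    # True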
upper spine + [7, 8], # Upper spine to neck + [8, 9], # Neck to head base + [9, 10], # Head extension + + # Arms (with complete chains) + # Left arm + [7, 14], # Spine to left shoulder + [14, 15], # Left upper arm + [15, 16], # Left forearm/hand + + # Right arm + [7, 11], # Spine to right shoulder + [11, 12], # Right upper arm + [12, 13], # Right forearm/hand + + # Legs with crossed support + # Left leg (now crossing to right) + [0, 1], # Hip to left thigh + [1, 2], # Left thigh to knee + [2, 3], # Left knee to foot + + # Right leg + [0, 4], # Hip to right thigh + [4, 5], # Right thigh to knee + [5, 6], # Right knee to foot + + # Structural connections + [14, 11], # Shoulder cross support + [1, 4], # Hip cross support + ] + +def npz_to_obj_sequence(npz_path, output_dir): + """ + Convert NPZ motion capture data to OBJ sequence + with enhanced support for various poses and movements + """ + os.makedirs(output_dir, exist_ok=True) + data = np.load(npz_path) + reconstruction = data['reconstruction'][0] + + num_frames = reconstruction.shape[0] + connections = define_human_connections() + + # Increased scale for better visibility + scale = 150.0 # Adjusted scale factor + + for frame_idx in range(num_frames): + vertices = reconstruction[frame_idx] + output_path = os.path.join(output_dir, f"frame_{frame_idx:04d}.obj") + + with open(output_path, 'w') as f: + # Write vertices with enhanced precision + for v in vertices: + # Coordinate system transformation with improved scaling + x, y, z = v[0] * scale, v[2] * scale, v[1] * scale + f.write(f"v {x:.8f} {y:.8f} {z:.8f}\n") + + # Write connections + for conn in connections: + f.write(f"l {conn[0] + 1} {conn[1] + 1}\n") + + +def analyze_vertex_data(npz_path): + """ + Enhanced analysis function to help understand the motion data + and verify correct vertex positions + """ + data = np.load(npz_path) + reconstruction = data['reconstruction'][0] + + + # Calculate full range of motion + x_min, x_max = reconstruction[:,:,0].min(), reconstruction[:,:,0].max() + y_min, y_max = reconstruction[:,:,1].min(), reconstruction[:,:,1].max() + z_min, z_max = reconstruction[:,:,2].min(), reconstruction[:,:,2].max() + + +def process_motion_capture(npz_file): + try: + # Verify input file exists + if not os.path.exists(npz_file): + raise FileNotFoundError(f"Input file {npz_file} not found") + + # Define base output directory + base_output_dir = os.path.abspath('../outputs/') + # print(output_dir) + os.makedirs(base_output_dir, exist_ok=True) + # base_output_dir = r"C:\Users\ROGST\Programming\Python\videotobvh\convertNPZtoBVH\outputs" + + # Create a unique output directory with timestamp to avoid overwriting + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = os.path.join(base_output_dir, f"obj_sequence") + + # Analyze data first + analyze_vertex_data(npz_file) + + # Convert to OBJ sequence + npz_to_obj_sequence(npz_path=npz_file, output_dir=output_dir) + + + except Exception as e: + print(f"Error processing motion capture data: {str(e)}") + raise + +def get_npz_paths(folder_path): + if not os.path.isdir(folder_path): + raise FileNotFoundError(f"Directory not found: {folder_path}") + + # Find the first .npz file in the directory + for file in os.listdir(folder_path): + if file.endswith('.npz'): + npz_path = os.path.join(folder_path, file) + return npz_path + + # If no .npz file is found + raise FileNotFoundError(f"No NPZ files found in directory: {folder_path}") + +if __name__ == "__main__": + # Define the directory where the NPZ file is located + output_dir 
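A minimal round-trip sketch for npz_to_obj_sequence() above, using a fake reconstruction array (batch of 1, 2 frames, 17 joints, xyz); the temporary paths are placeholders.

    import os, tempfile
    import numpy as np

    tmp = tempfile.mkdtemp()
    npz_path = os.path.join(tmp, 'demo.npz')
    np.savez(npz_path, reconstruction=np.random.rand(1, 2, 17, 3).astype(np.float32))
    npz_to_obj_sequence(npz_path, os.path.join(tmp, 'obj_sequence'))
    # Each frame_XXXX.obj now holds 17 'v x y z' vertices plus 'l i j' edges
    # (1-based indices), which conver_bvh.py later parses back into a skeleton.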
= os.path.abspath('../outputs/npz/') + os.makedirs(output_dir, exist_ok=True) + input_dir = output_dir + + try: + # Get the first available NPZ file from the directory + npz_file = get_npz_paths(input_dir) + process_motion_capture(npz_file) + except FileNotFoundError as e: + print(f"Error: {str(e)}") \ No newline at end of file diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..35b22b5837a6aca66811152e6a89a5c9873bcef1 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,84 @@ +import subprocess +import sys +import os +import argparse +import time +from datetime import datetime +import signal + +def signal_handler(sig, frame): + print("\nInterrupted by user, shutting down...") + if 'pool' in locals() and pool is not None: + pool.terminate() + pool.join() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Run the complete video-to-BVH pipeline") + # parser.add_argument('-v', '--video', required=True, help="Path to the input video file") + return parser.parse_args() + +def run_command(command, description): + """Run a command and show its output.""" + try: + start_time = time.time() + script_dir = os.path.dirname(command[1]) + current_dir = os.getcwd() + os.chdir(script_dir) + + subprocess.run(command, check=True) + + os.chdir(current_dir) + + end_time = time.time() + execution_time = end_time - start_time + return True + + except subprocess.CalledProcessError as e: + os.chdir(current_dir) + return False + + except Exception as e: + if 'current_dir' in locals(): + os.chdir(current_dir) + return False + +def main(): + args = parse_arguments() + + base_dir = os.path.dirname(os.path.abspath(__file__)) + gen_skes_path = os.path.join(base_dir, "VideoToNPZ", "gen_skes.py") + convert_obj_path = os.path.join(base_dir, "convertNPZtoBVH", "conver_obj.py") + convert_bvh_path = os.path.join(base_dir, "convertNPZtoBVH", "conver_bvh.py") + + for script_path in [gen_skes_path, convert_obj_path, convert_bvh_path]: + if not os.path.exists(script_path): + return 1 + + pipeline_steps = [ + { + "command": [sys.executable, gen_skes_path], + }, + { + "command": [sys.executable, convert_obj_path], + }, + { + "command": [sys.executable, convert_bvh_path], + } + ] + + successful = 0 + failed = 0 + + for step in pipeline_steps: + if run_command(step["command"], ""): + successful += 1 + else: + failed += 1 + + return 0 if failed == 0 else 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0e0d8af6019e5b8408f7fc8918e17e31871a149 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,112 @@ +altair==5.4.1 +asttokens==3.0.0 +attrs==25.1.0 +backcall==0.2.0 +beautifulsoup4==4.13.3 +bleach==6.1.0 +blinker==1.8.2 +cachetools==5.5.2 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +contourpy==1.1.1 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 +docopt==0.6.2 +executing==2.2.0 +fastjsonschema==2.21.1 +filelock==3.16.1 +filterpy==1.4.5 +fonttools==4.56.0 +fsspec==2025.2.0 +gitdb==4.0.12 +GitPython==3.1.44 +h5py==3.11.0 +idna==3.10 +imageio==2.35.1 +importlib_metadata==8.5.0 +importlib_resources==6.4.5 +ipython==8.12.3 +jedi==0.19.2 +Jinja2==3.1.5 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyterlab_pygments==0.3.0 +kiwisolver==1.4.7 +lazy_loader==0.4 
+llvmlite==0.41.1 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.7.5 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.2 +mpmath==1.3.0 +narwhals==1.29.0 +nbclient==0.10.1 +nbconvert==7.16.6 +nbformat==5.10.4 +networkx==3.1 +numba==0.58.1 +numpy==1.24.4 +opencv-python==4.11.0.86 +packaging==24.2 +pandas==2.0.3 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==10.4.0 +pip-check==2.9 +pipreqs==0.5.0 +pkgutil_resolve_name==1.3.10 +platformdirs==4.3.6 +prompt_toolkit==3.0.50 +protobuf==5.29.3 +psutil==7.0.0 +pure_eval==0.2.3 +pyarrow==17.0.0 +pydeck==0.9.1 +Pygments==2.19.1 +pyparsing==3.1.4 +python-dateutil==2.9.0.post0 +pytz==2025.1 +PyWavelets==1.4.1 +pywin32==308 +PyYAML==6.0.2 +pyzmq==26.2.1 +referencing==0.35.1 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.20.1 +scikit-image==0.21.0 +scipy==1.10.1 +six==1.17.0 +smmap==5.0.2 +soupsieve==2.6 +stack-data==0.6.3 +streamlit==1.40.1 +sympy==1.13.3 +tenacity==9.0.0 +terminaltables==3.1.10 +tifffile==2023.7.10 +tinycss2==1.2.1 +toml==0.10.2 +torch==2.4.1 +torchsummary==1.5.1 +torchvision==0.19.1 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing_extensions==4.12.2 +tzdata==2025.1 +urllib3==2.2.3 +watchdog==4.0.2 +wcwidth==0.2.13 +webencodings==0.5.1 +yacs==0.1.8 +yarg==0.1.9 +zipp==3.20.2
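For reference, pipeline.py above simply runs the three stages in order (gen_skes.py, then conver_obj.py, then conver_bvh.py), switching into each script's directory so that their relative '../outputs/' paths resolve; the final skeleton is written to outputs/bvh/output.bvh. From the repository root this is just:

    python pipeline.py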