# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
                      constant_init, normal_init)

from mmpose.core.evaluation.top_down_eval import (
    keypoints_from_heatmaps3d, multilabel_classification_accuracy)
from mmpose.core.post_processing import flip_back
from mmpose.models.builder import build_loss
from mmpose.models.necks import GlobalAveragePooling
from ..builder import HEADS
class Heatmap3DHead(nn.Module): | |
"""Heatmap3DHead is a sub-module of Interhand3DHead, and outputs 3D | |
heatmaps. Heatmap3DHead is composed of (>=0) number of deconv layers and a | |
simple conv2d layer. | |
Args: | |
in_channels (int): Number of input channels | |
out_channels (int): Number of output channels | |
depth_size (int): Number of depth discretization size | |
num_deconv_layers (int): Number of deconv layers. | |
num_deconv_layers should >= 0. Note that 0 means no deconv layers. | |
num_deconv_filters (list|tuple): Number of filters. | |
num_deconv_kernels (list|tuple): Kernel sizes. | |
extra (dict): Configs for extra conv layers. Default: None | |
""" | |
def __init__(self, | |
in_channels, | |
out_channels, | |
depth_size=64, | |
num_deconv_layers=3, | |
num_deconv_filters=(256, 256, 256), | |
num_deconv_kernels=(4, 4, 4), | |
extra=None): | |
super().__init__() | |
assert out_channels % depth_size == 0 | |
self.depth_size = depth_size | |
self.in_channels = in_channels | |
if extra is not None and not isinstance(extra, dict): | |
raise TypeError('extra should be dict or None.') | |
if num_deconv_layers > 0: | |
self.deconv_layers = self._make_deconv_layer( | |
num_deconv_layers, | |
num_deconv_filters, | |
num_deconv_kernels, | |
) | |
elif num_deconv_layers == 0: | |
self.deconv_layers = nn.Identity() | |
else: | |
raise ValueError( | |
f'num_deconv_layers ({num_deconv_layers}) should >= 0.') | |
identity_final_layer = False | |
if extra is not None and 'final_conv_kernel' in extra: | |
assert extra['final_conv_kernel'] in [0, 1, 3] | |
if extra['final_conv_kernel'] == 3: | |
padding = 1 | |
elif extra['final_conv_kernel'] == 1: | |
padding = 0 | |
else: | |
# 0 for Identity mapping. | |
identity_final_layer = True | |
kernel_size = extra['final_conv_kernel'] | |
else: | |
kernel_size = 1 | |
padding = 0 | |
if identity_final_layer: | |
self.final_layer = nn.Identity() | |
else: | |
conv_channels = num_deconv_filters[ | |
-1] if num_deconv_layers > 0 else self.in_channels | |
layers = [] | |
if extra is not None: | |
num_conv_layers = extra.get('num_conv_layers', 0) | |
num_conv_kernels = extra.get('num_conv_kernels', | |
[1] * num_conv_layers) | |
for i in range(num_conv_layers): | |
layers.append( | |
build_conv_layer( | |
dict(type='Conv2d'), | |
in_channels=conv_channels, | |
out_channels=conv_channels, | |
kernel_size=num_conv_kernels[i], | |
stride=1, | |
padding=(num_conv_kernels[i] - 1) // 2)) | |
layers.append( | |
build_norm_layer(dict(type='BN'), conv_channels)[1]) | |
layers.append(nn.ReLU(inplace=True)) | |
layers.append( | |
build_conv_layer( | |
cfg=dict(type='Conv2d'), | |
in_channels=conv_channels, | |
out_channels=out_channels, | |
kernel_size=kernel_size, | |
stride=1, | |
padding=padding)) | |
if len(layers) > 1: | |
self.final_layer = nn.Sequential(*layers) | |
else: | |
self.final_layer = layers[0] | |
def _make_deconv_layer(self, num_layers, num_filters, num_kernels): | |
"""Make deconv layers.""" | |
if num_layers != len(num_filters): | |
error_msg = f'num_layers({num_layers}) ' \ | |
f'!= length of num_filters({len(num_filters)})' | |
raise ValueError(error_msg) | |
if num_layers != len(num_kernels): | |
error_msg = f'num_layers({num_layers}) ' \ | |
f'!= length of num_kernels({len(num_kernels)})' | |
raise ValueError(error_msg) | |
layers = [] | |
for i in range(num_layers): | |
kernel, padding, output_padding = \ | |
self._get_deconv_cfg(num_kernels[i]) | |
planes = num_filters[i] | |
layers.append( | |
build_upsample_layer( | |
dict(type='deconv'), | |
in_channels=self.in_channels, | |
out_channels=planes, | |
kernel_size=kernel, | |
stride=2, | |
padding=padding, | |
output_padding=output_padding, | |
bias=False)) | |
layers.append(nn.BatchNorm2d(planes)) | |
layers.append(nn.ReLU(inplace=True)) | |
self.in_channels = planes | |
return nn.Sequential(*layers) | |
def _get_deconv_cfg(deconv_kernel): | |
"""Get configurations for deconv layers.""" | |
if deconv_kernel == 4: | |
padding = 1 | |
output_padding = 0 | |
elif deconv_kernel == 3: | |
padding = 1 | |
output_padding = 1 | |
elif deconv_kernel == 2: | |
padding = 0 | |
output_padding = 0 | |
else: | |
raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') | |
return deconv_kernel, padding, output_padding | |
def forward(self, x): | |
"""Forward function.""" | |
x = self.deconv_layers(x) | |
x = self.final_layer(x) | |
N, C, H, W = x.shape | |
# reshape the 2D heatmap to 3D heatmap | |
x = x.reshape(N, C // self.depth_size, self.depth_size, H, W) | |
return x | |
def init_weights(self): | |
"""Initialize model weights.""" | |
for _, m in self.deconv_layers.named_modules(): | |
if isinstance(m, nn.ConvTranspose2d): | |
normal_init(m, std=0.001) | |
elif isinstance(m, nn.BatchNorm2d): | |
constant_init(m, 1) | |
for m in self.final_layer.modules(): | |
if isinstance(m, nn.Conv2d): | |
normal_init(m, std=0.001, bias=0) | |
elif isinstance(m, nn.BatchNorm2d): | |
constant_init(m, 1) | |
class Heatmap1DHead(nn.Module):
    """Heatmap1DHead is a sub-module of Interhand3DHead, and outputs 1D
    heatmaps.

    Args:
        in_channels (int): Number of input channels
        heatmap_size (int): Heatmap size
        hidden_dims (list|tuple): Number of feature dimension of FC layers.
    """

    def __init__(self, in_channels=2048, heatmap_size=64, hidden_dims=(512, )):
        super().__init__()

        self.in_channels = in_channels
        self.heatmap_size = heatmap_size

        # FC stack: in_channels -> hidden dims -> one logit per depth bin.
        dims = [in_channels, *hidden_dims, heatmap_size]
        self.fc = self._make_linear_layers(dims, relu_final=False)

    def soft_argmax_1d(self, heatmap1d):
        """Differentiable argmax: expected bin index under the softmax."""
        probs = F.softmax(heatmap1d, 1)
        bins = torch.arange(
            self.heatmap_size, dtype=probs.dtype, device=probs.device)
        return (probs * bins[None, :]).sum(dim=1)

    def _make_linear_layers(self, feat_dims, relu_final=False):
        """Make linear layers."""
        modules = []
        last = len(feat_dims) - 2
        for idx, (d_in, d_out) in enumerate(
                zip(feat_dims[:-1], feat_dims[1:])):
            modules.append(nn.Linear(d_in, d_out))
            # ReLU after every layer except the last (unless relu_final).
            if idx != last or relu_final:
                modules.append(nn.ReLU(inplace=True))
        return nn.Sequential(*modules)

    def forward(self, x):
        """Forward function."""
        return self.soft_argmax_1d(self.fc(x)).view(-1, 1)

    def init_weights(self):
        """Initialize model weights."""
        linears = (m for m in self.fc.modules() if isinstance(m, nn.Linear))
        for m in linears:
            normal_init(m, mean=0, std=0.01, bias=0)
class MultilabelClassificationHead(nn.Module):
    """MultilabelClassificationHead is a sub-module of Interhand3DHead, and
    outputs hand type classification.

    Args:
        in_channels (int): Number of input channels
        num_labels (int): Number of labels
        hidden_dims (list|tuple): Number of hidden dimension of FC layers.
    """

    def __init__(self, in_channels=2048, num_labels=2, hidden_dims=(512, )):
        super().__init__()

        self.in_channels = in_channels
        # BUGFIX: was misspelled as `self.num_labesl`.
        self.num_labels = num_labels

        # FC stack: in_channels -> hidden dims -> one logit per label.
        feature_dims = [in_channels, *hidden_dims, num_labels]
        self.fc = self._make_linear_layers(feature_dims, relu_final=False)

    def _make_linear_layers(self, feat_dims, relu_final=False):
        """Make linear layers.

        A ReLU follows every Linear except the last one, unless
        ``relu_final`` is True.
        """
        layers = []
        for i in range(len(feat_dims) - 1):
            layers.append(nn.Linear(feat_dims[i], feat_dims[i + 1]))
            if i < len(feat_dims) - 2 or \
                    (i == len(feat_dims) - 2 and relu_final):
                layers.append(nn.ReLU(inplace=True))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward function.

        Returns per-label probabilities in [0, 1] (independent sigmoids,
        not a softmax — labels are not mutually exclusive).
        """
        labels = torch.sigmoid(self.fc(x))
        return labels

    def init_weights(self):
        """Initialize model weights."""
        for m in self.fc.modules():
            if isinstance(m, nn.Linear):
                normal_init(m, mean=0, std=0.01, bias=0)
class Interhand3DHead(nn.Module):
    """Interhand 3D head of paper ref: Gyeongsik Moon. "InterHand2.6M: A
    Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single
    RGB Image".

    Combines three sub-heads on a shared backbone feature map:
    two ``Heatmap3DHead``s (right/left hand keypoints), a ``Heatmap1DHead``
    (relative root depth) and a ``MultilabelClassificationHead`` (hand type).

    Args:
        keypoint_head_cfg (dict): Configs of Heatmap3DHead for hand
            keypoint estimation.
        root_head_cfg (dict): Configs of Heatmap1DHead for relative
            hand root depth estimation.
        hand_type_head_cfg (dict): Configs of MultilabelClassificationHead
            for hand type classification.
        loss_keypoint (dict): Config for keypoint loss. Default: None.
        loss_root_depth (dict): Config for relative root depth loss.
            Default: None.
        loss_hand_type (dict): Config for hand type classification
            loss. Default: None.
    """

    def __init__(self,
                 keypoint_head_cfg,
                 root_head_cfg,
                 hand_type_head_cfg,
                 loss_keypoint=None,
                 loss_root_depth=None,
                 loss_hand_type=None,
                 train_cfg=None,
                 test_cfg=None):
        super().__init__()

        # build sub-module heads
        # Right and left hands get separate 3D-heatmap heads built from the
        # same config; their outputs are concatenated in forward().
        self.right_hand_head = Heatmap3DHead(**keypoint_head_cfg)
        self.left_hand_head = Heatmap3DHead(**keypoint_head_cfg)
        self.root_head = Heatmap1DHead(**root_head_cfg)
        self.hand_type_head = MultilabelClassificationHead(
            **hand_type_head_cfg)
        # Root-depth and hand-type heads consume globally-pooled features.
        self.neck = GlobalAveragePooling()

        # build losses
        self.keypoint_loss = build_loss(loss_keypoint)
        self.root_depth_loss = build_loss(loss_root_depth)
        self.hand_type_loss = build_loss(loss_hand_type)
        self.train_cfg = {} if train_cfg is None else train_cfg
        self.test_cfg = {} if test_cfg is None else test_cfg
        # Target type is forwarded to flip_back() at inference time.
        self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap')

    def init_weights(self):
        """Initialize weights of all sub-heads."""
        self.left_hand_head.init_weights()
        self.right_hand_head.init_weights()
        self.root_head.init_weights()
        self.hand_type_head.init_weights()

    def get_loss(self, output, target, target_weight):
        """Calculate loss for hand keypoint heatmaps, relative root depth and
        hand type.

        Args:
            output (list[Tensor]): a list of outputs from multiple heads.
            target (list[Tensor]): a list of targets for multiple heads.
            target_weight (list[Tensor]): a list of targets weight for
                multiple heads.
        """
        losses = dict()

        # hand keypoint loss (index 0: 3D heatmaps, target is 5-D N,K,D,H,W)
        assert not isinstance(self.keypoint_loss, nn.Sequential)
        out, tar, tar_weight = output[0], target[0], target_weight[0]
        assert tar.dim() == 5 and tar_weight.dim() == 3
        losses['hand_loss'] = self.keypoint_loss(out, tar, tar_weight)

        # relative root depth loss (index 1: scalar depth per sample)
        assert not isinstance(self.root_depth_loss, nn.Sequential)
        out, tar, tar_weight = output[1], target[1], target_weight[1]
        assert tar.dim() == 2 and tar_weight.dim() == 2
        losses['rel_root_loss'] = self.root_depth_loss(out, tar, tar_weight)

        # hand type loss (index 2: multilabel probabilities)
        assert not isinstance(self.hand_type_loss, nn.Sequential)
        out, tar, tar_weight = output[2], target[2], target_weight[2]
        assert tar.dim() == 2 and tar_weight.dim() in [1, 2]
        losses['hand_type_loss'] = self.hand_type_loss(out, tar, tar_weight)

        return losses

    def get_accuracy(self, output, target, target_weight):
        """Calculate accuracy for hand type.

        Only the hand-type head (index 2) contributes to this metric.

        Args:
            output (list[Tensor]): a list of outputs from multiple heads.
            target (list[Tensor]): a list of targets for multiple heads.
            target_weight (list[Tensor]): a list of targets weight for
                multiple heads.
        """
        accuracy = dict()
        avg_acc = multilabel_classification_accuracy(
            output[2].detach().cpu().numpy(),
            target[2].detach().cpu().numpy(),
            target_weight[2].detach().cpu().numpy(),
        )
        accuracy['acc_classification'] = float(avg_acc)
        return accuracy

    def forward(self, x):
        """Forward function.

        Returns a 3-element list: [3D heatmaps (right+left concatenated on
        the keypoint dim), relative root depth, hand-type probabilities].
        """
        outputs = []
        outputs.append(
            torch.cat([self.right_hand_head(x),
                       self.left_hand_head(x)], dim=1))
        # Pool spatial dims before the FC-based heads.
        x = self.neck(x)
        outputs.append(self.root_head(x))
        outputs.append(self.hand_type_head(x))
        return outputs

    def inference_model(self, x, flip_pairs=None):
        """Inference function.

        Returns:
            output (list[np.ndarray]): list of output hand keypoint
            heatmaps, relative root depth and hand type.

        Args:
            x (torch.Tensor[N,K,H,W]): Input features.
            flip_pairs (None | list[tuple()):
                Pairs of keypoints which are mirrored.
        """
        output = self.forward(x)

        if flip_pairs is not None:
            # flip 3D heatmap
            heatmap_3d = output[0]
            N, K, D, H, W = heatmap_3d.shape
            # reshape 3D heatmap to 2D heatmap so flip_back (a 2D op)
            # can mirror it; depth bins ride along in the channel dim
            heatmap_3d = heatmap_3d.reshape(N, K * D, H, W)
            # 2D heatmap flip
            heatmap_3d_flipped_back = flip_back(
                heatmap_3d.detach().cpu().numpy(),
                flip_pairs,
                target_type=self.target_type)
            # reshape back to 3D heatmap
            heatmap_3d_flipped_back = heatmap_3d_flipped_back.reshape(
                N, K, D, H, W)

            # feature is not aligned, shift flipped heatmap for higher accuracy
            if self.test_cfg.get('shift_heatmap', False):
                heatmap_3d_flipped_back[...,
                                        1:] = heatmap_3d_flipped_back[..., :-1]
            output[0] = heatmap_3d_flipped_back

            # flip relative hand root depth (sign flips under mirroring)
            output[1] = -output[1].detach().cpu().numpy()

            # flip hand type: left/right labels swap under mirroring
            hand_type = output[2].detach().cpu().numpy()
            hand_type_flipped_back = hand_type.copy()
            hand_type_flipped_back[:, 0] = hand_type[:, 1]
            hand_type_flipped_back[:, 1] = hand_type[:, 0]
            output[2] = hand_type_flipped_back

        else:
            output = [out.detach().cpu().numpy() for out in output]

        return output

    def decode(self, img_metas, output, **kwargs):
        """Decode hand keypoint, relative root depth and hand type.

        Args:
            img_metas (list(dict)): Information about data augmentation
                By default this includes:

                - "image_file: path to the image file
                - "center": center of the bbox
                - "scale": scale of the bbox
                - "rotation": rotation of the bbox
                - "bbox_score": score of bbox
                - "heatmap3d_depth_bound": depth bound of hand keypoint
                    3D heatmap
                - "root_depth_bound": depth bound of relative root depth
                    1D heatmap
            output (list[np.ndarray]): model predicted 3D heatmaps, relative
                root depth and hand type.
        """
        batch_size = len(img_metas)
        result = {}

        heatmap3d_depth_bound = np.ones(batch_size, dtype=np.float32)
        root_depth_bound = np.ones(batch_size, dtype=np.float32)
        center = np.zeros((batch_size, 2), dtype=np.float32)
        scale = np.zeros((batch_size, 2), dtype=np.float32)
        image_paths = []
        score = np.ones(batch_size, dtype=np.float32)
        if 'bbox_id' in img_metas[0]:
            bbox_ids = []
        else:
            bbox_ids = None

        # Gather per-sample meta info into batched arrays.
        for i in range(batch_size):
            heatmap3d_depth_bound[i] = img_metas[i]['heatmap3d_depth_bound']
            root_depth_bound[i] = img_metas[i]['root_depth_bound']
            center[i, :] = img_metas[i]['center']
            scale[i, :] = img_metas[i]['scale']
            image_paths.append(img_metas[i]['image_file'])

            if 'bbox_score' in img_metas[i]:
                score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1)
            if bbox_ids is not None:
                bbox_ids.append(img_metas[i]['bbox_id'])

        all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
        all_boxes[:, 0:2] = center[:, 0:2]
        all_boxes[:, 2:4] = scale[:, 0:2]
        # scale is defined as: bbox_size / 200.0, so we
        # need multiply 200.0 to get bbox size
        all_boxes[:, 4] = np.prod(scale * 200.0, axis=1)
        all_boxes[:, 5] = score
        result['boxes'] = all_boxes
        result['image_paths'] = image_paths
        result['bbox_ids'] = bbox_ids

        # decode 3D heatmaps of hand keypoints
        heatmap3d = output[0]
        preds, maxvals = keypoints_from_heatmaps3d(heatmap3d, center, scale)
        keypoints_3d = np.zeros((batch_size, preds.shape[1], 4),
                                dtype=np.float32)
        keypoints_3d[:, :, 0:3] = preds[:, :, 0:3]
        keypoints_3d[:, :, 3:4] = maxvals
        # transform keypoint depth to camera space: map bin index from
        # [0, depth_size) to [-0.5, 0.5) and scale by the depth bound
        keypoints_3d[:, :, 2] = \
            (keypoints_3d[:, :, 2] / self.right_hand_head.depth_size - 0.5) \
            * heatmap3d_depth_bound[:, np.newaxis]

        result['preds'] = keypoints_3d

        # decode relative hand root depth
        # transform relative root depth to camera space
        # NOTE(review): output[1] has shape (N, 1) and root_depth_bound has
        # shape (N,); this broadcast yields (N, N) rather than (N, 1) for
        # N > 1 — confirm intended shape against callers.
        result['rel_root_depth'] = (output[1] / self.root_head.heatmap_size -
                                    0.5) * root_depth_bound

        # decode hand type: threshold sigmoid probabilities at 0.5
        result['hand_type'] = output[2] > 0.5
        return result