|
|
|
|
|
import numpy as np |
|
from typing import Dict, List, Optional |
|
import fvcore.nn.weight_init as weight_init |
|
import torch |
|
import torch.nn as nn |
|
from torch.nn import functional as F |
|
|
|
from detectron2.layers import Conv2d, ShapeSpec, get_norm |
|
from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads |
|
from detectron2.modeling.poolers import ROIPooler |
|
from detectron2.modeling.roi_heads import select_foreground_proposals |
|
from detectron2.structures import ImageList, Instances |
|
|
|
from .. import ( |
|
build_densepose_data_filter, |
|
build_densepose_embedder, |
|
build_densepose_head, |
|
build_densepose_losses, |
|
build_densepose_predictor, |
|
densepose_inference, |
|
) |
|
|
|
|
|
class Decoder(nn.Module):
    """
    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
    all levels of the FPN into single output.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
        """
        Args:
            cfg: project config node; reads the MODEL.ROI_DENSEPOSE_HEAD.DECODER_*
                options (num classes, conv dims, common stride, norm type).
            input_shape: mapping from feature-map name to its ShapeSpec;
                only the `stride` and `channels` fields are used here.
            in_features: names of the FPN levels to merge. NOTE(review): the
                pooling setup in DensePoseROIHeads assumes the first entry is
                the highest-resolution level — confirm against the caller.
        """
        super(Decoder, self).__init__()

        self.in_features = in_features
        feature_strides = {k: v.stride for k, v in input_shape.items()}
        feature_channels = {k: v.channels for k, v in input_shape.items()}
        num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
        conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
        self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
        norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM

        # One "scale head" per input level: a stack of 3x3 conv(+norm+ReLU)
        # stages, each followed by a 2x bilinear upsample when needed, that
        # brings the level from its native stride to `common_stride` so that
        # all levels can be summed element-wise in forward().
        self.scale_heads = []
        for in_feature in self.in_features:
            head_ops = []
            # Number of halvings of the stride required to reach the common
            # stride; at least one conv stage even when the strides already match.
            head_length = max(
                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
            )
            for k in range(head_length):
                conv = Conv2d(
                    feature_channels[in_feature] if k == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,  # bias is redundant when a norm layer follows
                    norm=get_norm(norm, conv_dims),
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if feature_strides[in_feature] != self.common_stride:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
            self.scale_heads.append(nn.Sequential(*head_ops))
            # Register each head under its feature name so state_dict keys are
            # stable and human-readable (e.g. "p2.0.weight").
            self.add_module(in_feature, self.scale_heads[-1])
        # Final 1x1 conv producing per-pixel class scores at common_stride.
        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    def forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        """
        Merge the per-level features into a single score map.

        Args:
            features: feature maps, ordered to match `self.in_features`.

        Returns:
            Tensor of per-pixel scores at `common_stride` resolution.
        """
        for i, _ in enumerate(self.in_features):
            if i == 0:
                x = self.scale_heads[i](features[i])
            else:
                # Every scale head outputs the same spatial size, so the
                # element-wise sum is well-defined.
                x = x + self.scale_heads[i](features[i])
        x = self.predictor(x)
        return x
|
|
|
|
|
@ROI_HEADS_REGISTRY.register()
class DensePoseROIHeads(StandardROIHeads):
    """
    A Standard ROIHeads which contains an addition of DensePose head.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)
        self._init_densepose_head(cfg, input_shape)

    def _init_densepose_head(self, cfg, input_shape):
        """
        Build the DensePose branch: data filter, optional decoder, ROI pooler,
        head, predictor, losses and embedder.

        No-op (and no attributes beyond `densepose_on` are created) when
        MODEL.DENSEPOSE_ON is False.
        """
        self.densepose_on = cfg.MODEL.DENSEPOSE_ON
        if not self.densepose_on:
            return
        self.densepose_data_filter = build_densepose_data_filter(cfg)
        dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
        dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
        dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
        self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON

        # With the decoder on, ROI features are pooled from its single merged
        # output, which lives at the stride of the first input feature;
        # otherwise each FPN level is pooled at its own scale.
        if self.use_decoder:
            dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
        else:
            dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
        # All in_features are assumed to share one channel count; use the first.
        in_channels = [input_shape[f].channels for f in self.in_features][0]

        if self.use_decoder:
            self.decoder = Decoder(cfg, input_shape, self.in_features)

        self.densepose_pooler = ROIPooler(
            output_size=dp_pooler_resolution,
            scales=dp_pooler_scales,
            sampling_ratio=dp_pooler_sampling_ratio,
            pooler_type=dp_pooler_type,
        )
        self.densepose_head = build_densepose_head(cfg, in_channels)
        self.densepose_predictor = build_densepose_predictor(
            cfg, self.densepose_head.n_out_channels
        )
        self.densepose_losses = build_densepose_losses(cfg)
        self.embedder = build_densepose_embedder(cfg)

    def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
        """
        Forward logic of the densepose prediction branch.

        Args:
            features (dict[str, Tensor]): input data as a mapping from feature
                map name to tensor. Axis 0 represents the number of images `N` in
                the input data; axes 1-3 are channels, height, and width, which may
                vary between feature maps (e.g., if a feature pyramid is used).
            instances (list[Instances]): length `N` list of `Instances`. The i-th
                `Instances` contains instances for the i-th input image,
                In training, they can be the proposals.
                In inference, they can be the predicted boxes.

        Returns:
            In training, a dict of losses (empty if DensePose is off or no
            proposal survived the DensePose data filter).
            In inference, update `instances` with new fields "densepose" and return it.
        """
        if not self.densepose_on:
            return {} if self.training else instances

        features_list = [features[f] for f in self.in_features]
        if self.training:
            proposals, _ = select_foreground_proposals(instances, self.num_classes)
            # Keep only proposals that carry DensePose ground truth; this can
            # leave an empty list for batches without DensePose annotations.
            features_list, proposals = self.densepose_data_filter(features_list, proposals)
            if len(proposals) > 0:
                proposal_boxes = [x.proposal_boxes for x in proposals]

                if self.use_decoder:
                    # Decoder merges all levels into one map; pool from it alone.
                    features_list = [self.decoder(features_list)]

                features_dp = self.densepose_pooler(features_list, proposal_boxes)
                densepose_head_outputs = self.densepose_head(features_dp)
                densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
                densepose_loss_dict = self.densepose_losses(
                    proposals, densepose_predictor_outputs, embedder=self.embedder
                )
                return densepose_loss_dict
            # FIX: previously this path fell through and implicitly returned
            # None, so the caller's `losses.update(...)` raised a TypeError
            # whenever the data filter left no DensePose-annotated proposals.
            # Return an empty loss dict instead (caller's update is a no-op).
            return {}
        else:
            pred_boxes = [x.pred_boxes for x in instances]

            if self.use_decoder:
                features_list = [self.decoder(features_list)]

            features_dp = self.densepose_pooler(features_list, pred_boxes)
            if len(features_dp) > 0:
                densepose_head_outputs = self.densepose_head(features_dp)
                densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
            else:
                # No detections: still call densepose_inference so that the
                # expected output fields are attached to (empty) instances.
                densepose_predictor_outputs = None

            densepose_inference(densepose_predictor_outputs, instances)
            return instances

    def forward(
        self,
        images: ImageList,
        features: Dict[str, torch.Tensor],
        proposals: List[Instances],
        targets: Optional[List[Instances]] = None,
    ):
        """
        Run the standard ROI heads, then (in training) add DensePose losses.

        See :meth:`StandardROIHeads.forward` for the argument/return contract.
        At inference time the DensePose branch is executed later, via
        :meth:`forward_with_given_boxes`.
        """
        instances, losses = super().forward(images, features, proposals, targets)
        del targets, images  # no longer needed; free the references early

        if self.training:
            losses.update(self._forward_densepose(features, instances))
        return instances, losses

    def forward_with_given_boxes(
        self, features: Dict[str, torch.Tensor], instances: List[Instances]
    ):
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        This is useful for downstream tasks where a box is known, but need to obtain
        other attributes (outputs of other heads).
        Test-time augmentation also uses this.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.

        Returns:
            instances (list[Instances]):
                the same `Instances` objects, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """
        instances = super().forward_with_given_boxes(features, instances)
        instances = self._forward_densepose(features, instances)
        return instances
|
|