# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class PointRend(nn.Layer):
    """
    The SemanticFPN-PointRend implementation based on PaddlePaddle.

    The original article refers to
    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
    (https://arxiv.org/abs/1912.08193).

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supporting ResNet50/101.
        backbone_indices (tuple): Four values in the tuple indicate the indices of outputs of the backbone.
        fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for lateral_conv construction in FPN. Default: [256, 512, 1024, 2048].
        fpn_outplanes (int, optional): The output channels in FPN. Default: 256.
        point_num_fcs (int, optional): Number of fc layers in the head in PointHead. Default: 3.
        point_in_channels (list, optional): Input channels of the fc block in PointHead. Default: [256].
        point_out_channels (int, optional): Fc block's output channels in PointHead. Default: 256.
        point_in_index (list, optional): The indices of input features to use in PointHead. Default: [0].
        point_num_points (int, optional): The number of points sampled in training mode in PointHead. Default: 2048.
        point_oversample_ratio (int, optional): The oversample ratio of points in training mode in PointHead,
            where sampled_point = num_points * oversample_ratio. Default: 3.
        point_importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points in PointHead. Default: 0.75.
        point_scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage at inference in PointHead. Default: 2.
        point_subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage at inference in PointHead. Default: 2.
        point_subdivision_num_points (int, optional): The number of points used to refine seg logits at inference in PointHead. Default: 8196.
        point_dropout_ratio (float, optional): If dropout_ratio > 0, a Dropout layer with p=dropout_ratio is applied before the output in PointHead. Default: 0.1.
        point_coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
            the output of each fc layer in PointHead. Default: True.
        point_conv_cfg (str): The config of Conv in PointHead. Default: 'Conv1D'.
        point_input_transform (str): The feature transform method for inputs in PointHead;
            see function '_transform_inputs'. Default: 'multiple_select'.
        PFN_feature_strides (list): The strides for input feature maps in FPNHead; all strides are supposed to be powers of 2, and the first
            one is of largest resolution. Default: [4, 8, 16, 32].
        PFN_in_channels (list): The input features' channels list in FPNHead. Default: [256, 256, 256, 256].
        PFN_channels (int, optional): The output channels of each scale_head's Conv before the Upsample block in FPNHead. Default: 128.
        PFN_in_index (list): The indices of input features to use; its length should match that of in_channels in FPNHead. Default: [0, 1, 2, 3].
        PFN_dropout_ratio (float, optional): If dropout_ratio > 0, a Dropout layer with p=dropout_ratio is applied before the output in FPNHead. Default: 0.1.
        PFN_conv_cfg (str): The config of Conv. Default: 'Conv2D'.
        PFN_input_transform (str): The feature transform method for inputs; see function '_transform_inputs' in FPNHead. Default: 'multiple_select'.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str, optional): The path or url of the pretrained model. Default: None.
    """

    def __init__(self,
                 num_classes,
                 backbone,
                 backbone_indices,
                 fpn_inplanes=[256, 512, 1024, 2048],
                 fpn_outplanes=256,
                 point_in_channels=[256],
                 point_out_channels=256,
                 point_in_index=[0],
                 point_num_fcs=3,
                 point_num_points=2048,
                 point_oversample_ratio=3,
                 point_importance_sample_ratio=0.75,
                 point_scale_factor=2,
                 point_subdivision_steps=2,
                 point_subdivision_num_points=8196,
                 point_dropout_ratio=0,
                 point_coarse_pred_each_layer=True,
                 point_input_transform='multiple_select',  # or 'resize_concat'
                 point_conv_cfg='Conv1D',
                 PFN_feature_strides=[4, 8, 16, 32],
                 PFN_in_channels=[256, 256, 256, 256],
                 PFN_channels=128,
                 PFN_in_index=[0, 1, 2, 3],
                 PFN_dropout_ratio=0,
                 PFN_conv_cfg='Conv2D',
                 PFN_input_transform='multiple_select',
                 align_corners=False,
                 pretrained=None):
        super(PointRend, self).__init__()
        self.backbone = backbone
        self.backbone_indices = backbone_indices
        self.in_channels = [
            self.backbone.feat_channels[i] for i in backbone_indices
        ]
        self.neck = FPNNeck(
            fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes)
        self.pointhead = PointHead(
            in_channels=point_in_channels,
            out_channels=point_out_channels,
            num_classes=num_classes,
            in_index=point_in_index,
            num_fcs=point_num_fcs,
            num_points=point_num_points,
            oversample_ratio=point_oversample_ratio,
            importance_sample_ratio=point_importance_sample_ratio,
            scale_factor=point_scale_factor,
            subdivision_steps=point_subdivision_steps,
            subdivision_num_points=point_subdivision_num_points,
            dropout_ratio=point_dropout_ratio,
            align_corners=align_corners,
            coarse_pred_each_layer=point_coarse_pred_each_layer,
            input_transform=point_input_transform,  # or 'resize_concat'
            conv_cfg=point_conv_cfg)
        self.fpnhead = FPNHead(
            feature_strides=PFN_feature_strides,
            in_channels=PFN_in_channels,
            channels=PFN_channels,
            num_class=num_classes,
            in_index=PFN_in_index,
            dropout_ratio=PFN_dropout_ratio,
            conv_cfg=PFN_conv_cfg,
            input_transform=PFN_input_transform,
            align_corners=align_corners)
        self.align_corners = align_corners
        self.pretrained = pretrained
        self.init_weight()

    def forward(self, x):
        feats = self.backbone(x)
        feats = [feats[i] for i in self.backbone_indices]
        fpn_feats = self.neck(
            feats)  # e.g. [n, 256, 64, 128] * 3 and [n, 256, 128, 256]
        pfn_logits = self.fpnhead(
            fpn_feats
        )  # coarse prediction (decode_head[0]), e.g. 512x1024 -> [n, 19, 64, 128]
        point_logits = self.pointhead(
            fpn_feats, pfn_logits)  # point refinement (decode_head[1])

        if self.training:
            logit_list = [
                F.interpolate(
                    logit,
                    paddle.shape(x)[2:],
                    mode='bilinear',
                    align_corners=self.align_corners) for logit in pfn_logits
            ]
            logit_list.append(point_logits)
        else:
            logit_list = [
                F.interpolate(
                    logit,
                    paddle.shape(x)[2:],
                    mode='bilinear',
                    align_corners=self.align_corners) for logit in point_logits
            ]
        return logit_list

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)
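

# A minimal usage sketch (illustrative only, not part of the module; assumes a
# PaddleSeg backbone such as ResNet50_vd that exposes `feat_channels`):
#
#     import paddle
#     from paddleseg.models.backbones import ResNet50_vd
#
#     backbone = ResNet50_vd()
#     model = PointRend(
#         num_classes=19, backbone=backbone, backbone_indices=(0, 1, 2, 3))
#     logits = model(paddle.rand([2, 3, 512, 1024]))  # list of logit tensors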


class PointHead(nn.Layer):
    """
    The PointHead implementation based on PaddlePaddle.

    PointHead uses a shared multi-layer perceptron (equivalent to
    nn.Conv1D) to predict the logits of input points. The fine-grained feature
    and the coarse feature are concatenated together for prediction.

    The original article refers to:
    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
    (https://arxiv.org/abs/1912.08193)

    Args:
        num_classes (int): Number of classes for logits. Default: 19.
        num_fcs (int, optional): Number of fc layers in the head. Default: 3.
        in_channels (list): Input channels of the fc block. Default: [256].
        out_channels (int, optional): Fc block's output channels. Default: 256.
        in_index (list): The indices of input features to use. Default: [0].
        num_points (int, optional): The number of points sampled in training mode. Default: 2048.
        oversample_ratio (int, optional): The oversample ratio of points in training mode,
            where sampled_point = num_points * oversample_ratio. Default: 3.
        importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points. Default: 0.75.
        scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage at inference. Default: 2.
        subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage at inference. Default: 2.
        subdivision_num_points (int, optional): The number of points used to refine seg logits at inference. Default: 8196.
        dropout_ratio (float, optional): If dropout_ratio > 0, a Dropout layer with p=dropout_ratio is applied before the output. Default: 0.1.
        coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
            the output of each fc layer. Default: True.
        conv_cfg (str): The config of Conv. Default: 'Conv1D'.
        input_transform (str): The feature transform method for inputs;
            see function '_transform_inputs'. Default: 'multiple_select'.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
    """

    def __init__(self,
                 num_classes=19,
                 num_fcs=3,
                 in_channels=[256],
                 out_channels=256,
                 in_index=[0],
                 num_points=2048,
                 oversample_ratio=3,
                 importance_sample_ratio=0.75,
                 scale_factor=2,
                 subdivision_steps=2,
                 subdivision_num_points=8196,
                 dropout_ratio=0.1,
                 coarse_pred_each_layer=True,
                 conv_cfg='Conv1D',
                 input_transform='multiple_select',  # or 'resize_concat'
                 align_corners=False):
        super(PointHead, self).__init__()

        self.in_channels = in_channels
        self.channels = out_channels
        self.in_index = in_index
        self.num_classes = num_classes
        self.num_fcs = num_fcs
        self.num_points = num_points
        self.oversample_ratio = oversample_ratio
        self.importance_sample_ratio = importance_sample_ratio
        self.scale_factor = scale_factor
        self.subdivision_steps = subdivision_steps
        self.subdivision_num_points = paddle.to_tensor(
            subdivision_num_points, dtype="int32")
        self.dropout_ratio = dropout_ratio
        self.coarse_pred_each_layer = coarse_pred_each_layer
        self.align_corners = align_corners
        self.input_transform = input_transform
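
        # The first fc consumes the fine-grained point features concatenated
        # with the coarse per-point logits (num_classes channels); when
        # coarse_pred_each_layer is set, the coarse logits are re-appended
        # after every fc, so each layer keeps direct access to them.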
        fc_in_channels = sum(self.in_channels) + self.num_classes
        fc_channels = self.channels
        self.fcs = nn.LayerList()
        for k in range(num_fcs):
            fc = ConvModule(
                fc_in_channels,
                fc_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg)
            self.fcs.append(fc)
            fc_in_channels = fc_channels
            fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0
        self.fc_seg = nn.Conv1D(
            fc_in_channels,
            self.num_classes,
            kernel_size=1,
            stride=1,
            padding=0)

        if self.dropout_ratio > 0:
            self.dropout = nn.Dropout(self.dropout_ratio)
        else:
            self.dropout = None

    def cls_seg(self, feat):
        """Classify each point with fc."""
        if self.dropout is not None:
            feat = self.dropout(feat)
        output = self.fc_seg(feat)
        return output

    def _get_fine_grained_point_feats(self, x, points):
        """
        Sample from fine-grained features.

        Args:
            x (list[Tensor]): Feature pyramid from neck or backbone.
            points (Tensor): Point coordinates, shape (batch_size,
                num_points, 2).
        Returns:
            fine_grained_feats (Tensor): Sampled fine-grained features,
                shape (batch_size, sum(channels of x), num_points).
        """
        fine_grained_feats_list = [
            point_sample(
                feats, points, align_corners=self.align_corners) for feats in x
        ]
        if len(fine_grained_feats_list) > 1:
            fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1)
        else:
            fine_grained_feats = fine_grained_feats_list[0]
        return fine_grained_feats

    def _get_coarse_point_feats(self, prev_output, points):
        """
        Sample from the coarse prediction of the previous decode head.

        Args:
            prev_output (Tensor): Prediction of the previous decode head.
            points (Tensor): Point coordinates, shape (batch_size,
                num_points, 2).
        Returns:
            coarse_feats (Tensor): Sampled coarse features, shape (batch_size,
                num_classes, num_points).
        """
        coarse_feats = point_sample(
            prev_output, points, align_corners=self.align_corners)
        return coarse_feats

    def _transform_inputs(self, inputs):
        """
        Transform inputs for decoder.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
        Returns:
            Tensor: The transformed inputs.
        """
        if self.input_transform == 'resize_concat':
            inputs = [inputs[i] for i in self.in_index]
            upsampled_inputs = [
                F.interpolate(
                    x,
                    size=paddle.shape(inputs[0])[2:],
                    mode='bilinear',
                    align_corners=self.align_corners) for x in inputs
            ]
            inputs = paddle.concat(upsampled_inputs, axis=1)
        elif self.input_transform == 'multiple_select':
            inputs = [inputs[i] for i in self.in_index]
        else:
            inputs = inputs[self.in_index[0]]
        return inputs

    def get_points_train(self, seg_logits, uncertainty_func):
        """
        Sample points for training.

        Sample points in [0, 1] x [0, 1] coordinate space based on their
        uncertainty. The uncertainties are calculated for each point using
        the 'uncertainty_func' function that takes the point's logit
        prediction as input.

        Args:
            seg_logits (Tensor): Semantic segmentation logits, shape (
                batch_size, num_classes, height, width).
            uncertainty_func (func): uncertainty calculation function.
        Returns:
            point_coords (Tensor): A tensor of shape (batch_size, num_points,
                2) that contains the coordinates of ``num_points`` sampled
                points.
        """
        num_points = self.num_points
        oversample_ratio = self.oversample_ratio
        importance_sample_ratio = self.importance_sample_ratio
        assert oversample_ratio >= 1
        assert 0 <= importance_sample_ratio <= 1

        batch_size = paddle.shape(seg_logits)[0]
        num_sampled = int(num_points * oversample_ratio)
        point_coords = paddle.rand([batch_size, num_sampled, 2])
        point_logits = point_sample(seg_logits, point_coords)
        # It is crucial to calculate uncertainty based on the sampled
        # prediction value for the points. Calculating uncertainties of the
        # coarse predictions first and sampling them for points leads to
        # incorrect results. To illustrate this: assume uncertainty_func(
        # logits) = -abs(logits); a sampled point between two coarse
        # predictions with -1 and 1 logits has 0 logits, and therefore 0
        # uncertainty value. However, if we calculate uncertainties for the
        # coarse predictions first, both will have -1 uncertainty, and the
        # sampled point will get -1 uncertainty.
        point_uncertainties = uncertainty_func(point_logits)
        num_uncertain_points = int(importance_sample_ratio * num_points)
        num_random_points = num_points - num_uncertain_points
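        # paddle.topk returns per-sample indices into the oversampled point
        # set; adding a per-sample `shift` of num_sampled turns them into
        # global row indices, so a single index_select over the flattened
        # (batch * num_sampled, 2) coords gathers the whole batch at once.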
        idx = paddle.topk(
            point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1]
        shift = num_sampled * paddle.arange(batch_size, dtype='int64')
        idx += shift.unsqueeze([-1])
        idx = idx.reshape([-1])
        point_coords = paddle.index_select(
            point_coords.reshape([-1, 2]), idx, axis=0)
        point_coords = point_coords.reshape(
            [batch_size, num_uncertain_points, 2])
        if num_random_points > 0:
            rand_point_coords = paddle.rand([batch_size, num_random_points, 2])
            point_coords = paddle.concat(
                (point_coords, rand_point_coords), axis=1)
        return point_coords

    def get_points_test(self, seg_logits, uncertainty_func):
        """
        Sample points for testing.

        Find ``num_points`` most uncertain points from ``uncertainty_map``.

        Args:
            seg_logits (Tensor): A tensor of shape (batch_size, num_classes,
                height, width) for class-specific or class-agnostic prediction.
            uncertainty_func (func): uncertainty calculation function.
        Returns:
            point_indices (Tensor): A tensor of shape (batch_size, num_points)
                that contains indices from [0, height x width) of the most
                uncertain points.
            point_coords (Tensor): A tensor of shape (batch_size, num_points,
                2) that contains [0, 1] x [0, 1] normalized coordinates of the
                most uncertain points from the ``height x width`` grid.
        """
        num_points = self.subdivision_num_points
        uncertainty_map = uncertainty_func(seg_logits)
        batch_size = paddle.shape(uncertainty_map)[0]
        height = paddle.shape(uncertainty_map)[2]
        width = paddle.shape(uncertainty_map)[3]
        h_step = 1.0 / height
        w_step = 1.0 / width

        uncertainty_map = uncertainty_map.reshape([batch_size, height * width])
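        # Never ask for more points than the grid has cells.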
        num_points = paddle.min(paddle.concat([height * width, num_points]))
        point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1]
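        # Convert flat grid indices to normalized (x, y) coordinates at the
        # cell centers: index % width is the column, index // width the row.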
        point_coords = paddle.zeros(
            [batch_size, num_points, 2], dtype='float32')
        point_coords[:, :, 0] = w_step / 2.0 + (
            point_indices % width).astype('float32') * w_step
        point_coords[:, :, 1] = h_step / 2.0 + (
            point_indices // width).astype('float32') * h_step
        return point_indices, point_coords

    def scatter_paddle(self, refined_seg_logits, point_indices, point_logits):
        """
        Paddle version of scatter: equivalent to the PyTorch
        scatter(-1, point_indices, point_logits).

        Args:
            refined_seg_logits (Tensor): shape=[batch_size, channels, height * width]
            point_indices (Tensor): shape=[batch_size, channels, num_points]
            point_logits (Tensor): shape=[batch_size, channels, num_points]
        Returns:
            Tensor: The scattered refined_seg_logits.
        """
        original_shape = paddle.shape(
            refined_seg_logits)  # [batch_size, channels, height * width]
        new_refined_seg_logits = refined_seg_logits.flatten(0, 1)  # [N*C, H*W]
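        # paddle.scatter writes whole rows along axis 0, so the batch and
        # channel dims are folded into the indices: each (batch, channel) row
        # gets an offset into the fully flattened tensor, after which a single
        # 1-D scatter emulates torch's scatter along the last axis.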
        offsets = (
            paddle.arange(paddle.shape(new_refined_seg_logits)[0]) *
            paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1)  # [N*C, 1]
        point_indices = point_indices.flatten(0, 1)  # [N*C, num_points]
        new_point_indices = (point_indices + offsets).flatten()
        point_logits = point_logits.flatten()  # [N*C*num_points]
        refined_seg_logits = paddle.scatter(
            refined_seg_logits.flatten(),
            new_point_indices,
            point_logits,
            overwrite=True)
        return refined_seg_logits.reshape(shape=original_shape)

    def forward_train(self, x, prev_output):
        with paddle.no_grad():
            points = self.get_points_train(prev_output, calculate_uncertainty)

        fine_grained_point_feats = self._get_fine_grained_point_feats(
            x, points)  # e.g. [2, 256, 2048]
        coarse_point_feats = self._get_coarse_point_feats(
            prev_output, points)  # e.g. [2, 19, 2048]
        # forward for train
        fusion_point_feats = paddle.concat(
            [fine_grained_point_feats, coarse_point_feats], axis=1)
        for fc in self.fcs:
            fusion_point_feats = fc(fusion_point_feats)
            if self.coarse_pred_each_layer:
                fusion_point_feats = paddle.concat(
                    (fusion_point_feats, coarse_point_feats), axis=1)
        point_logits = self.cls_seg(fusion_point_feats)
        return [point_logits, points]  # for points loss

    def forward(self, inputs, prev_output):
        """
        Forward function.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
            prev_output (list[Tensor]): The output of the previous decode head.
        Returns:
            [point_logits, points]: For points loss when in training.
            [refined_seg_logits]: The refined seg logits when in inference.
        """
        prev_output = prev_output[0]
        x = self._transform_inputs(inputs)
        if self.training:
            return self.forward_train(x, prev_output)
        else:
            refined_seg_logits = prev_output.clone()
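            # Adaptive subdivision: at each step, upsample the current
            # prediction by `scale_factor`, find the `subdivision_num_points`
            # most uncertain locations, and re-predict only those points.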
            for _ in range(self.subdivision_steps):
                refined_seg_logits = F.interpolate(
                    refined_seg_logits,
                    scale_factor=self.scale_factor,
                    mode='bilinear',
                    align_corners=self.align_corners)

                save_shape = paddle.shape(refined_seg_logits)
                point_indices, points = self.get_points_test(
                    refined_seg_logits, calculate_uncertainty)
                fine_grained_point_feats = self._get_fine_grained_point_feats(
                    x, points)
                coarse_point_feats = self._get_coarse_point_feats(
                    prev_output, points)
                # forward for inference
                fusion_point_feats = paddle.concat(
                    [fine_grained_point_feats, coarse_point_feats], axis=1)
                for fc in self.fcs:
                    fusion_point_feats = fc(fusion_point_feats)
                    if self.coarse_pred_each_layer:
                        fusion_point_feats = paddle.concat(
                            (fusion_point_feats, coarse_point_feats), axis=1)
                point_logits = self.cls_seg(fusion_point_feats)

                point_indices = paddle.unsqueeze(point_indices, axis=1)
                point_indices = paddle.expand(point_indices,
                                              [-1, save_shape[1], -1])

                refined_seg_logits = paddle.flatten(refined_seg_logits, 2)
                refined_seg_logits = self.scatter_paddle(
                    refined_seg_logits, point_indices,
                    point_logits)  # scatter along the height * width dim
                refined_seg_logits = refined_seg_logits.reshape(save_shape)
            return [refined_seg_logits]


class FPNHead(nn.Layer):
    """
    This head is the implementation of Semantic FPN in PaddlePaddle.

    The original article refers to:
    Kirillov, A., et al. "Panoptic Feature Pyramid Networks."
    (https://arxiv.org/abs/1901.02446)

    Args:
        num_class (int): The unique number of target classes. Default: 19.
        feature_strides (list): The strides for input feature maps; all strides are supposed to be powers of 2, and the first
            one is of largest resolution. Default: [4, 8, 16, 32].
        in_channels (list): The input features' channels list. Default: [256, 256, 256, 256].
        channels (int, optional): The output channels of each scale_head's Conv before the Upsample block. Default: 128.
        in_index (list): The indices of input features to use; its length should match that of in_channels. Default: [0, 1, 2, 3].
        dropout_ratio (float, optional): If dropout_ratio > 0, a Dropout layer with p=dropout_ratio is applied before the output. Default: 0.1.
        conv_cfg (str): The config of Conv. Default: 'Conv2D'.
        input_transform (str): The feature transform method for inputs; see function '_transform_inputs'. Default: 'multiple_select'.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
    """

    def __init__(self,
                 num_class=19,
                 feature_strides=[4, 8, 16, 32],
                 in_channels=[256, 256, 256, 256],
                 channels=128,
                 in_index=[0, 1, 2, 3],
                 dropout_ratio=0.1,
                 conv_cfg='Conv2D',
                 input_transform='multiple_select',
                 align_corners=False):
        super(FPNHead, self).__init__()
        assert len(feature_strides) == len(in_channels)
        assert min(feature_strides) == feature_strides[0]
        self.feature_strides = feature_strides
        self.in_channels = in_channels
        self.channels = channels
        self.in_index = in_index
        self.num_class = num_class
        self.conv_cfg = conv_cfg
        self.dropout_ratio = dropout_ratio
        self.input_transform = input_transform
        self.align_corners = align_corners
        self.scale_heads = nn.LayerList()
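        # Each scale head brings level i from stride feature_strides[i] down
        # to the stride of the finest level, using log2(stride_i / stride_0)
        # Conv-ReLU + 2x upsample blocks (at least one conv per level).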
        for i in range(len(feature_strides)):
            head_length = max(
                1,
                int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
            scale_head = []
            for k in range(head_length):
                scale_head.append(
                    ConvModule(
                        self.in_channels[i] if k == 0 else self.channels,
                        self.channels,
                        3,
                        padding=1,
                        conv_cfg=self.conv_cfg))
                if feature_strides[i] != feature_strides[0]:
                    scale_head.append(
                        Upsample(
                            scale_factor=2,
                            mode='bilinear',
                            align_corners=self.align_corners))
            self.scale_heads.append(nn.Sequential(*scale_head))

        self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1)

        if self.dropout_ratio is not None:
            self.dropout = nn.Dropout2D(self.dropout_ratio)
        else:
            self.dropout = None

    def cls_seg(self, feat):
        if self.dropout is not None:
            feat = self.dropout(feat)
        output = self.conv_seg(feat)
        return output

    def _transform_inputs(self, inputs):
        """
        Transform inputs for decoder.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
        Returns:
            Tensor: The transformed inputs.
        """
        if self.input_transform == 'resize_concat':
            inputs = [inputs[i] for i in self.in_index]
            upsampled_inputs = [
                F.interpolate(
                    x,
                    size=paddle.shape(inputs[0])[2:],
                    mode='bilinear',
                    align_corners=self.align_corners) for x in inputs
            ]
            inputs = paddle.concat(upsampled_inputs, axis=1)
        elif self.input_transform == 'multiple_select':
            inputs = [inputs[i] for i in self.in_index]
        else:
            inputs = inputs[self.in_index[0]]
        return inputs

    def forward(self, inputs):
        x = self._transform_inputs(inputs)
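        # Run each scale head on its level and sum the results at the
        # resolution of the finest level before the final 1x1 classifier.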
        output = self.scale_heads[0](x[0])
        for i in range(1, len(self.feature_strides)):
            output = output + F.interpolate(
                self.scale_heads[i](x[i]),
                size=paddle.shape(output)[2:],
                mode='bilinear',
                align_corners=self.align_corners)
        output = self.cls_seg(output)
        return [output]


class FPNNeck(nn.Layer):
    """
    The FPN Neck implementation in PaddlePaddle.

    Args:
        fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for lateral_conv construction. Default: [256, 512, 1024, 2048].
        fpn_outplanes (int, optional): The output channels. Default: 256.
    """

    def __init__(self,
                 fpn_inplanes=[256, 512, 1024, 2048],
                 fpn_outplanes=256):
        super(FPNNeck, self).__init__()
        self.lateral_convs = []
        self.fpn_out = []

        # FPN head
        for fpn_inplane in fpn_inplanes:
            self.lateral_convs.append(
                nn.Sequential(
                    nn.Conv2D(fpn_inplane, fpn_outplanes, 1),
                    layers.SyncBatchNorm(fpn_outplanes), nn.ReLU()))
            self.fpn_out.append(
                nn.Sequential(
                    layers.ConvBNReLU(
                        fpn_outplanes, fpn_outplanes, 3, bias_attr=False)))
        self.lateral_convs = nn.LayerList(self.lateral_convs)
        self.fpn_out = nn.LayerList(self.fpn_out)

    def forward(self, conv_out):
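        # Top-down pathway: start from the coarsest lateral feature, then
        # repeatedly upsample, add the next finer lateral feature, and apply
        # a 3x3 Conv-BN-ReLU to smooth the merged map.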
        last_out = self.lateral_convs[-1](conv_out[-1])
        f = last_out
        fpn_feature_list = [last_out]
        for i in reversed(range(len(conv_out) - 1)):
            conv_x = conv_out[i]
            conv_x = self.lateral_convs[i](conv_x)
            prev_shape = paddle.shape(conv_x)[2:]
            f = conv_x + F.interpolate(
                f, prev_shape, mode='bilinear', align_corners=True)
            fpn_feature_list.append(self.fpn_out[i](f))
        return fpn_feature_list


class ConvModule(nn.Layer):
    """
    A Conv1D/Conv2D block with an optional SyncBatchNorm, followed by ReLU.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 padding=0,
                 stride=1,
                 conv_cfg='Conv1D',
                 norm_cfg='None',
                 **kwargs):
        super().__init__()
        if conv_cfg == 'Conv1D':
            self._conv = nn.Conv1D(
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                padding=padding,
                **kwargs)
        if conv_cfg == 'Conv2D':
            self._conv = nn.Conv2D(
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                padding=padding,
                **kwargs)
        if 'data_format' in kwargs:
            data_format = kwargs['data_format']
        else:
            data_format = 'NCHW'
        if norm_cfg != 'None':
            self._batch_norm = layers.SyncBatchNorm(
                out_channels, data_format=data_format)
        else:
            self._batch_norm = None

    def forward(self, x):
        x = self._conv(x)
        if self._batch_norm is not None:
            x = self._batch_norm(x)
        x = F.relu(x)
        return x


class Upsample(nn.Layer):
    """
    Upsample Module.
    """

    def __init__(self,
                 size=None,
                 scale_factor=None,
                 mode='nearest',
                 align_corners=None):
        super(Upsample, self).__init__()
        self.size = size
        if isinstance(scale_factor, tuple):
            self.scale_factor = tuple(float(factor) for factor in scale_factor)
        else:
            self.scale_factor = float(scale_factor) if scale_factor else None
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        if not self.size:
            return F.interpolate(x, None, self.scale_factor, self.mode,
                                 self.align_corners)
        else:
            return F.interpolate(x, self.size, None, self.mode,
                                 self.align_corners)


def point_sample(input, points, align_corners=False, **kwargs):
    """
    A wrapper around :func:`grid_sample` to support 3D point_coords tensors.
    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
    lie inside the ``[0, 1] x [0, 1]`` square.

    Args:
        input (Tensor): Feature map, shape (N, C, H, W).
        points (Tensor): Image based absolute point coordinates (normalized),
            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
        align_corners (bool): Whether align_corners. Default: False.
    Returns:
        Tensor: Features of `points` on `input`, shape (N, C, P) or
            (N, C, Hgrid, Wgrid).
    """

    def denormalize(grid):
        """Denormalize input grid from range [0, 1] to [-1, 1].

        Args:
            grid (Tensor): The grid to be denormalized, range [0, 1].
        Returns:
            Tensor: Denormalized grid, range [-1, 1].
        """
        return grid * 2.0 - 1.0
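
    # grid_sample expects a 4-D grid (N, Hgrid, Wgrid, 2); a 3-D (N, P, 2)
    # point tensor is temporarily viewed as a degenerate (N, P, 1, 2) grid
    # and the extra axis is squeezed away afterwards.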
    add_dim = False
    if points.dim() == 3:
        add_dim = True
        points = paddle.unsqueeze(points, axis=2)
    output = F.grid_sample(
        input, denormalize(points), align_corners=align_corners, **kwargs)
    if add_dim:
        output = paddle.squeeze(output, axis=3)
    return output


def calculate_uncertainty(seg_logits):
    """
    Estimate uncertainty based on seg logits.

    For each location of the prediction ``seg_logits`` we estimate
    uncertainty as the difference between the top first and top second
    predicted logits.

    Args:
        seg_logits (Tensor): Semantic segmentation logits,
            shape (batch_size, num_classes, height, width).
    Returns:
        scores (Tensor): The uncertainty scores, with the most uncertain
            locations having the highest uncertainty score, shape (
            batch_size, 1, height, width).
    """
    top2_scores = paddle.topk(seg_logits, k=2, axis=1)[0]
    return paddle.unsqueeze(top2_scores[:, 1] - top2_scores[:, 0], axis=1)