deanna-emery's picture
updates
93528c6
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes to build various prediction heads in all supported models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf, tf_keras
from official.legacy.detection.modeling.architecture import nn_ops
from official.legacy.detection.ops import spatial_transform_ops
class RpnHead(tf_keras.layers.Layer):
  """Region Proposal Network head.

  A single 3x3 convolution feeds two sibling 1x1 convolutions that predict the
  proposal scores and the box regression deltas. The convolutions are reused
  for every feature level, whereas each level owns its normalization layer.
  """

  def __init__(
      self,
      min_level,
      max_level,
      anchors_per_location,
      num_convs=2,
      num_filters=256,
      use_separable_conv=False,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Creates the layers of the Region Proposal Network head.

    Args:
      min_level: `int`, lowest feature level the head is applied to.
      max_level: `int`, highest feature level the head is applied to.
      anchors_per_location: `int`, number of anchors per pixel location.
      num_convs: `int`, number of intermediate conv layers before the
        prediction. Accepted for interface compatibility; this constructor
        does not read it.
      num_filters: `int`, number of filters of the intermediate conv layer.
      use_separable_conv: `bool`, whether separable conv layers are used.
      activation: `str`, either 'relu' or 'swish'. It is only fused into the
        intermediate conv when `use_batch_norm` is False.
      use_batch_norm: `bool`, whether a per-level normalization layer follows
        the intermediate conv.
      norm_activation: a callable building a normalization layer followed by
        an optional activation layer.

    Raises:
      ValueError: if `activation` is neither 'relu' nor 'swish'.
    """
    super().__init__(autocast=False)

    self._min_level = min_level
    self._max_level = max_level
    self._anchors_per_location = anchors_per_location

    if activation not in ('relu', 'swish'):
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._activation_op = tf.nn.relu if activation == 'relu' else tf.nn.swish
    self._use_batch_norm = use_batch_norm

    # Factory shared by the three convolutions below.
    if not use_separable_conv:
      self._conv2d_op = functools.partial(
          tf_keras.layers.Conv2D,
          kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf_keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())

    # When normalization is on, the activation is left to the norm layer.
    fused_activation = self._activation_op
    if self._use_batch_norm:
      fused_activation = None
    self._rpn_conv = self._conv2d_op(
        num_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        activation=fused_activation,
        padding='same',
        name='rpn')

    pointwise_kwargs = dict(
        kernel_size=(1, 1), strides=(1, 1), padding='valid')
    self._rpn_class_conv = self._conv2d_op(
        anchors_per_location, name='rpn-class', **pointwise_kwargs)
    self._rpn_box_conv = self._conv2d_op(
        4 * anchors_per_location, name='rpn-box', **pointwise_kwargs)

    # One normalization layer per level, keyed by the level number.
    self._norm_activations = {}
    if self._use_batch_norm:
      for lvl in range(self._min_level, self._max_level + 1):
        self._norm_activations[lvl] = norm_activation(
            name='rpn-l%d-bn' % lvl)

  def _shared_rpn_heads(self, features, anchors_per_location, level,
                        is_training):
    """Runs the shared RPN convolutions on the features of one level.

    Args:
      features: feature tensor of the given level.
      anchors_per_location: not read by this method.
      level: `int`, selects the normalization layer of that level.
      is_training: forwarded to the normalization layer.

    Returns:
      A `(scores, bboxes)` tuple: the outputs of the classification and of the
      box regression convolutions.
    """
    hidden = self._rpn_conv(features)
    if self._use_batch_norm:
      # Normalization layers are per level, unlike the convolutions.
      level_norm = self._norm_activations[level]
      hidden = level_norm(hidden, is_training=is_training)
    # Proposal classification scores, then proposal bbox regression deltas.
    return self._rpn_class_conv(hidden), self._rpn_box_conv(hidden)

  def call(self, features, is_training=None):
    """Applies the head to every level from `min_level` to `max_level`.

    Args:
      features: mapping indexed by level number to the feature tensor.
      is_training: forwarded to the normalization layers.

    Returns:
      A `(scores_outputs, box_outputs)` tuple of dicts keyed by level.
    """
    scores_outputs, box_outputs = {}, {}
    with tf.name_scope('rpn_head'):
      for lvl in range(self._min_level, self._max_level + 1):
        lvl_scores, lvl_boxes = self._shared_rpn_heads(
            features[lvl], self._anchors_per_location, lvl, is_training)
        scores_outputs[lvl] = lvl_scores
        box_outputs[lvl] = lvl_boxes
    return scores_outputs, box_outputs
class OlnRpnHead(tf_keras.layers.Layer):
  """Region Proposal Network for Object Localization Network (OLN).

  Compared to `RpnHead`, the intermediate features are L2-normalized and an
  additional centerness branch is predicted. `__call__` is overridden
  directly, so the head runs as a plain callable.
  """

  def __init__(
      self,
      min_level,
      max_level,
      anchors_per_location,
      num_convs=2,
      num_filters=256,
      use_separable_conv=False,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build Region Proposal Network head.

    Args:
      min_level: `int` number of minimum feature level.
      max_level: `int` number of maximum feature level.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the prediction. Not read by this constructor.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        is used.
      activation: activation function. Support 'relu' and 'swish'.
      use_batch_norm: 'bool', indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer.

    Raises:
      ValueError: if `activation` is neither 'relu' nor 'swish'.
    """
    # Initialize the Keras base class before assigning any attribute, as the
    # sibling `RpnHead` does; without it the Layer state is never set up.
    super().__init__(autocast=False)

    self._min_level = min_level
    self._max_level = max_level
    self._anchors_per_location = anchors_per_location
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm

    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf_keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf_keras.layers.Conv2D,
          kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
          bias_initializer=tf.zeros_initializer())

    self._rpn_conv = self._conv2d_op(
        num_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        # With batch norm the activation is left to the norm layer.
        activation=(None if self._use_batch_norm else self._activation_op),
        padding='same',
        name='rpn')
    self._rpn_class_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-class')
    self._rpn_box_conv = self._conv2d_op(
        4 * anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-box-lrtb')
    self._rpn_center_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-centerness')

    # One normalization layer per level, keyed by the level number.
    self._norm_activations = {}
    if self._use_batch_norm:
      for level in range(self._min_level, self._max_level + 1):
        self._norm_activations[level] = norm_activation(
            name='rpn-l%d-bn' % level)

  def _shared_rpn_heads(self, features, anchors_per_location, level,
                        is_training):
    """Shared RPN heads.

    Args:
      features: feature tensor of the given level.
      anchors_per_location: not read by this method.
      level: `int`, selects the normalization layer of that level.
      is_training: forwarded to the normalization layer.

    Returns:
      A `(scores, bboxes, centers)` tuple with the outputs of the
      classification, box regression and centerness convolutions.
    """
    features = self._rpn_conv(features)
    if self._use_batch_norm:
      # The batch normalization layers are not shared between levels.
      features = self._norm_activations[level](
          features, is_training=is_training)
    # Feature L2 normalization for training stability
    features = tf.math.l2_normalize(
        features,
        axis=-1,
        name='rpn-norm',)
    # Proposal classification scores
    scores = self._rpn_class_conv(features)
    # Proposal bbox regression deltas
    bboxes = self._rpn_box_conv(features)
    # Proposal centerness scores
    centers = self._rpn_center_conv(features)
    return scores, bboxes, centers

  def __call__(self, features, is_training=None):
    """Applies the head to every level from `min_level` to `max_level`.

    Args:
      features: mapping indexed by level number to the feature tensor.
      is_training: forwarded to the normalization layers.

    Returns:
      A `(scores_outputs, box_outputs, center_outputs)` tuple of dicts keyed
      by level.
    """
    scores_outputs = {}
    box_outputs = {}
    center_outputs = {}
    with tf.name_scope('rpn_head'):
      for level in range(self._min_level, self._max_level + 1):
        scores_output, box_output, center_output = self._shared_rpn_heads(
            features[level], self._anchors_per_location, level, is_training)
        scores_outputs[level] = scores_output
        box_outputs[level] = box_output
        center_outputs[level] = center_output
    return scores_outputs, box_outputs, center_outputs
class FastrcnnHead(tf_keras.layers.Layer):
  """Fast R-CNN box head.

  Optional 3x3 convolutions, then fully connected layers, then two sibling
  dense layers predicting the class scores and the per-class box outputs.
  """

  def __init__(
      self,
      num_classes,
      num_convs=0,
      num_filters=256,
      use_separable_conv=False,
      num_fcs=2,
      fc_dims=1024,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Creates the layers of the Fast R-CNN box head.

    Args:
      num_classes: `int`, number of classes.
      num_convs: `int`, number of conv layers placed before the FC layers.
      num_filters: `int`, number of filters of those conv layers.
      use_separable_conv: `bool`, whether separable conv layers are used.
      num_fcs: `int`, number of FC layers placed before the predictions.
      fc_dims: `int`, number of units of each FC layer.
      activation: `str`, either 'relu' or 'swish'. It is only fused into the
        conv/FC layers when `use_batch_norm` is False.
      use_batch_norm: `bool`, whether a normalization layer follows each
        conv/FC layer.
      norm_activation: a callable building a normalization layer followed by
        an optional activation layer.

    Raises:
      ValueError: if `activation` is neither 'relu' nor 'swish'.
    """
    super(FastrcnnHead, self).__init__(autocast=False)

    if activation not in ('relu', 'swish'):
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._activation_op = tf.nn.relu if activation == 'relu' else tf.nn.swish

    self._num_classes = num_classes
    self._num_convs = num_convs
    self._num_filters = num_filters
    self._num_fcs = num_fcs
    self._fc_dims = fc_dims
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    if not use_separable_conv:
      self._conv2d_op = functools.partial(
          tf_keras.layers.Conv2D,
          kernel_initializer=tf_keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf_keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())

    # When normalization is on, the activation is left to the norm layer.
    fused_activation = self._activation_op
    if self._use_batch_norm:
      fused_activation = None

    self._conv_ops = []
    self._conv_bn_ops = []
    for conv_idx in range(self._num_convs):
      conv_layer = self._conv2d_op(
          self._num_filters,
          kernel_size=(3, 3),
          strides=(1, 1),
          padding='same',
          dilation_rate=(1, 1),
          activation=fused_activation,
          name='conv_{}'.format(conv_idx))
      self._conv_ops.append(conv_layer)
      if self._use_batch_norm:
        self._conv_bn_ops.append(self._norm_activation())

    self._fc_ops = []
    self._fc_bn_ops = []
    for fc_idx in range(self._num_fcs):
      fc_layer = tf_keras.layers.Dense(
          units=self._fc_dims,
          activation=fused_activation,
          name='fc{}'.format(fc_idx))
      self._fc_ops.append(fc_layer)
      if self._use_batch_norm:
        self._fc_bn_ops.append(self._norm_activation(fused=False))

    self._class_predict = tf_keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='class-predict')
    self._box_predict = tf_keras.layers.Dense(
        self._num_classes * 4,
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.001),
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')

  def call(self, roi_features, is_training=None):
    """Computes the class and box predictions for a set of ROIs.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters]. The last four dimensions are read
        from the static shape.
      is_training: forwarded to the normalization layers.

    Returns:
      class_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes], representing the class
        predictions.
      box_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes * 4], representing the box
        predictions.
    """
    with tf.name_scope('fast_rcnn_head'):
      static_shape = roi_features.get_shape().as_list()
      _, num_rois, height, width, filters = static_shape

      # Fold the ROI axis into the batch axis for the convolutions.
      net = tf.reshape(roi_features, [-1, height, width, filters])
      for conv_idx in range(self._num_convs):
        net = self._conv_ops[conv_idx](net)
        if self._use_batch_norm:
          net = self._conv_bn_ops[conv_idx](net, is_training=is_training)

      # The convolutions, if any, changed the channel count to num_filters.
      if self._num_convs > 0:
        filters = self._num_filters
      # Flatten every ROI into a vector before the FC layers.
      net = tf.reshape(net, [-1, num_rois, height * width * filters])

      for fc_idx in range(self._num_fcs):
        net = self._fc_ops[fc_idx](net)
        if self._use_batch_norm:
          net = self._fc_bn_ops[fc_idx](net, is_training=is_training)

      class_outputs = self._class_predict(net)
      box_outputs = self._box_predict(net)
      return class_outputs, box_outputs
class OlnBoxScoreHead(tf_keras.layers.Layer):
  """Box head of Object Localization Network (OLN).

  Same structure as `FastrcnnHead` plus a single-unit score branch.
  `__call__` is overridden directly, so the head runs as a plain callable.
  """

  def __init__(
      self,
      num_classes,
      num_convs=0,
      num_filters=256,
      use_separable_conv=False,
      num_fcs=2,
      fc_dims=1024,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build OLN box head.

    Args:
      num_classes: a integer for the number of classes.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the FC layers.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        is used.
      num_fcs: `int` number that represents the number of FC layers before the
        predictions.
      fc_dims: `int` number that represents the number of dimension of the FC
        layers.
      activation: activation function. Support 'relu' and 'swish'.
      use_batch_norm: 'bool', indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer.

    Raises:
      ValueError: if `activation` is neither 'relu' nor 'swish'.
    """
    # Initialize the Keras base class before assigning any attribute, as the
    # sibling `FastrcnnHead` does; without it the Layer state is never set up.
    super().__init__(autocast=False)

    self._num_classes = num_classes
    self._num_convs = num_convs
    self._num_filters = num_filters

    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf_keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf_keras.layers.Conv2D,
          kernel_initializer=tf_keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())

    self._num_fcs = num_fcs
    self._fc_dims = fc_dims
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    self._conv_ops = []
    self._conv_bn_ops = []
    for i in range(self._num_convs):
      self._conv_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              # With batch norm the activation is left to the norm layer.
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='conv_{}'.format(i)))
      if self._use_batch_norm:
        self._conv_bn_ops.append(self._norm_activation())

    self._fc_ops = []
    self._fc_bn_ops = []
    for i in range(self._num_fcs):
      self._fc_ops.append(
          tf_keras.layers.Dense(
              units=self._fc_dims,
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='fc{}'.format(i)))
      if self._use_batch_norm:
        self._fc_bn_ops.append(self._norm_activation(fused=False))

    self._class_predict = tf_keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='class-predict')
    self._box_predict = tf_keras.layers.Dense(
        self._num_classes * 4,
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.001),
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')
    self._score_predict = tf_keras.layers.Dense(
        1,
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='score-predict')

  def __call__(self, roi_features, is_training=None):
    """Box, class and score branches of the OLN box head.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      is_training: `boolean`, if True if model is in training mode.

    Returns:
      class_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes], representing the class predictions.
      box_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes * 4], representing the box
        predictions.
      score_outputs: a tensor with a shape of [batch_size, num_rois, 1],
        the output of the single-unit score layer.
    """
    with tf.name_scope('fast_rcnn_head'):
      # Reshape inputs before FC: fold the ROI axis into the batch axis.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()
      net = tf.reshape(roi_features, [-1, height, width, filters])
      for i in range(self._num_convs):
        net = self._conv_ops[i](net)
        if self._use_batch_norm:
          net = self._conv_bn_ops[i](net, is_training=is_training)

      # The convolutions, if any, changed the channel count to num_filters.
      filters = self._num_filters if self._num_convs > 0 else filters
      net = tf.reshape(net, [-1, num_rois, height * width * filters])

      for i in range(self._num_fcs):
        net = self._fc_ops[i](net)
        if self._use_batch_norm:
          net = self._fc_bn_ops[i](net, is_training=is_training)

      class_outputs = self._class_predict(net)
      box_outputs = self._box_predict(net)
      score_outputs = self._score_predict(net)
      return class_outputs, box_outputs, score_outputs
class MaskrcnnHead(tf_keras.layers.Layer):
  """Mask R-CNN head."""

  def __init__(
      self,
      num_classes,
      mask_target_size,
      num_convs=4,
      num_filters=256,
      use_separable_conv=False,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build the Mask R-CNN head.

    Args:
      num_classes: a integer for the number of classes.
      mask_target_size: a integer that is the resolution of masks.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the prediction.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        is used.
      activation: activation function. Support 'relu' and 'swish'.
      use_batch_norm: 'bool', indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer.

    Raises:
      ValueError: if `activation` is neither 'relu' nor 'swish'.
    """
    super(MaskrcnnHead, self).__init__(autocast=False)
    self._num_classes = num_classes
    self._mask_target_size = mask_target_size

    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf_keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf_keras.layers.Conv2D,
          kernel_initializer=tf_keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    # The normalization layers are created here, once, next to the layer they
    # follow. Building them inside `call` would create fresh layers (and fresh
    # weights) on every invocation.
    self._conv2d_ops = []
    self._conv_norm_ops = []
    for i in range(self._num_convs):
      self._conv2d_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              # With batch norm the activation is left to the norm layer.
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='mask-conv-l%d' % i))
      if self._use_batch_norm:
        self._conv_norm_ops.append(self._norm_activation())

    self._mask_conv_transpose = tf_keras.layers.Conv2DTranspose(
        self._num_filters,
        kernel_size=(2, 2),
        strides=(2, 2),
        padding='valid',
        activation=(None if self._use_batch_norm else self._activation_op),
        kernel_initializer=tf_keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        bias_initializer=tf.zeros_initializer(),
        name='conv5-mask')
    self._mask_norm_op = (
        self._norm_activation() if self._use_batch_norm else None)

    with tf.name_scope('mask_head'):
      self._mask_conv2d_op = self._conv2d_op(
          self._num_classes,
          kernel_size=(1, 1),
          strides=(1, 1),
          padding='valid',
          name='mask_fcn_logits')

  def call(self, roi_features, class_indices, is_training=None):
    """Mask branch for the Mask-RCNN model.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      class_indices: a Tensor of shape [batch_size, num_rois], indicating which
        class the ROI is.
      is_training: `boolean`, if True if model is in training mode.

    Returns:
      mask_outputs: a tensor with a shape of
        [batch_size, num_rois, mask_target_size, mask_target_size], holding
        for every ROI the mask channel selected by `class_indices`.
    """
    with tf.name_scope('mask_head'):
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()
      # Fold the ROI axis into the batch axis for the convolutions.
      net = tf.reshape(roi_features, [-1, height, width, filters])

      for i in range(self._num_convs):
        net = self._conv2d_ops[i](net)
        if self._use_batch_norm:
          net = self._conv_norm_ops[i](net, is_training=is_training)

      net = self._mask_conv_transpose(net)
      if self._use_batch_norm:
        net = self._mask_norm_op(net, is_training=is_training)

      mask_outputs = self._mask_conv2d_op(net)
      # NOTE(review): this reshape assumes the upsampled feature map is
      # mask_target_size x mask_target_size, presumably twice the ROI
      # height/width given the stride-2 transposed conv; confirm with callers.
      mask_outputs = tf.reshape(mask_outputs, [
          -1, num_rois, self._mask_target_size, self._mask_target_size,
          self._num_classes
      ])

      with tf.name_scope('masks_post_processing'):
        # Keep, for every ROI, only the channel of its own class.
        mask_outputs = tf.gather(
            mask_outputs,
            tf.cast(class_indices, tf.int32),
            axis=-1,
            batch_dims=2,
        )
      return mask_outputs
class RetinanetHead(object):
  """RetinaNet head.

  Holds a class tower and a box tower. In each tower the convolutions are
  shared by all feature levels, while every (conv index, level) pair owns its
  normalization layer.
  """

  def __init__(
      self,
      min_level,
      max_level,
      num_classes,
      anchors_per_location,
      num_convs=4,
      num_filters=256,
      use_separable_conv=False,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build RetinaNet head.

    Args:
      min_level: `int` number of minimum feature level.
      max_level: `int` number of maximum feature level.
      num_classes: `int` number of classification categories.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      num_filters: `int` number of filters used in the head architecture.
      use_separable_conv: `bool` to indicate whether to use separable
        convolution.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer.
    """
    self._min_level = min_level
    self._max_level = max_level

    self._num_classes = num_classes
    self._anchors_per_location = anchors_per_location

    self._num_convs = num_convs
    self._num_filters = num_filters
    self._use_separable_conv = use_separable_conv

    # Capture the fully qualified scope names now so that `class_net` and
    # `box_net` re-enter these same scopes later.
    with tf.name_scope('class_net') as scope_name:
      self._class_name_scope = tf.name_scope(scope_name)
    with tf.name_scope('box_net') as scope_name:
      self._box_name_scope = tf.name_scope(scope_name)
    self._build_class_net_layers(norm_activation)
    self._build_box_net_layers(norm_activation)

  def _class_net_batch_norm_name(self, i, level):
    """Returns the key/name of the class-tower norm layer for conv i, level."""
    return 'class-%d-%d' % (i, level)

  def _box_net_batch_norm_name(self, i, level):
    """Returns the key/name of the box-tower norm layer for conv i, level."""
    return 'box-%d-%d' % (i, level)

  def _build_conv2d(self, filters, kernel_stddev, bias_initializer, name):
    """Builds one 3x3 'same' conv, separable or regular per the head config.

    Args:
      filters: `int`, number of output filters.
      kernel_stddev: `float`, stddev of the RandomNormal kernel initializer.
        Only used for the regular convolution; the separable variant keeps
        its default kernel initializers.
      bias_initializer: initializer of the bias.
      name: `str`, name of the layer.

    Returns:
      A `SeparableConv2D` or `Conv2D` layer without activation.
    """
    if self._use_separable_conv:
      return tf_keras.layers.SeparableConv2D(
          filters,
          kernel_size=(3, 3),
          bias_initializer=bias_initializer,
          activation=None,
          padding='same',
          name=name)
    return tf_keras.layers.Conv2D(
        filters,
        kernel_size=(3, 3),
        bias_initializer=bias_initializer,
        kernel_initializer=tf_keras.initializers.RandomNormal(
            stddev=kernel_stddev),
        activation=None,
        padding='same',
        name=name)

  def _build_class_net_layers(self, norm_activation):
    """Build re-usable layers for class prediction network."""
    # The bias equals -log((1 - p) / p) with p = 0.01, i.e. a sigmoid of the
    # initial bias gives 0.01 (presumably the usual focal-loss prior).
    self._class_predict = self._build_conv2d(
        self._num_classes * self._anchors_per_location,
        kernel_stddev=1e-5,
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        name='class-predict')
    self._class_conv = []
    self._class_norm_activation = {}
    for i in range(self._num_convs):
      self._class_conv.append(
          self._build_conv2d(
              self._num_filters,
              kernel_stddev=0.01,
              bias_initializer=tf.zeros_initializer(),
              name='class-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._class_net_batch_norm_name(i, level)
        self._class_norm_activation[name] = norm_activation(name=name)

  def _build_box_net_layers(self, norm_activation):
    """Build re-usable layers for box prediction network."""
    self._box_predict = self._build_conv2d(
        4 * self._anchors_per_location,
        kernel_stddev=1e-5,
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')
    self._box_conv = []
    self._box_norm_activation = {}
    for i in range(self._num_convs):
      self._box_conv.append(
          self._build_conv2d(
              self._num_filters,
              kernel_stddev=0.01,
              bias_initializer=tf.zeros_initializer(),
              name='box-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._box_net_batch_norm_name(i, level)
        self._box_norm_activation[name] = norm_activation(name=name)

  def __call__(self, fpn_features, is_training=None):
    """Returns outputs of RetinaNet head.

    Args:
      fpn_features: mapping indexed by level number to the feature tensor.
      is_training: forwarded to the normalization layers.

    Returns:
      A `(class_outputs, box_outputs)` tuple of dicts keyed by level.
    """
    class_outputs = {}
    box_outputs = {}
    with tf.name_scope('retinanet_head'):
      for level in range(self._min_level, self._max_level + 1):
        features = fpn_features[level]
        class_outputs[level] = self.class_net(
            features, level, is_training=is_training)
        box_outputs[level] = self.box_net(
            features, level, is_training=is_training)
    return class_outputs, box_outputs

  def class_net(self, features, level, is_training=None):
    """Class prediction network for RetinaNet.

    Args:
      features: feature tensor of the given level.
      level: `int`, selects the per-level normalization layers.
      is_training: forwarded to the normalization layers. Defaults to None,
        like in `box_net`.

    Returns:
      The output of the class prediction convolution.
    """
    with self._class_name_scope:
      for i in range(self._num_convs):
        features = self._class_conv[i](features)
        # The convolution layers in the class net are shared among all levels,
        # but each level has its batch normalization to capture the
        # statistical difference among different levels.
        name = self._class_net_batch_norm_name(i, level)
        features = self._class_norm_activation[name](
            features, is_training=is_training)
      classes = self._class_predict(features)
    return classes

  def box_net(self, features, level, is_training=None):
    """Box regression network for RetinaNet.

    Args:
      features: feature tensor of the given level.
      level: `int`, selects the per-level normalization layers.
      is_training: forwarded to the normalization layers.

    Returns:
      The output of the box prediction convolution.
    """
    with self._box_name_scope:
      for i in range(self._num_convs):
        features = self._box_conv[i](features)
        # The convolution layers in the box net are shared among all levels,
        # but each level has its batch normalization to capture the
        # statistical difference among different levels.
        name = self._box_net_batch_norm_name(i, level)
        features = self._box_norm_activation[name](
            features, is_training=is_training)
      boxes = self._box_predict(features)
    return boxes
# TODO(yeqing): Refactor this class when it is ready for var_scope reuse.
class ShapemaskPriorHead(object):
  """ShapeMask Prior head."""

  def __init__(self, num_classes, num_downsample_channels, mask_crop_size,
               use_category_for_mask, shape_prior_path):
    """Initialize params to build the ShapeMask prior head.

    Args:
      num_classes: Number of output classes.
      num_downsample_channels: number of channels in mask branch.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch.
      shape_prior_path: the path to load shape priors. When falsy, all-zero
        priors with a single cluster are used instead.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._shape_prior_path = shape_prior_path
    self._use_category_for_mask = use_category_for_mask

    self._shape_prior_fc = tf_keras.layers.Dense(
        self._num_downsample_channels, name='shape-prior-fc')
    # Created on first use by `_classify_shape_priors`: its width depends on
    # the number of clusters, which is only known once the priors are loaded.
    self._classify_shape_prior_fc = None

  def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
    """Generate the detection priors from the box detections and FPN features.

    This corresponds to the Fig. 4 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      fpn_features: a dictionary of FPN features.
      boxes: a float tensor of shape [batch_size, num_instances, 4] representing
        the tight gt boxes from dataloader/detection.
      outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
        representing the loose gt boxes from dataloader/detection.
      classes: a int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: training mode or not. Not read by this method.

    Returns:
      instance_features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop after the `shape-prior-fc` projection.
      detection_priors: a float Tensor, the class shape priors weighted by
        the predicted cluster distribution and cropped into the target boxes
        (presumably [batch_size, num_instances, mask_crop_size,
        mask_crop_size]; confirm against `crop_mask_in_target_box`).
    """
    with tf.name_scope('prior_mask'):
      batch_size, num_instances, _ = boxes.get_shape().as_list()
      outer_boxes = tf.cast(outer_boxes, tf.float32)
      boxes = tf.cast(boxes, tf.float32)
      instance_features = spatial_transform_ops.multilevel_crop_and_resize(
          fpn_features, outer_boxes, output_size=self._mask_crop_size)
      instance_features = self._shape_prior_fc(instance_features)

      # Also sets self._num_clusters, needed by _classify_shape_priors.
      shape_priors = self._get_priors()

      # Get uniform priors for each outer box.
      uniform_priors = tf.ones([
          batch_size, num_instances, self._mask_crop_size, self._mask_crop_size
      ])
      uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
          uniform_priors, boxes, outer_boxes, self._mask_crop_size)

      # Classify shape priors using uniform priors + instance features.
      prior_distribution = self._classify_shape_priors(
          tf.cast(instance_features, tf.float32), uniform_priors, classes)

      # Weight the priors of each instance's class by the predicted cluster
      # distribution and sum over the cluster axis.
      instance_priors = tf.gather(shape_priors, classes)
      instance_priors *= tf.expand_dims(
          tf.expand_dims(tf.cast(prior_distribution, tf.float32), axis=-1),
          axis=-1)
      instance_priors = tf.reduce_sum(instance_priors, axis=2)
      detection_priors = spatial_transform_ops.crop_mask_in_target_box(
          instance_priors, boxes, outer_boxes, self._mask_crop_size)

      return instance_features, detection_priors

  def _get_priors(self):
    """Load shape priors from file.

    Sets `self._num_clusters` as a side effect.

    Returns:
      A float32 tensor of priors; per the loading convention below its shape
      is [mask_num_classes, num_clusters, height, width].
    """
    # loads class specific or agnostic shape priors
    if self._shape_prior_path:
      # Priors are loaded into shape [mask_num_classes, num_clusters, 32, 32].
      # The file is closed as soon as the array has been read.
      with tf.io.gfile.GFile(self._shape_prior_path, 'rb') as prior_file:
        priors = np.load(prior_file)
      priors = tf.convert_to_tensor(priors, dtype=tf.float32)
      self._num_clusters = priors.get_shape().as_list()[1]
    else:
      # If no prior path is given, do not use priors, i.e., priors equal to
      # an all-zero mask_crop_size x mask_crop_size patch with one cluster.
      self._num_clusters = 1
      priors = tf.zeros([
          self._mask_num_classes, self._num_clusters, self._mask_crop_size,
          self._mask_crop_size
      ])
    return priors

  def _classify_shape_priors(self, features, uniform_priors, classes):
    """Classify the uniform prior by predicting the shape modes.

    Classify the object crop features into K modes of the clusters for each
    category.

    Args:
      features: A float Tensor of shape [batch_size, num_instances, mask_size,
        mask_size, num_channels].
      uniform_priors: A float Tensor of shape [batch_size, num_instances,
        mask_size, mask_size] representing the uniform detection priors.
      classes: A int Tensor of shape [batch_size, num_instances] of detection
        class ids.

    Returns:
      prior_distribution: A float Tensor of shape
        [batch_size, num_instances, num_clusters] representing the classifier
        output probability over all possible shapes.
    """
    batch_size, num_instances, _, _, _ = features.get_shape().as_list()
    features *= tf.expand_dims(uniform_priors, axis=-1)
    # Reduce spatial dimension of features. The features have shape
    # [batch_size, num_instances, num_channels].
    features = tf.reduce_mean(features, axis=(2, 3))

    # Build the classifier once and reuse it, so that repeated calls share
    # the same weights instead of creating a fresh layer every time.
    if getattr(self, '_classify_shape_prior_fc', None) is None:
      self._classify_shape_prior_fc = tf_keras.layers.Dense(
          self._mask_num_classes * self._num_clusters,
          kernel_initializer=tf.random_normal_initializer(stddev=0.01),
          name='classify-shape-prior-fc')
    logits = self._classify_shape_prior_fc(features)
    logits = tf.reshape(
        logits,
        [batch_size, num_instances, self._mask_num_classes, self._num_clusters])

    if self._use_category_for_mask:
      # Keep only the logits of each instance's own class.
      logits = tf.gather(logits, tf.expand_dims(classes, axis=-1), batch_dims=2)
      logits = tf.squeeze(logits, axis=2)
    else:
      logits = logits[:, :, 0, :]

    distribution = tf.nn.softmax(logits, name='shape_prior_weights')
    return distribution
class ShapemaskCoarsemaskHead(object):
  """ShapemaskCoarsemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build ShapeMask coarse mask prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch. When
        false, a single class-agnostic mask channel is predicted.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      norm_activation: an operation that includes a normalization layer followed
        by an optional activation layer. It is called with a `name` keyword to
        create one layer per stacked convolution.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._use_category_for_mask = use_category_for_mask
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self._norm_activation = norm_activation

    # Projects the single-channel detection prior to the feature width.
    self._coarse_mask_fc = tf_keras.layers.Dense(
        self._num_downsample_channels, name='coarse-mask-fc')

    self._class_conv = []
    self._class_norm_activation = []
    for i in range(self._num_convs):
      self._class_conv.append(
          tf_keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf_keras.initializers.RandomNormal(
                  stddev=0.01),
              padding='same',
              name='coarse-mask-class-%d' % i))
      self._class_norm_activation.append(
          norm_activation(name='coarse-mask-class-%d-bn' % i))

    self._class_predict = tf_keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='coarse-mask-class-predict')

  def __call__(self, features, detection_priors, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      detection_priors: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size]. This is the detection prior for the
        instance; the trailing channel axis is added inside this method.
      classes: a int Tensor of shape [batch_size, num_instances] of instance
        classes. Only read when `use_category_for_mask` is true.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_crop_size, mask_crop_size].
    """
    with tf.name_scope('coarse_mask'):
      # Transform detection priors to have the same dimension as features.
      detection_priors = tf.expand_dims(detection_priors, axis=-1)
      detection_priors = self._coarse_mask_fc(detection_priors)

      features += detection_priors
      mask_logits = self.decoder_net(features, is_training)
      # Gather the logits with right input class.
      if self._use_category_for_mask:
        # Move the class axis next to the instance axis so that `tf.gather`
        # with `batch_dims=2` selects one class map per instance.
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

      return mask_logits

  def decoder_net(self, features, is_training=False):
    """Coarse mask decoder network architecture.

    Args:
      features: A tensor of size [batch_size, num_instances, height, width,
        num_channels]. All dimensions except `batch_size` must be statically
        known.
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      mask_logits: A tensor of size [batch_size, num_instances, height, width,
        mask_num_classes], where mask_num_classes is 1 when class information
        is not used.
    """
    # Unpacking five values also asserts that `features` is rank 5.
    (_, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    # Fold the instance axis into the batch axis for the 2-D convolutions.
    # Using -1 lets TensorFlow infer batch_size * num_instances, so a batch
    # dimension that is unknown at graph-construction time is supported.
    features = tf.reshape(features, [-1, height, width, num_channels])
    for conv, norm_act in zip(self._class_conv, self._class_norm_activation):
      features = conv(features)
      features = norm_act(features, is_training=is_training)

    mask_logits = self._class_predict(features)
    # Restore the separate batch and instance axes.
    mask_logits = tf.reshape(
        mask_logits,
        [-1, num_instances, height, width, self._mask_num_classes])
    return mask_logits
class ShapemaskFinemaskHead(object):
  """ShapemaskFinemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               upsample_factor,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build ShapeMask fine mask prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch. When
        false, a single class-agnostic mask channel is predicted.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      upsample_factor: `int` number of fine mask upsampling factor.
      norm_activation: an operation that includes a batch normalization layer
        followed by a relu layer(optional). It is called with a `name` keyword
        to create one layer per stacked convolution.
    """
    self._use_category_for_mask = use_category_for_mask
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self.up_sample_factor = upsample_factor

    # Projects the single-channel coarse mask probability to the feature width.
    self._fine_mask_fc = tf_keras.layers.Dense(
        self._num_downsample_channels, name='fine-mask-fc')

    # Kernel size and stride both equal the upsampling factor. The layer is
    # only applied in `decoder_net` when the factor is greater than 1.
    self._upsample_conv = tf_keras.layers.Conv2DTranspose(
        self._num_downsample_channels,
        (self.up_sample_factor, self.up_sample_factor),
        (self.up_sample_factor, self.up_sample_factor),
        name='fine-mask-conv2d-tran')

    self._fine_class_conv = []
    self._fine_class_bn = []
    for i in range(self._num_convs):
      self._fine_class_conv.append(
          tf_keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf_keras.initializers.RandomNormal(
                  stddev=0.01),
              activation=None,
              padding='same',
              name='fine-mask-class-%d' % i))
      self._fine_class_bn.append(
          norm_activation(name='fine-mask-class-%d-bn' % i))

    self._class_predict_conv = tf_keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf_keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='fine-mask-class-predict')

  def __call__(self, features, mask_logits, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      mask_logits: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size] indicating predicted mask logits.
      classes: a int Tensor of shape [batch_size, num_instances] of instance
        classes. Only read when `use_category_for_mask` is true.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_crop_size * upsample_factor,
        mask_crop_size * upsample_factor].
    """
    # Extract the foreground mean features
    with tf.name_scope('fine_mask'):
      mask_probs = tf.nn.sigmoid(mask_logits)
      # Compute instance embedding for hard average.
      binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)
      instance_embedding = tf.reduce_sum(
          features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3))
      foreground_area = tf.expand_dims(
          tf.reduce_sum(binary_mask, axis=(2, 3)), axis=-1)
      # Safe mean: yields 0 for an empty foreground mask. Unlike adding a
      # tiny epsilon to the denominator, this does not underflow in
      # reduced-precision dtypes.
      instance_embedding = tf.math.divide_no_nan(
          instance_embedding, foreground_area)
      # Take the difference between crop features and mean instance features.
      features -= tf.expand_dims(
          tf.expand_dims(instance_embedding, axis=2), axis=2)

      features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1))

      # Decoder to generate upsampled segmentation mask.
      mask_logits = self.decoder_net(features, is_training)
      if self._use_category_for_mask:
        # Move the class axis next to the instance axis so that `tf.gather`
        # with `batch_dims=2` selects one class map per instance.
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

    return mask_logits

  def decoder_net(self, features, is_training=False):
    """Fine mask decoder network architecture.

    Args:
      features: A tensor of size [batch_size, num_instances, height, width,
        num_channels]. All dimensions except `batch_size` must be statically
        known.
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      mask_logits: A tensor of size [batch_size, num_instances,
        height * up_sample_factor, width * up_sample_factor,
        mask_num_classes], where mask_num_classes is 1 when class information
        is not used.
    """
    # Unpacking five values also asserts that `features` is rank 5.
    (_, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    # Fold the instance axis into the batch axis for the 2-D convolutions.
    # Using -1 lets TensorFlow infer batch_size * num_instances, so a batch
    # dimension that is unknown at graph-construction time is supported.
    features = tf.reshape(features, [-1, height, width, num_channels])
    for conv, norm_act in zip(self._fine_class_conv, self._fine_class_bn):
      features = conv(features)
      features = norm_act(features, is_training=is_training)

    if self.up_sample_factor > 1:
      features = self._upsample_conv(features)

    # Predict per-class instance masks.
    mask_logits = self._class_predict_conv(features)

    # Restore the separate batch and instance axes at the upsampled size.
    mask_logits = tf.reshape(mask_logits, [
        -1, num_instances, height * self.up_sample_factor,
        width * self.up_sample_factor, self._mask_num_classes
    ])
    return mask_logits