"""Functions to manipulate feature map pyramids, such as for FPNs and BiFPNs. |
|
|
|
Includes utility functions to facilitate feature pyramid map manipulations, |
|
such as combining multiple feature maps, upsampling or downsampling feature |
|
maps, and applying blocks of convolution, batchnorm, and activation layers. |
|
""" |
|
from six.moves import range |
|
import tensorflow as tf |
|
from object_detection.utils import ops |
|
from object_detection.utils import shape_utils |
|
|
|
|
|
def create_conv_block(name, num_filters, kernel_size, strides, padding, |
|
use_separable, apply_batchnorm, apply_activation, |
|
conv_hyperparams, is_training, freeze_batchnorm): |
|
"""Create Keras layers for regular or separable convolutions. |
|
|
|
Args: |
|
name: String. The name of the layer. |
|
num_filters: Number of filters (channels) for the output feature maps. |
|
kernel_size: A list of length 2: [kernel_height, kernel_width] of the |
|
filters, or a single int if both values are the same. |
|
strides: A list of length 2: [stride_height, stride_width], specifying the |
|
convolution stride, or a single int if both strides are the same. |
|
padding: One of 'VALID' or 'SAME'. |
|
use_separable: Bool. Whether to use depthwise separable convolution instead |
|
of regular convolution. |
|
apply_batchnorm: Bool. Whether to apply a batch normalization layer after |
|
convolution, constructed according to the conv_hyperparams. |
|
apply_activation: Bool. Whether to apply an activation layer after |
|
convolution, constructed according to the conv_hyperparams. |
|
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object |
|
containing hyperparameters for convolution ops. |
|
is_training: Bool. Whether the feature generator is in training mode. |
|
freeze_batchnorm: Bool. Whether to freeze batch norm parameters during |
|
training or not. When training with a small batch size (e.g. 1), it is |
|
desirable to freeze batch norm update and use pretrained batch norm |
|
params. |
|
|
|
Returns: |
|
A list of keras layers, including (regular or seperable) convolution, and |
|
optionally batch normalization and activation layers. |
|
""" |
|
layers = [] |
|
if use_separable: |
|
kwargs = conv_hyperparams.params() |
|
|
|
|
|
kwargs['depthwise_regularizer'] = kwargs['kernel_regularizer'] |
|
kwargs['depthwise_initializer'] = kwargs['kernel_initializer'] |
|
|
|
|
|
kwargs['pointwise_regularizer'] = kwargs['kernel_regularizer'] |
|
kwargs['pointwise_initializer'] = kwargs['kernel_initializer'] |
|
layers.append( |
|
tf.keras.layers.SeparableConv2D( |
|
filters=num_filters, |
|
kernel_size=kernel_size, |
|
depth_multiplier=1, |
|
padding=padding, |
|
strides=strides, |
|
name=name + '_separable_conv', |
|
**kwargs)) |
|
else: |
|
layers.append( |
|
tf.keras.layers.Conv2D( |
|
filters=num_filters, |
|
kernel_size=kernel_size, |
|
padding=padding, |
|
strides=strides, |
|
name=name + '_conv', |
|
**conv_hyperparams.params())) |
|
|
|
if apply_batchnorm: |
|
layers.append( |
|
conv_hyperparams.build_batch_norm( |
|
training=(is_training and not freeze_batchnorm), |
|
name=name + '_batchnorm')) |
|
|
|
if apply_activation: |
|
layers.append( |
|
conv_hyperparams.build_activation_layer(name=name + '_activation')) |
|
|
|
return layers |
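

# Illustrative usage sketch (not part of the original module): the layers
# returned by create_conv_block are meant to be applied in sequence. The
# `conv_hyperparams` argument is assumed to be a
# `hyperparams_builder.KerasLayerHyperparams` instance, as described in the
# docstring above; the parameter values here are arbitrary.
def _example_apply_conv_block(feature_map, conv_hyperparams):
  block = create_conv_block(
      name='example_block', num_filters=64, kernel_size=3, strides=1,
      padding='SAME', use_separable=False, apply_batchnorm=True,
      apply_activation=True, conv_hyperparams=conv_hyperparams,
      is_training=True, freeze_batchnorm=False)
  for layer in block:
    feature_map = layer(feature_map)
  return feature_map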


def create_downsample_feature_map_ops(scale, downsample_method,
                                      conv_hyperparams, is_training,
                                      freeze_batchnorm, name):
  """Creates Keras layers for downsampling feature maps.

  Args:
    scale: Int. The scale factor by which to downsample input feature maps.
      For example, in the case of a typical feature map pyramid, the scale
      factor between level_i and level_i+1 is 2.
    downsample_method: String. The method used for downsampling. Currently
      supported methods include 'max_pooling', 'avg_pooling', and
      'depthwise_conv'.
    conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
      containing hyperparameters for convolution ops.
    is_training: Bool. Whether the feature generator is in training mode.
    freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
      training or not. When training with a small batch size (e.g. 1), it is
      desirable to freeze batch norm update and use pretrained batch norm
      params.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will downsample input feature maps by the
    desired scale factor.
  """
  layers = []
  padding = 'SAME'
  stride = int(scale)
  kernel_size = stride + 1
  if downsample_method == 'max_pooling':
    layers.append(
        tf.keras.layers.MaxPooling2D(
            pool_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + '_downsample_max_x{}'.format(stride)))
  elif downsample_method == 'avg_pooling':
    layers.append(
        tf.keras.layers.AveragePooling2D(
            pool_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + '_downsample_avg_x{}'.format(stride)))
  elif downsample_method == 'depthwise_conv':
    layers.append(
        tf.keras.layers.DepthwiseConv2D(
            kernel_size=kernel_size,
            strides=stride,
            padding=padding,
            name=name + '_downsample_depthwise_x{}'.format(stride)))
    layers.append(
        conv_hyperparams.build_batch_norm(
            training=(is_training and not freeze_batchnorm),
            name=name + '_downsample_batchnorm'))
    layers.append(
        conv_hyperparams.build_activation_layer(name=name +
                                                '_downsample_activation'))
  else:
    raise ValueError('Unknown downsample method: {}'.format(downsample_method))

  return layers
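

# Illustrative sketch (not part of the original module): with 'SAME' padding,
# the pooled output has size ceil(input_size / scale) in each spatial
# dimension, so a scale of 2 maps a 64x64 feature map to 32x32. The kernel
# size of stride + 1 yields overlapping pooling windows.
def _example_downsample_2x(feature_map, conv_hyperparams, is_training):
  layers = create_downsample_feature_map_ops(
      scale=2, downsample_method='max_pooling',
      conv_hyperparams=conv_hyperparams, is_training=is_training,
      freeze_batchnorm=False, name='example')
  for layer in layers:
    feature_map = layer(feature_map)
  return feature_map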


def create_upsample_feature_map_ops(scale, use_native_resize_op, name):
  """Creates Keras layers for upsampling feature maps.

  Args:
    scale: Int. The scale factor by which to upsample input feature maps. For
      example, in the case of a typical feature map pyramid, the scale factor
      between level_i and level_i-1 is 2.
    use_native_resize_op: If True, uses the tf.image.resize_nearest_neighbor
      op for upsampling instead of the reshape-and-broadcast implementation.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will upsample input feature maps by the
    desired scale factor.
  """
  layers = []
  if use_native_resize_op:

    def resize_nearest_neighbor(image):
      image_shape = shape_utils.combined_static_and_dynamic_shape(image)
      return tf.image.resize_nearest_neighbor(
          image, [image_shape[1] * scale, image_shape[2] * scale])

    layers.append(
        tf.keras.layers.Lambda(
            resize_nearest_neighbor,
            name=name + 'nearest_neighbor_upsampling_x{}'.format(scale)))
  else:

    def nearest_neighbor_upsampling(image):
      return ops.nearest_neighbor_upsampling(image, scale=scale)

    layers.append(
        tf.keras.layers.Lambda(
            nearest_neighbor_upsampling,
            name=name + 'nearest_neighbor_upsampling_x{}'.format(scale)))

  return layers
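

# Simplified sketch (not part of the original module) of the
# reshape-and-broadcast nearest-neighbor upsampling that
# ops.nearest_neighbor_upsampling implements: each pixel is tiled into a
# scale x scale block by broadcasting against a ones tensor and reshaping.
def _example_reshape_broadcast_upsample(image, scale):
  shape = shape_utils.combined_static_and_dynamic_shape(image)
  batch, height, width, channels = shape[0], shape[1], shape[2], shape[3]
  # [b, h, w, c] -> [b, h, 1, w, 1, c], broadcast to [b, h, s, w, s, c].
  expanded = tf.reshape(image, [batch, height, 1, width, 1, channels])
  tiled = expanded * tf.ones([1, 1, scale, 1, scale, 1], dtype=image.dtype)
  # Collapse the tiling axes back into the spatial dimensions.
  return tf.reshape(tiled, [batch, height * scale, width * scale, channels])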


def create_resample_feature_map_ops(input_scale_factor, output_scale_factor,
                                    downsample_method, use_native_resize_op,
                                    conv_hyperparams, is_training,
                                    freeze_batchnorm, name):
  """Creates Keras layers for downsampling or upsampling feature maps.

  Args:
    input_scale_factor: Int. Scale factor of the input feature map. For
      example, for a feature pyramid where each successive level halves its
      spatial resolution, the scale factor of a level is 2^level. The input
      and output scale factors are used to compute the scale for upsampling
      or downsampling, so one should be evenly divisible by the other.
    output_scale_factor: Int. Scale factor of the output feature map. See
      input_scale_factor for additional details.
    downsample_method: String. The method used for downsampling. See
      create_downsample_feature_map_ops for details on supported methods.
    use_native_resize_op: If True, uses the tf.image.resize_nearest_neighbor
      op for upsampling instead of the reshape-and-broadcast implementation.
      See create_upsample_feature_map_ops for details.
    conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
      containing hyperparameters for convolution ops.
    is_training: Bool. Whether the feature generator is in training mode.
    freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
      training or not. When training with a small batch size (e.g. 1), it is
      desirable to freeze batch norm update and use pretrained batch norm
      params.
    name: String. The name used to prefix the constructed layers.

  Returns:
    A list of Keras layers which will downsample or upsample input feature
    maps to match the desired output feature map scale.
  """
  if input_scale_factor < output_scale_factor:
    if output_scale_factor % input_scale_factor != 0:
      raise ValueError('Invalid scale factor: input scale 1/{} not divisible '
                       'by output scale 1/{}'.format(input_scale_factor,
                                                     output_scale_factor))
    scale = output_scale_factor // input_scale_factor
    return create_downsample_feature_map_ops(scale, downsample_method,
                                             conv_hyperparams, is_training,
                                             freeze_batchnorm, name)
  elif input_scale_factor > output_scale_factor:
    if input_scale_factor % output_scale_factor != 0:
      raise ValueError('Invalid scale factor: input scale 1/{} not a divisor '
                       'of output scale 1/{}'.format(input_scale_factor,
                                                     output_scale_factor))
    scale = input_scale_factor // output_scale_factor
    return create_upsample_feature_map_ops(scale, use_native_resize_op, name)
  else:
    return []
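

# Illustrative sketch (not part of the original module): in a standard
# feature pyramid the scale factor of level i is 2**i, so resampling from
# level 3 (scale 8) to level 5 (scale 32) downsamples by 32 // 8 = 4, and
# the reverse direction upsamples by the same factor.
def _example_resample_between_levels(input_level, output_level,
                                     conv_hyperparams, is_training):
  return create_resample_feature_map_ops(
      input_scale_factor=2**input_level,
      output_scale_factor=2**output_level,
      downsample_method='max_pooling',
      use_native_resize_op=False,
      conv_hyperparams=conv_hyperparams,
      is_training=is_training,
      freeze_batchnorm=False,
      name='example_level_{}_to_{}'.format(input_level, output_level))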


class BiFPNCombineLayer(tf.keras.layers.Layer):
  """Combines multiple input feature maps into a single output feature map.

  A Keras layer which combines multiple input feature maps into a single
  output feature map, according to the desired combination method. Options
  for combining feature maps include simple summation, or several types of
  weighted sums using learned weights for each input feature map. These
  include 'weighted_sum', 'attention', and 'fast_attention'. For more
  details, see the EfficientDet paper by Tan et al.
  (arxiv.org/abs/1911.09070).

  Specifically, this layer takes a list of tensors as input, all of the same
  shape, and returns a single tensor, also of the same shape.
  """

  def __init__(self, combine_method, **kwargs):
    """Constructor.

    Args:
      combine_method: String. The method used to combine the input feature
        maps into a single output feature map. One of 'sum', 'weighted_sum',
        'attention', or 'fast_attention'.
      **kwargs: Additional Keras layer arguments.
    """
    super(BiFPNCombineLayer, self).__init__(**kwargs)
    self.combine_method = combine_method

  def _combine_weighted_sum(self, inputs):
    # Stack the n inputs along a new trailing axis to form a
    # [batch, height, width, depth, n] tensor, matmul with the [n, 1]
    # per-input weights, then squeeze back to the input shape.
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), self.per_input_weights),
        axis=[-1])

  def _combine_attention(self, inputs):
    # Softmax-normalize the per-input weights before the weighted sum.
    normalized_weights = tf.nn.softmax(self.per_input_weights)
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), normalized_weights),
        axis=[-1])

  def _combine_fast_attention(self, inputs):
    # Fast normalized fusion from EfficientDet:
    # w_i' = relu(w_i) / (sum_j relu(w_j) + eps), which avoids the cost of
    # a softmax while keeping the normalized weights in [0, 1].
    weights_non_neg = tf.nn.relu(self.per_input_weights)
    normalizer = tf.reduce_sum(weights_non_neg) + 0.0001
    normalized_weights = weights_non_neg / normalizer
    return tf.squeeze(
        tf.linalg.matmul(tf.stack(inputs, axis=-1), normalized_weights),
        axis=[-1])

  def build(self, input_shape):
    if not isinstance(input_shape, list):
      raise ValueError('A BiFPN combine layer should be called '
                       'on a list of inputs.')
    if len(input_shape) < 2:
      raise ValueError('A BiFPN combine layer should be called '
                       'on a list of at least 2 inputs. '
                       'Got ' + str(len(input_shape)) + ' inputs.')
    if self.combine_method == 'sum':
      self._combine_op = tf.keras.layers.Add()
    elif self.combine_method == 'weighted_sum':
      self._combine_op = self._combine_weighted_sum
    elif self.combine_method == 'attention':
      self._combine_op = self._combine_attention
    elif self.combine_method == 'fast_attention':
      self._combine_op = self._combine_fast_attention
    else:
      raise ValueError('Unknown combine type: {}'.format(self.combine_method))
    if self.combine_method in {'weighted_sum', 'attention', 'fast_attention'}:
      self.per_input_weights = self.add_weight(
          name='bifpn_combine_weights',
          shape=(len(input_shape), 1),
          initializer='ones',
          trainable=True)
    super(BiFPNCombineLayer, self).build(input_shape)

  def call(self, inputs):
    """Combines multiple input feature maps into a single output feature map.

    Executed when calling the `.__call__` method on input.

    Args:
      inputs: A list of tensors where all tensors have the same shape,
        [batch, height_i, width_i, depth_i].

    Returns:
      A single tensor, with the same shape as the input tensors,
      [batch, height_i, width_i, depth_i].
    """
    return self._combine_op(inputs)

  def compute_output_shape(self, input_shape):
    output_shape = input_shape[0]
    for i in range(1, len(input_shape)):
      if input_shape[i] != output_shape:
        raise ValueError(
            'Inputs could not be combined. Shapes should match, '
            'but input_shape[0] is {} while input_shape[{}] is {}'.format(
                output_shape, i, input_shape[i]))
    return output_shape
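

# Illustrative usage sketch (not part of the original module): combining two
# same-shaped feature maps with learned fast-attention weights, as used in
# the EfficientDet BiFPN.
def _example_combine_feature_maps(feature_map_a, feature_map_b):
  combine_layer = BiFPNCombineLayer(
      combine_method='fast_attention', name='example_combine')
  return combine_layer([feature_map_a, feature_map_b])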