|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Contains definitions for the post-activation form of Residual Networks. |
|
|
|
Residual networks (ResNets) were proposed in: |
|
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun |
|
Deep Residual Learning for Image Recognition. arXiv:1512.03385 |
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
from absl import logging |
|
import tensorflow as tf |
|
from tensorflow.python.keras import backend |
|
from official.vision.detection.modeling.architecture import nn_ops |
|
|
|
|
|
class Resnet(object):
  """Class to build the post-activation (v1) ResNet family of backbones."""

  def __init__(self,
               resnet_depth,
               activation='relu',
               norm_activation=nn_ops.norm_activation_builder(
                   activation='relu'),
               data_format='channels_last'):
    """ResNet initialization function.

    Args:
      resnet_depth: `int` depth of ResNet backbone model. Must be one of
        10, 18, 34 (basic residual blocks) or 50, 101, 152, 200
        (bottleneck blocks).
      activation: `str` name of the activation applied after each residual
        addition. Supported values are 'relu' and 'swish'.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
      data_format: `str` either "channels_first" for `[batch, channels,
        height, width]` or "channels_last" for `[batch, height, width,
        channels]`.

    Raises:
      ValueError: if `activation` or `resnet_depth` is not supported.
    """
    self._resnet_depth = resnet_depth
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._norm_activation = norm_activation
    self._data_format = data_format

    # Block function and per-group block counts for each supported depth.
    model_params = {
        10: {'block': self.residual_block, 'layers': [1, 1, 1, 1]},
        18: {'block': self.residual_block, 'layers': [2, 2, 2, 2]},
        34: {'block': self.residual_block, 'layers': [3, 4, 6, 3]},
        50: {'block': self.bottleneck_block, 'layers': [3, 4, 6, 3]},
        101: {'block': self.bottleneck_block, 'layers': [3, 4, 23, 3]},
        152: {'block': self.bottleneck_block, 'layers': [3, 8, 36, 3]},
        200: {'block': self.bottleneck_block, 'layers': [3, 24, 36, 3]}
    }

    if resnet_depth not in model_params:
      valid_resnet_depths = ', '.join(
          [str(depth) for depth in sorted(model_params.keys())])
      raise ValueError(
          'The resnet_depth should be in [%s]. Not a valid resnet_depth: %s.'
          % (valid_resnet_depths, resnet_depth))
    params = model_params[resnet_depth]
    self._resnet_fn = self.resnet_v1_generator(
        params['block'], params['layers'])

  def __call__(self, inputs, is_training=None):
    """Returns the ResNet model for a given size and number of output classes.

    Args:
      inputs: a `Tensor` with shape [batch_size, height, width, 3] representing
        a batch of images.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      a `dict` containing `int` keys for continuous feature levels [2, 3, 4, 5].
      The values are corresponding feature hierarchy in ResNet with shape
      [batch_size, height_l, width_l, num_filters].
    """
    with backend.get_graph().as_default():
      with tf.name_scope('resnet%s' % self._resnet_depth):
        return self._resnet_fn(inputs, is_training)

  def fixed_padding(self, inputs, kernel_size):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
      kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
        operations. Should be a positive integer.

    Returns:
      A padded `Tensor` of the same `data_format` with size either intact
      (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
    """
    pad_total = kernel_size - 1
    # Split the total padding so the extra pixel (for even totals) lands at
    # the end, matching the reference ResNet implementation.
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if self._data_format == 'channels_first':
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])

    return padded_inputs

  def conv2d_fixed_padding(self, inputs, filters, kernel_size, strides):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.

    Returns:
      A `Tensor` of shape `[batch, filters, height_out, width_out]`.
    """
    if strides > 1:
      # Pad explicitly so the output size depends only on kernel_size, then
      # use VALID padding in the convolution itself.
      inputs = self.fixed_padding(inputs, kernel_size)

    return tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=('SAME' if strides == 1 else 'VALID'),
        use_bias=False,  # BN immediately follows, so a bias is redundant.
        kernel_initializer=tf.initializers.VarianceScaling(),
        data_format=self._data_format)(
            inputs=inputs)

  def residual_block(self,
                     inputs,
                     filters,
                     strides,
                     use_projection=False,
                     is_training=None):
    """Standard building block for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for both convolutions in the block.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut in the first block of a group, to match the
      # output filter count and resolution of the main path.
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=1)
    # init_zero makes the block initially behave like an identity mapping,
    # which helps training stability.
    inputs = self._norm_activation(use_activation=False, init_zero=True)(
        inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def bottleneck_block(self,
                       inputs,
                       filters,
                       strides,
                       use_projection=False,
                       is_training=None):
    """Bottleneck block variant for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut only in the first block within a group. The
      # bottleneck's main path ends with 4x the number of filters, so the
      # shortcut must be projected to match.
      filters_out = 4 * filters
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    # 1x1 reduce -> 3x3 (strided) -> 1x1 expand.
    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=1, strides=1)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
    # init_zero makes the block initially behave like an identity mapping,
    # which helps training stability.
    inputs = self._norm_activation(use_activation=False, init_zero=True)(
        inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def block_group(self, inputs, filters, block_fn, blocks, strides, name,
                  is_training):
    """Creates one group of blocks for the ResNet model.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first convolution of the layer.
      block_fn: `function` for the block to use within the model
      blocks: `int` number of blocks contained in the layer.
      strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
      name: `str` name for the Tensor output of the block layer.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block layer.
    """
    # Only the first block per block_group uses projection shortcut and
    # strides; the rest preserve shape.
    inputs = block_fn(inputs, filters, strides, use_projection=True,
                      is_training=is_training)

    for _ in range(1, blocks):
      inputs = block_fn(inputs, filters, 1, is_training=is_training)

    return tf.identity(inputs, name)

  def resnet_v1_generator(self, block_fn, layers):
    """Generator for ResNet v1 models.

    Args:
      block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
      layers: list of 4 `int`s denoting the number of blocks to include in each
        of the 4 block groups. Each group consists of blocks that take inputs
        of the same resolution.

    Returns:
      Model `function` that takes in `inputs` and `is_training` and returns the
      output `Tensor` of the ResNet model.
    """

    def model(inputs, is_training=None):
      """Creation of the model graph."""
      # Stem: 7x7/2 conv followed by 3x3/2 max-pool, giving a 4x reduction.
      inputs = self.conv2d_fixed_padding(
          inputs=inputs, filters=64, kernel_size=7, strides=2)
      inputs = tf.identity(inputs, 'initial_conv')
      inputs = self._norm_activation()(inputs, is_training=is_training)

      inputs = tf.keras.layers.MaxPool2D(
          pool_size=3, strides=2, padding='SAME',
          data_format=self._data_format)(
              inputs)
      inputs = tf.identity(inputs, 'initial_max_pool')

      # Feature levels c2..c5 at strides 4, 8, 16, 32 relative to the input.
      c2 = self.block_group(
          inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
          strides=1, name='block_group1', is_training=is_training)
      c3 = self.block_group(
          inputs=c2, filters=128, block_fn=block_fn, blocks=layers[1],
          strides=2, name='block_group2', is_training=is_training)
      c4 = self.block_group(
          inputs=c3, filters=256, block_fn=block_fn, blocks=layers[2],
          strides=2, name='block_group3', is_training=is_training)
      c5 = self.block_group(
          inputs=c4, filters=512, block_fn=block_fn, blocks=layers[3],
          strides=2, name='block_group4', is_training=is_training)
      return {2: c2, 3: c3, 4: c4, 5: c5}

    return model
|
|