# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains definitions for the post-activation form of Residual Networks.

Residual networks (ResNets) were proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf, tf_keras

from official.legacy.detection.modeling.architecture import nn_ops


# TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
class Resnet(object):
  """Class to build ResNet family model.

  An instance is callable: `Resnet(depth)(images, is_training)` builds the
  backbone and returns the outputs of the four block groups keyed by feature
  level (2 through 5).
  """

  def __init__(
      self,
      resnet_depth,
      activation='relu',
      norm_activation=nn_ops.norm_activation_builder(activation='relu'),
      data_format='channels_last'):
    """ResNet initialization function.

    Args:
      resnet_depth: `int` depth of ResNet backbone model. Must be one of
        10, 18, 34, 50, 101, 152 or 200.
      activation: `str` name of the activation applied after the residual
        addition at the end of every block; either 'relu' or 'swish'.
      norm_activation: a builder callable. It is invoked (optionally with
        `use_activation` / `init_zero` keyword arguments) to obtain an
        operation that applies a normalization layer followed by an optional
        activation layer; that operation is then called as
        `op(inputs, is_training=...)`.
      data_format: `str` either "channels_first" for `[batch, channels,
        height, width]` or "channels_last" for `[batch, height, width,
        channels]`.

    Raises:
      ValueError: if `activation` is not supported or `resnet_depth` is not
        one of the supported depths.
    """
    self._resnet_depth = resnet_depth
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._norm_activation = norm_activation
    self._data_format = data_format

    # Depth -> (block type, number of blocks in each of the 4 block groups).
    model_params = {
        10: {
            'block': self.residual_block,
            'layers': [1, 1, 1, 1]
        },
        18: {
            'block': self.residual_block,
            'layers': [2, 2, 2, 2]
        },
        34: {
            'block': self.residual_block,
            'layers': [3, 4, 6, 3]
        },
        50: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 6, 3]
        },
        101: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 23, 3]
        },
        152: {
            'block': self.bottleneck_block,
            'layers': [3, 8, 36, 3]
        },
        200: {
            'block': self.bottleneck_block,
            'layers': [3, 24, 36, 3]
        }
    }

    if resnet_depth not in model_params:
      valid_resnet_depths = ', '.join(
          str(depth) for depth in sorted(model_params))
      # Format the offending depth into the message itself. Passing it as a
      # second positional argument to ValueError would make the exception
      # render as a tuple instead of a readable sentence.
      raise ValueError(
          'The resnet_depth should be in [%s]. Not a valid resnet_depth: %s' %
          (valid_resnet_depths, resnet_depth))
    params = model_params[resnet_depth]
    self._resnet_fn = self.resnet_v1_generator(params['block'],
                                               params['layers'])

  def __call__(self, inputs, is_training=None):
    """Builds the ResNet backbone on `inputs` and returns its feature maps.

    Args:
      inputs: a `Tensor` with shape [batch_size, height, width, 3]
        representing a batch of images (layout shown for "channels_last").
      is_training: `bool` if True, the model is in training mode.

    Returns:
      a `dict` containing `int` keys for continuous feature levels
      [2, 3, 4, 5]. The values are corresponding feature hierarchy in ResNet
      with shape [batch_size, height_l, width_l, num_filters] (layout shown
      for "channels_last").
    """
    with tf.name_scope('resnet%s' % self._resnet_depth):
      return self._resnet_fn(inputs, is_training)

  def fixed_padding(self, inputs, kernel_size):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
      kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
        operations. Should be a positive integer.

    Returns:
      A padded `Tensor` of the same `data_format` with size either intact
      (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
    """
    pad_total = kernel_size - 1
    # For even kernel sizes the extra pixel of padding goes at the end.
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if self._data_format == 'channels_first':
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])

    return padded_inputs

  def conv2d_fixed_padding(self, inputs, filters, kernel_size, strides):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]` or
        `[batch, height_in, width_in, channels]` depending on `data_format`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.

    Returns:
      The convolved `Tensor`, in the same `data_format` as `inputs`, with
      `filters` channels.
    """
    # For strided convolutions pad explicitly and then convolve with 'VALID'
    # so the padding does not depend on the input size; unit-stride
    # convolutions rely on the layer's own 'SAME' padding.
    if strides > 1:
      inputs = self.fixed_padding(inputs, kernel_size)

    return tf_keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=('SAME' if strides == 1 else 'VALID'),
        use_bias=False,
        kernel_initializer=tf.initializers.VarianceScaling(),
        data_format=self._data_format)(
            inputs=inputs)

  def residual_block(self,
                     inputs,
                     filters,
                     strides,
                     use_projection=False,
                     is_training=None):
    """Standard building block for residual networks with BN after convolutions.

    The block applies two 3x3 convolutions, both with `filters` output
    channels, and adds the (optionally projected) input back before the final
    activation.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
      filters: `int` number of filters for both convolutions of the block.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut in first layer to match filters and strides
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=1)
    # NOTE(review): `init_zero=True` presumably zero-initializes the final
    # normalization scale so the residual branch starts near zero -- confirm
    # against nn_ops.
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def bottleneck_block(self,
                       inputs,
                       filters,
                       strides,
                       use_projection=False,
                       is_training=None):
    """Bottleneck block variant for residual networks with BN after convolutions.

    The block applies a 1x1, a (possibly strided) 3x3 and a final 1x1
    convolution, then adds the (optionally projected) input back before the
    final activation.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
      filters: `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut only in first block within a group. Bottleneck
      # blocks end with 4 times the number of filters.
      filters_out = 4 * filters
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=1, strides=1)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
    # NOTE(review): `init_zero=True` presumably zero-initializes the final
    # normalization scale so the residual branch starts near zero -- confirm
    # against nn_ops.
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def block_group(self, inputs, filters, block_fn, blocks, strides, name,
                  is_training):
    """Creates one group of blocks for the ResNet model.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
      filters: `int` number of filters for the first convolution of the layer.
      block_fn: `function` for the block to use within the model.
      blocks: `int` number of blocks contained in the layer.
      strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
      name: `str` name for the Tensor output of the block layer.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block layer.
    """
    # Only the first block per block_group uses projection shortcut and strides.
    inputs = block_fn(
        inputs, filters, strides, use_projection=True, is_training=is_training)

    for _ in range(1, blocks):
      inputs = block_fn(inputs, filters, 1, is_training=is_training)

    return tf.identity(inputs, name)

  def resnet_v1_generator(self, block_fn, layers):
    """Generator for ResNet v1 models.

    Args:
      block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
      layers: list of 4 `int`s denoting the number of blocks to include in
        each of the 4 block groups. Each group consists of blocks that take
        inputs of the same resolution.

    Returns:
      Model `function` that takes in `inputs` and `is_training` and returns a
      `dict` mapping the feature levels 2, 3, 4 and 5 to the output `Tensor`
      of the corresponding block group.
    """

    def model(inputs, is_training=None):
      """Creation of the model graph."""
      # Stem: strided 7x7 convolution, normalization/activation and a strided
      # 3x3 max pool.
      inputs = self.conv2d_fixed_padding(
          inputs=inputs, filters=64, kernel_size=7, strides=2)
      inputs = tf.identity(inputs, 'initial_conv')
      inputs = self._norm_activation()(inputs, is_training=is_training)

      inputs = tf_keras.layers.MaxPool2D(
          pool_size=3,
          strides=2,
          padding='SAME',
          data_format=self._data_format)(
              inputs)
      inputs = tf.identity(inputs, 'initial_max_pool')

      # Four block groups; every group after the first is built with stride 2
      # and twice the filters of the previous one.
      c2 = self.block_group(
          inputs=inputs,
          filters=64,
          block_fn=block_fn,
          blocks=layers[0],
          strides=1,
          name='block_group1',
          is_training=is_training)
      c3 = self.block_group(
          inputs=c2,
          filters=128,
          block_fn=block_fn,
          blocks=layers[1],
          strides=2,
          name='block_group2',
          is_training=is_training)
      c4 = self.block_group(
          inputs=c3,
          filters=256,
          block_fn=block_fn,
          blocks=layers[2],
          strides=2,
          name='block_group3',
          is_training=is_training)
      c5 = self.block_group(
          inputs=c4,
          filters=512,
          block_fn=block_fn,
          blocks=layers[3],
          strides=2,
          name='block_group4',
          is_training=is_training)
      return {2: c2, 3: c3, 4: c4, 5: c5}

    return model