# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions for the post-activation form of Residual Networks. | |
Residual networks (ResNets) were proposed in: | |
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |
Deep Residual Learning for Image Recognition. arXiv:1512.03385 | |
""" | |
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf, tf_keras

from official.legacy.detection.modeling.architecture import nn_ops


# TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
class Resnet(object):
  """Class to build ResNet family model."""

  def __init__(
      self,
      resnet_depth,
      activation='relu',
      norm_activation=nn_ops.norm_activation_builder(activation='relu'),
      data_format='channels_last'):
    """ResNet initialization function.

    Args:
      resnet_depth: `int` depth of ResNet backbone model.
      activation: the activation function. Either 'relu' or 'swish'.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
      data_format: `str` either "channels_first" for `[batch, channels,
        height, width]` or "channels_last" for `[batch, height, width,
        channels]`.

    Raises:
      ValueError: if `activation` is not supported or `resnet_depth` is not
        one of the standard ResNet depths.
    """
    self._resnet_depth = resnet_depth

    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))

    self._norm_activation = norm_activation
    self._data_format = data_format

    # Maps each supported depth to its block type (basic residual vs.
    # bottleneck) and the number of blocks in each of the 4 block groups.
    model_params = {
        10: {
            'block': self.residual_block,
            'layers': [1, 1, 1, 1]
        },
        18: {
            'block': self.residual_block,
            'layers': [2, 2, 2, 2]
        },
        34: {
            'block': self.residual_block,
            'layers': [3, 4, 6, 3]
        },
        50: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 6, 3]
        },
        101: {
            'block': self.bottleneck_block,
            'layers': [3, 4, 23, 3]
        },
        152: {
            'block': self.bottleneck_block,
            'layers': [3, 8, 36, 3]
        },
        200: {
            'block': self.bottleneck_block,
            'layers': [3, 24, 36, 3]
        }
    }

    if resnet_depth not in model_params:
      valid_resnet_depths = ', '.join(
          [str(depth) for depth in sorted(model_params.keys())])
      # Format the offending depth into the message instead of passing it as
      # a second exception argument, so the error reads as one sentence.
      raise ValueError(
          'The resnet_depth should be in [%s]. Not a valid resnet_depth: %s' %
          (valid_resnet_depths, resnet_depth))

    params = model_params[resnet_depth]
    self._resnet_fn = self.resnet_v1_generator(params['block'],
                                               params['layers'])

  def __call__(self, inputs, is_training=None):
    """Returns the ResNet model for a given size and number of output classes.

    Args:
      inputs: a `Tensor` with shape [batch_size, height, width, 3]
        representing a batch of images.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      a `dict` containing `int` keys for continuous feature levels
      [2, 3, 4, 5]. The values are corresponding feature hierarchy in ResNet
      with shape [batch_size, height_l, width_l, num_filters].
    """
    with tf.name_scope('resnet%s' % self._resnet_depth):
      return self._resnet_fn(inputs, is_training)

  def fixed_padding(self, inputs, kernel_size):
    """Pads the input along the spatial dimensions independently of input size.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]` or `[batch,
        height, width, channels]` depending on `data_format`.
      kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
        operations. Should be a positive integer.

    Returns:
      A padded `Tensor` of the same `data_format` with size either intact
      (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
    """
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    if self._data_format == 'channels_first':
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
    else:
      padded_inputs = tf.pad(
          tensor=inputs,
          paddings=[[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])

    return padded_inputs

  def conv2d_fixed_padding(self, inputs, filters, kernel_size, strides):
    """Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.

    Returns:
      A `Tensor` of shape `[batch, filters, height_out, width_out]`.
    """
    if strides > 1:
      # Pad explicitly so output size depends only on kernel_size and stride,
      # then convolve with VALID padding.
      inputs = self.fixed_padding(inputs, kernel_size)

    return tf_keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=('SAME' if strides == 1 else 'VALID'),
        use_bias=False,
        kernel_initializer=tf.initializers.VarianceScaling(),
        data_format=self._data_format)(
            inputs=inputs)

  def residual_block(self,
                     inputs,
                     filters,
                     strides,
                     use_projection=False,
                     is_training=None):
    """Standard building block for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for both convolutions in the block.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut in first layer to match filters and strides.
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=1)
    # init_zero makes the block start as an identity mapping, which helps
    # training stability for deep networks.
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def bottleneck_block(self,
                       inputs,
                       filters,
                       strides,
                       use_projection=False,
                       is_training=None):
    """Bottleneck block variant for residual networks with BN after convolutions.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: `int` block stride. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
      # Projection shortcut only in first block within a group. Bottleneck
      # blocks end with 4 times the number of filters.
      filters_out = 4 * filters
      shortcut = self.conv2d_fixed_padding(
          inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
      shortcut = self._norm_activation(use_activation=False)(
          shortcut, is_training=is_training)

    # 1x1 reduce -> 3x3 (strided) -> 1x1 expand.
    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=1, strides=1)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides)
    inputs = self._norm_activation()(inputs, is_training=is_training)

    inputs = self.conv2d_fixed_padding(
        inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
    # init_zero makes the block start as an identity mapping, which helps
    # training stability for deep networks.
    inputs = self._norm_activation(
        use_activation=False, init_zero=True)(
            inputs, is_training=is_training)

    return self._activation_op(inputs + shortcut)

  def block_group(self, inputs, filters, block_fn, blocks, strides, name,
                  is_training):
    """Creates one group of blocks for the ResNet model.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for the first convolution of the layer.
      block_fn: `function` for the block to use within the model.
      blocks: `int` number of blocks contained in the layer.
      strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
      name: `str` name for the Tensor output of the block layer.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block layer.
    """
    # Only the first block per block_group uses projection shortcut and
    # strides; the rest are stride-1 identity-shortcut blocks.
    inputs = block_fn(
        inputs, filters, strides, use_projection=True, is_training=is_training)

    for _ in range(1, blocks):
      inputs = block_fn(inputs, filters, 1, is_training=is_training)

    return tf.identity(inputs, name)

  def resnet_v1_generator(self, block_fn, layers):
    """Generator for ResNet v1 models.

    Args:
      block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
      layers: list of 4 `int`s denoting the number of blocks to include in
        each of the 4 block groups. Each group consists of blocks that take
        inputs of the same resolution.

    Returns:
      Model `function` that takes in `inputs` and `is_training` and returns
      the output `Tensor` of the ResNet model.
    """

    def model(inputs, is_training=None):
      """Creation of the model graph."""
      # Stem: 7x7 stride-2 conv followed by 3x3 stride-2 max pool, giving a
      # total downsample factor of 4 before the block groups.
      inputs = self.conv2d_fixed_padding(
          inputs=inputs, filters=64, kernel_size=7, strides=2)
      inputs = tf.identity(inputs, 'initial_conv')
      inputs = self._norm_activation()(inputs, is_training=is_training)

      inputs = tf_keras.layers.MaxPool2D(
          pool_size=3, strides=2, padding='SAME',
          data_format=self._data_format)(
              inputs)
      inputs = tf.identity(inputs, 'initial_max_pool')

      c2 = self.block_group(
          inputs=inputs,
          filters=64,
          block_fn=block_fn,
          blocks=layers[0],
          strides=1,
          name='block_group1',
          is_training=is_training)
      c3 = self.block_group(
          inputs=c2,
          filters=128,
          block_fn=block_fn,
          blocks=layers[1],
          strides=2,
          name='block_group2',
          is_training=is_training)
      c4 = self.block_group(
          inputs=c3,
          filters=256,
          block_fn=block_fn,
          blocks=layers[2],
          strides=2,
          name='block_group3',
          is_training=is_training)
      c5 = self.block_group(
          inputs=c4,
          filters=512,
          block_fn=block_fn,
          blocks=layers[3],
          strides=2,
          name='block_group4',
          is_training=is_training)
      # Feature pyramid levels 2..5, each at half the resolution of the
      # previous one.
      return {2: c2, 3: c3, 4: c4, 5: c5}

    return model