# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Augmentation policies for enhanced image/video preprocessing. AutoAugment Reference: - AutoAugment Reference: https://arxiv.org/abs/1805.09501 - AutoAugment for Object Detection Reference: https://arxiv.org/abs/1906.11172 RandAugment Reference: https://arxiv.org/abs/1909.13719 RandomErasing Reference: https://arxiv.org/abs/1708.04896 MixupAndCutmix: - Mixup: https://arxiv.org/abs/1710.09412 - Cutmix: https://arxiv.org/abs/1905.04899 RandomErasing, Mixup and Cutmix are inspired by https://github.com/rwightman/pytorch-image-models """ import inspect import math from typing import Any, List, Iterable, Optional, Tuple, Union import numpy as np import tensorflow as tf, tf_keras # This signifies the max integer that the controller RNN could predict for the # augmentation scheme. _MAX_LEVEL = 10. def to_4d(image: tf.Tensor) -> tf.Tensor: """Converts an input Tensor to 4 dimensions. 4D image => [N, H, W, C] or [N, C, H, W] 3D image => [1, H, W, C] or [1, C, H, W] 2D image => [1, H, W, 1] Args: image: The 2/3/4D input tensor. Returns: A 4D image tensor. Raises: `TypeError` if `image` is not a 2/3/4D tensor. """ shape = tf.shape(image) original_rank = tf.rank(image) left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32) right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32) new_shape = tf.concat( [ tf.ones(shape=left_pad, dtype=tf.int32), shape, tf.ones(shape=right_pad, dtype=tf.int32), ], axis=0, ) return tf.reshape(image, new_shape) def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor: """Converts a 4D image back to `ndims` rank.""" shape = tf.shape(image) begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32) end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32) new_shape = shape[begin:end] return tf.reshape(image, new_shape) def _pad( image: tf.Tensor, filter_shape: Union[List[int], Tuple[int, ...]], mode: str = 'CONSTANT', constant_values: Union[int, tf.Tensor] = 0, ) -> tf.Tensor: """Explicitly pads a 4-D image. Equivalent to the implicit padding method offered in `tf.nn.conv2d` and `tf.nn.depthwise_conv2d`, but supports non-zero, reflect and symmetric padding mode. For the even-sized filter, it pads one more value to the right or the bottom side. Args: image: A 4-D `Tensor` of shape `[batch_size, height, width, channels]`. filter_shape: A `tuple`/`list` of 2 integers, specifying the height and width of the 2-D filter. mode: A `string`, one of "REFLECT", "CONSTANT", or "SYMMETRIC". The type of padding algorithm to use, which is compatible with `mode` argument in `tf.pad`. For more details, please refer to https://www.tensorflow.org/api_docs/python/tf/pad. constant_values: A `scalar`, the pad value to use in "CONSTANT" padding mode. Returns: A padded image. """ if mode.upper() not in {'REFLECT', 'CONSTANT', 'SYMMETRIC'}: raise ValueError( 'padding should be one of "REFLECT", "CONSTANT", or "SYMMETRIC".' ) constant_values = tf.convert_to_tensor(constant_values, image.dtype) filter_height, filter_width = filter_shape pad_top = (filter_height - 1) // 2 pad_bottom = filter_height - 1 - pad_top pad_left = (filter_width - 1) // 2 pad_right = filter_width - 1 - pad_left paddings = [[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]] return tf.pad(image, paddings, mode=mode, constant_values=constant_values) def _get_gaussian_kernel(sigma, filter_shape): """Computes 1D Gaussian kernel.""" sigma = tf.convert_to_tensor(sigma) x = tf.range(-filter_shape // 2 + 1, filter_shape // 2 + 1) x = tf.cast(x**2, sigma.dtype) x = tf.nn.softmax(-x / (2.0 * (sigma**2))) return x def _get_gaussian_kernel_2d(gaussian_filter_x, gaussian_filter_y): """Computes 2D Gaussian kernel given 1D kernels.""" gaussian_kernel = tf.matmul(gaussian_filter_x, gaussian_filter_y) return gaussian_kernel def _normalize_tuple(value, n, name): """Transforms an integer or iterable of integers into an integer tuple. Args: value: The value to validate and convert. Could an int, or any iterable of ints. n: The size of the tuple to be returned. name: The name of the argument being validated, e.g. "strides" or "kernel_size". This is only used to format error messages. Returns: A tuple of n integers. Raises: ValueError: If something else than an int/long or iterable thereof was passed. """ if isinstance(value, int): return (value,) * n else: try: value_tuple = tuple(value) except TypeError as exc: raise TypeError( f'The {name} argument must be a tuple of {n} integers. ' f'Received: {value}' ) from exc if len(value_tuple) != n: raise ValueError( f'The {name} argument must be a tuple of {n} integers. ' f'Received: {value}' ) for single_value in value_tuple: try: int(single_value) except (ValueError, TypeError) as exc: raise ValueError( f'The {name} argument must be a tuple of {n} integers. Received:' f' {value} including element {single_value} of type' f' {type(single_value)}.' ) from exc return value_tuple def gaussian_filter2d( image: tf.Tensor, filter_shape: Union[List[int], Tuple[int, ...], int], sigma: Union[List[float], Tuple[float], float] = 1.0, padding: str = 'REFLECT', constant_values: Union[int, tf.Tensor] = 0, name: Optional[str] = None, ) -> tf.Tensor: """Performs Gaussian blur on image(s). Args: image: Either a 2-D `Tensor` of shape `[height, width]`, a 3-D `Tensor` of shape `[height, width, channels]`, or a 4-D `Tensor` of shape `[batch_size, height, width, channels]`. filter_shape: An `integer` or `tuple`/`list` of 2 integers, specifying the height and width of the 2-D gaussian filter. Can be a single integer to specify the same value for all spatial dimensions. sigma: A `float` or `tuple`/`list` of 2 floats, specifying the standard deviation in x and y direction the 2-D gaussian filter. Can be a single float to specify the same value for all spatial dimensions. padding: A `string`, one of "REFLECT", "CONSTANT", or "SYMMETRIC". The type of padding algorithm to use, which is compatible with `mode` argument in `tf.pad`. For more details, please refer to https://www.tensorflow.org/api_docs/python/tf/pad. constant_values: A `scalar`, the pad value to use in "CONSTANT" padding mode. name: A name for this operation (optional). Returns: 2-D, 3-D or 4-D `Tensor` of the same dtype as input. Raises: ValueError: If `image` is not 2, 3 or 4-dimensional, if `padding` is other than "REFLECT", "CONSTANT" or "SYMMETRIC", if `filter_shape` is invalid, or if `sigma` is invalid. """ with tf.name_scope(name or 'gaussian_filter2d'): if isinstance(sigma, (list, tuple)): if len(sigma) != 2: raise ValueError('sigma should be a float or a tuple/list of 2 floats') else: sigma = (sigma,) * 2 if any(s < 0 for s in sigma): raise ValueError('sigma should be greater than or equal to 0.') image = tf.convert_to_tensor(image, name='image') sigma = tf.convert_to_tensor(sigma, name='sigma') original_ndims = tf.rank(image) image = to_4d(image) # Keep the precision if it's float; # otherwise, convert to float32 for computing. orig_dtype = image.dtype if not image.dtype.is_floating: image = tf.cast(image, tf.float32) channels = tf.shape(image)[3] filter_shape = _normalize_tuple(filter_shape, 2, 'filter_shape') sigma = tf.cast(sigma, image.dtype) gaussian_kernel_x = _get_gaussian_kernel(sigma[1], filter_shape[1]) gaussian_kernel_x = gaussian_kernel_x[tf.newaxis, :] gaussian_kernel_y = _get_gaussian_kernel(sigma[0], filter_shape[0]) gaussian_kernel_y = gaussian_kernel_y[:, tf.newaxis] gaussian_kernel_2d = _get_gaussian_kernel_2d( gaussian_kernel_y, gaussian_kernel_x ) gaussian_kernel_2d = gaussian_kernel_2d[:, :, tf.newaxis, tf.newaxis] gaussian_kernel_2d = tf.tile(gaussian_kernel_2d, [1, 1, channels, 1]) image = _pad( image, filter_shape, mode=padding, constant_values=constant_values ) output = tf.nn.depthwise_conv2d( input=image, filter=gaussian_kernel_2d, strides=(1, 1, 1, 1), padding='VALID', ) output = from_4d(output, original_ndims) return tf.cast(output, orig_dtype) def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor: """Converts translations to a projective transform. The translation matrix looks like this: [[1 0 -dx] [0 1 -dy] [0 0 1]] Args: translations: The 2-element list representing [dx, dy], or a matrix of 2-element lists representing [dx dy] to translate for each image. The shape must be static. Returns: The transformation matrix of shape (num_images, 8). Raises: `TypeError` if - the shape of `translations` is not known or - the shape of `translations` is not rank 1 or 2. """ translations = tf.convert_to_tensor(translations, dtype=tf.float32) if translations.get_shape().ndims is None: raise TypeError('translations rank must be statically known') elif len(translations.get_shape()) == 1: translations = translations[None] elif len(translations.get_shape()) != 2: raise TypeError('translations should have rank 1 or 2.') num_translations = tf.shape(translations)[0] return tf.concat( values=[ tf.ones((num_translations, 1), tf.dtypes.float32), tf.zeros((num_translations, 1), tf.dtypes.float32), -translations[:, 0, None], tf.zeros((num_translations, 1), tf.dtypes.float32), tf.ones((num_translations, 1), tf.dtypes.float32), -translations[:, 1, None], tf.zeros((num_translations, 2), tf.dtypes.float32), ], axis=1, ) def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor, image_height: tf.Tensor) -> tf.Tensor: """Converts an angle or angles to a projective transform. Args: angles: A scalar to rotate all images, or a vector to rotate a batch of images. This must be a scalar. image_width: The width of the image(s) to be transformed. image_height: The height of the image(s) to be transformed. Returns: A tensor of shape (num_images, 8). Raises: `TypeError` if `angles` is not rank 0 or 1. """ angles = tf.convert_to_tensor(angles, dtype=tf.float32) if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test angles = angles[None] elif len(angles.get_shape()) != 1: raise TypeError('Angles should have a rank 0 or 1.') x_offset = ((image_width - 1) - (tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) * (image_height - 1))) / 2.0 y_offset = ((image_height - 1) - (tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) * (image_height - 1))) / 2.0 num_angles = tf.shape(angles)[0] return tf.concat( values=[ tf.math.cos(angles)[:, None], -tf.math.sin(angles)[:, None], x_offset[:, None], tf.math.sin(angles)[:, None], tf.math.cos(angles)[:, None], y_offset[:, None], tf.zeros((num_angles, 2), tf.dtypes.float32), ], axis=1, ) def _apply_transform_to_images( images, transforms, fill_mode='reflect', fill_value=0.0, interpolation='bilinear', output_shape=None, name=None, ): """Applies the given transform(s) to the image(s). Args: images: A tensor of shape `(num_images, num_rows, num_columns, num_channels)` (NHWC). The rank must be statically known (the shape is not `TensorShape(None)`). transforms: Projective transform matrix/matrices. A vector of length 8 or tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point `(x, y)` to a transformed *input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the transform mapping input points to output points. Note that gradients are not backpropagated into transformation parameters. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). fill_value: a float represents the value to be filled outside the boundaries when `fill_mode="constant"`. interpolation: Interpolation mode. Supported values: `"nearest"`, `"bilinear"`. output_shape: Output dimension after the transform, `[height, width]`. If `None`, output is the same size as input image. name: The name of the op. Fill mode behavior for each valid value is as follows - `"reflect"`: `(d c b a | a b c d | d c b a)` The input is extended by reflecting about the edge of the last pixel. - `"constant"`: `(k k k k | a b c d | k k k k)` The input is extended by filling all values beyond the edge with the same constant value k = 0. - `"wrap"`: `(a b c d | a b c d | a b c d)` The input is extended by wrapping around to the opposite edge. - `"nearest"`: `(a a a a | a b c d | d d d d)` The input is extended by the nearest pixel. Input shape: 4D tensor with shape: `(samples, height, width, channels)`, in `"channels_last"` format. Output shape: 4D tensor with shape: `(samples, height, width, channels)`, in `"channels_last"` format. Returns: Image(s) with the same type and shape as `images`, with the given transform(s) applied. Transformed coordinates outside of the input image will be filled with zeros. """ with tf.name_scope(name or 'transform'): if output_shape is None: output_shape = tf.shape(images)[1:3] if not tf.executing_eagerly(): output_shape_value = tf.get_static_value(output_shape) if output_shape_value is not None: output_shape = output_shape_value output_shape = tf.convert_to_tensor( output_shape, tf.int32, name='output_shape' ) if not output_shape.get_shape().is_compatible_with([2]): raise ValueError( 'output_shape must be a 1-D Tensor of 2 elements: ' 'new_height, new_width, instead got ' f'output_shape={output_shape}' ) fill_value = tf.convert_to_tensor(fill_value, tf.float32, name='fill_value') return tf.raw_ops.ImageProjectiveTransformV3( images=images, output_shape=output_shape, fill_value=fill_value, transforms=transforms, fill_mode=fill_mode.upper(), interpolation=interpolation.upper(), ) def transform( image: tf.Tensor, transforms: Any, interpolation: str = 'nearest', output_shape=None, fill_mode: str = 'reflect', fill_value: float = 0.0, ) -> tf.Tensor: """Transforms an image.""" original_ndims = tf.rank(image) transforms = tf.convert_to_tensor(transforms, dtype=tf.float32) if transforms.shape.rank == 1: transforms = transforms[None] image = to_4d(image) image = _apply_transform_to_images( images=image, transforms=transforms, interpolation=interpolation, fill_mode=fill_mode, fill_value=fill_value, output_shape=output_shape, ) return from_4d(image, original_ndims) def translate( image: tf.Tensor, translations, fill_value: float = 0.0, fill_mode: str = 'reflect', interpolation: str = 'nearest', ) -> tf.Tensor: """Translates image(s) by provided vectors. Args: image: An image Tensor of type uint8. translations: A vector or matrix representing [dx dy]. fill_value: a float represents the value to be filled outside the boundaries when `fill_mode="constant"`. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). interpolation: Interpolation mode. Supported values: `"nearest"`, `"bilinear"`. Returns: The translated version of the image. """ transforms = _convert_translation_to_transform(translations) # pytype: disable=wrong-arg-types # always-use-return-annotations return transform( image, transforms=transforms, interpolation=interpolation, fill_value=fill_value, fill_mode=fill_mode, ) def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor: """Rotates the image by degrees either clockwise or counterclockwise. Args: image: An image Tensor of type uint8. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. Returns: The rotated version of image. """ # Convert from degrees to radians. degrees_to_radians = math.pi / 180.0 radians = tf.cast(degrees * degrees_to_radians, tf.float32) original_ndims = tf.rank(image) image = to_4d(image) image_height = tf.cast(tf.shape(image)[1], tf.float32) image_width = tf.cast(tf.shape(image)[2], tf.float32) transforms = _convert_angles_to_transform( angles=radians, image_width=image_width, image_height=image_height) # In practice, we should randomize the rotation degrees by flipping # it negatively half the time, but that's done on 'degrees' outside # of the function. image = transform(image, transforms=transforms) return from_4d(image, original_ndims) def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor: """Blend image1 and image2 using 'factor'. Factor can be above 0.0. A value of 0.0 means only image1 is used. A value of 1.0 means only image2 is used. A value between 0.0 and 1.0 means we linearly interpolate the pixel values between the two images. A value greater than 1.0 "extrapolates" the difference between the two pixel values, and we clip the results to values between 0 and 255. Args: image1: An image Tensor of type uint8. image2: An image Tensor of type uint8. factor: A floating point value above 0.0. Returns: A blended image Tensor of type uint8. """ if factor == 0.0: return tf.convert_to_tensor(image1) if factor == 1.0: return tf.convert_to_tensor(image2) image1 = tf.cast(image1, tf.float32) image2 = tf.cast(image2, tf.float32) difference = image2 - image1 scaled = factor * difference # Do addition in float. temp = tf.cast(image1, tf.float32) + scaled # Interpolate if factor > 0.0 and factor < 1.0: # Interpolation means we always stay within 0 and 255. return tf.cast(temp, tf.uint8) # Extrapolate: # # We need to clip and then cast. return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8) def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor: """Apply cutout (https://arxiv.org/abs/1708.04552) to image. This operation applies a (2*pad_size x 2*pad_size) mask of zeros to a random location within `image`. The pixel values filled in will be of the value `replace`. The location where the mask will be applied is randomly chosen uniformly over the whole image. Args: image: An image Tensor of type uint8. pad_size: Specifies how big the zero mask that will be generated is that is applied to the image. The mask will be of size (2*pad_size x 2*pad_size). replace: What pixel value to fill in the image in the area that has the cutout mask applied to it. Returns: An image Tensor that is of type uint8. """ if image.shape.rank not in [3, 4]: raise ValueError('Bad image rank: {}'.format(image.shape.rank)) if image.shape.rank == 4: return cutout_video(image, replace=replace) image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] # Sample the center location in the image where the zero mask will be applied. cutout_center_height = tf.random.uniform( shape=[], minval=0, maxval=image_height, dtype=tf.int32) cutout_center_width = tf.random.uniform( shape=[], minval=0, maxval=image_width, dtype=tf.int32) image = _fill_rectangle(image, cutout_center_width, cutout_center_height, pad_size, pad_size, replace) return image def _fill_rectangle(image, center_width, center_height, half_width, half_height, replace=None): """Fills blank area.""" image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] lower_pad = tf.maximum(0, center_height - half_height) upper_pad = tf.maximum(0, image_height - center_height - half_height) left_pad = tf.maximum(0, center_width - half_width) right_pad = tf.maximum(0, image_width - center_width - half_width) cutout_shape = [ image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] mask = tf.pad( tf.zeros(cutout_shape, dtype=image.dtype), padding_dims, constant_values=1) mask = tf.expand_dims(mask, -1) mask = tf.tile(mask, [1, 1, 3]) if replace is None: fill = tf.random.normal(tf.shape(image), dtype=image.dtype) elif isinstance(replace, tf.Tensor): fill = replace else: fill = tf.ones_like(image, dtype=image.dtype) * replace image = tf.where(tf.equal(mask, 0), fill, image) return image def _fill_rectangle_video(image, center_width, center_height, half_width, half_height, replace=None): """Fills blank area for video.""" image_time = tf.shape(image)[0] image_height = tf.shape(image)[1] image_width = tf.shape(image)[2] lower_pad = tf.maximum(0, center_height - half_height) upper_pad = tf.maximum(0, image_height - center_height - half_height) left_pad = tf.maximum(0, center_width - half_width) right_pad = tf.maximum(0, image_width - center_width - half_width) cutout_shape = [ image_time, image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[0, 0], [lower_pad, upper_pad], [left_pad, right_pad]] mask = tf.pad( tf.zeros(cutout_shape, dtype=image.dtype), padding_dims, constant_values=1) mask = tf.expand_dims(mask, -1) mask = tf.tile(mask, [1, 1, 1, 3]) if replace is None: fill = tf.random.normal(tf.shape(image), dtype=image.dtype) elif isinstance(replace, tf.Tensor): fill = replace else: fill = tf.ones_like(image, dtype=image.dtype) * replace image = tf.where(tf.equal(mask, 0), fill, image) return image def cutout_video( video: tf.Tensor, mask_shape: Optional[tf.Tensor] = None, replace: int = 0, ) -> tf.Tensor: """Apply cutout (https://arxiv.org/abs/1708.04552) to a video. This operation applies a random size 3D mask of zeros to a random location within `video`. The mask is padded The pixel values filled in will be of the value `replace`. The location where the mask will be applied is randomly chosen uniformly over the whole video. If the size of the mask is not set, then, it is randomly sampled uniformly from [0.25*height, 0.5*height], [0.25*width, 0.5*width], and [1, 0.25*depth], which represent the height, width, and number of frames of the input video tensor respectively. Args: video: A video Tensor of shape [T, H, W, C]. mask_shape: An optional integer tensor that specifies the depth, height and width of the mask to cut. If it is not set, the shape is randomly sampled as described above. The shape dimensions should be divisible by 2 otherwise they will rounded down. replace: What pixel value to fill in the image in the area that has the cutout mask applied to it. Returns: A video Tensor with cutout applied. """ tf.debugging.assert_shapes([ (video, ('T', 'H', 'W', 'C')), ]) video_depth = tf.shape(video)[0] video_height = tf.shape(video)[1] video_width = tf.shape(video)[2] # Sample the center location in the image where the zero mask will be applied. cutout_center_height = tf.random.uniform( shape=[], minval=0, maxval=video_height, dtype=tf.int32 ) cutout_center_width = tf.random.uniform( shape=[], minval=0, maxval=video_width, dtype=tf.int32 ) cutout_center_depth = tf.random.uniform( shape=[], minval=0, maxval=video_depth, dtype=tf.int32 ) if mask_shape is not None: pad_shape = tf.maximum(1, mask_shape // 2) pad_size_depth, pad_size_height, pad_size_width = ( pad_shape[0], pad_shape[1], pad_shape[2], ) else: pad_size_height = tf.random.uniform( shape=[], minval=tf.maximum(1, tf.cast(video_height / 4, tf.int32)), maxval=tf.maximum(2, tf.cast(video_height / 2, tf.int32)), dtype=tf.int32, ) pad_size_width = tf.random.uniform( shape=[], minval=tf.maximum(1, tf.cast(video_width / 4, tf.int32)), maxval=tf.maximum(2, tf.cast(video_width / 2, tf.int32)), dtype=tf.int32, ) pad_size_depth = tf.random.uniform( shape=[], minval=1, maxval=tf.maximum(2, tf.cast(video_depth / 4, tf.int32)), dtype=tf.int32, ) lower_pad = tf.maximum(0, cutout_center_height - pad_size_height) upper_pad = tf.maximum( 0, video_height - cutout_center_height - pad_size_height ) left_pad = tf.maximum(0, cutout_center_width - pad_size_width) right_pad = tf.maximum(0, video_width - cutout_center_width - pad_size_width) back_pad = tf.maximum(0, cutout_center_depth - pad_size_depth) forward_pad = tf.maximum( 0, video_depth - cutout_center_depth - pad_size_depth ) cutout_shape = [ video_depth - (back_pad + forward_pad), video_height - (lower_pad + upper_pad), video_width - (left_pad + right_pad), ] padding_dims = [[back_pad, forward_pad], [lower_pad, upper_pad], [left_pad, right_pad]] mask = tf.pad( tf.zeros(cutout_shape, dtype=video.dtype), padding_dims, constant_values=1 ) mask = tf.expand_dims(mask, -1) num_channels = tf.shape(video)[-1] mask = tf.tile(mask, [1, 1, 1, num_channels]) video = tf.where( tf.equal(mask, 0), tf.ones_like(video, dtype=video.dtype) * replace, video ) return video def gaussian_noise( image: tf.Tensor, low: float = 0.1, high: float = 2.0) -> tf.Tensor: """Add Gaussian noise to image(s).""" augmented_image = gaussian_filter2d( # pylint: disable=g-long-lambda image, filter_shape=[3, 3], sigma=np.random.uniform(low=low, high=high) ) return augmented_image def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor: """Solarize the input image(s).""" # For each pixel in the image, select the pixel # if the value is less than the threshold. # Otherwise, subtract 255 from the pixel. return tf.where(image < threshold, image, 255 - image) def solarize_add(image: tf.Tensor, addition: int = 0, threshold: int = 128) -> tf.Tensor: """Additive solarize the input image(s).""" # For each pixel in the image less than threshold # we add 'addition' amount to it and then clip the # pixel value to be between 0 and 255. The value # of 'addition' is between -128 and 128. added_image = tf.cast(image, tf.int64) + addition added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8) return tf.where(image < threshold, added_image, image) def grayscale(image: tf.Tensor) -> tf.Tensor: """Convert image to grayscale.""" return tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image)) def color(image: tf.Tensor, factor: float) -> tf.Tensor: """Equivalent of PIL Color.""" degenerate = grayscale(image) return blend(degenerate, image, factor) def contrast(image: tf.Tensor, factor: float) -> tf.Tensor: """Equivalent of PIL Contrast.""" degenerate = tf.image.rgb_to_grayscale(image) # Cast before calling tf.histogram. degenerate = tf.cast(degenerate, tf.int32) # Compute the grayscale histogram, then compute the mean pixel value, # and create a constant image size of that value. Use that as the # blending degenerate target of the original image. hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256) mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0 degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8)) return blend(degenerate, image, factor) def brightness(image: tf.Tensor, factor: float) -> tf.Tensor: """Equivalent of PIL Brightness.""" degenerate = tf.zeros_like(image) return blend(degenerate, image, factor) def posterize(image: tf.Tensor, bits: int) -> tf.Tensor: """Equivalent of PIL Posterize.""" shift = 8 - bits return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift) def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor: """Applies rotation with wrap/unwrap.""" image = rotate(wrap(image), degrees=degrees) return unwrap(image, replace) def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: """Equivalent of PIL Translate in X dimension.""" image = translate(wrap(image), [-pixels, 0]) return unwrap(image, replace) def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: """Equivalent of PIL Translate in Y dimension.""" image = translate(wrap(image), [0, -pixels]) return unwrap(image, replace) def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: """Equivalent of PIL Shearing in X dimension.""" # Shear parallel to x axis is a projective transform # with a matrix form of: # [1 level # 0 1]. image = transform( image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.]) return unwrap(image, replace) def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: """Equivalent of PIL Shearing in Y dimension.""" # Shear parallel to y axis is a projective transform # with a matrix form of: # [1 0 # level 1]. image = transform( image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.]) return unwrap(image, replace) def autocontrast(image: tf.Tensor) -> tf.Tensor: """Implements Autocontrast function from PIL using TF ops. Args: image: A 3D uint8 tensor. Returns: The image after it has had autocontrast applied to it and will be of type uint8. """ def scale_channel(image: tf.Tensor) -> tf.Tensor: """Scale the 2D image using the autocontrast rule.""" # A possibly cheaper version can be done using cumsum/unique_with_counts # over the histogram values, rather than iterating over the entire image. # to compute mins and maxes. lo = tf.cast(tf.reduce_min(image), tf.float32) hi = tf.cast(tf.reduce_max(image), tf.float32) # Scale the image, making the lowest value 0 and the highest value 255. def scale_values(im): scale = 255.0 / (hi - lo) offset = -lo * scale im = tf.cast(im, tf.float32) * scale + offset im = tf.clip_by_value(im, 0.0, 255.0) return tf.cast(im, tf.uint8) result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image) return result # Assumes RGB for now. Scales each channel independently # and then stacks the result. s1 = scale_channel(image[..., 0]) s2 = scale_channel(image[..., 1]) s3 = scale_channel(image[..., 2]) image = tf.stack([s1, s2, s3], -1) return image def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor: """Implements Sharpness function from PIL using TF ops.""" orig_image = image image = tf.cast(image, tf.float32) # Make image 4D for conv operation. image = tf.expand_dims(image, 0) # SMOOTH PIL Kernel. if orig_image.shape.rank == 3: kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32, shape=[3, 3, 1, 1]) / 13. # Tile across channel dimension. kernel = tf.tile(kernel, [1, 1, 3, 1]) strides = [1, 1, 1, 1] degenerate = tf.nn.depthwise_conv2d( image, kernel, strides, padding='VALID', dilations=[1, 1]) elif orig_image.shape.rank == 4: kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32, shape=[1, 3, 3, 1, 1]) / 13. strides = [1, 1, 1, 1, 1] # Run the kernel across each channel channels = tf.split(image, 3, axis=-1) degenerates = [ tf.nn.conv3d(channel, kernel, strides, padding='VALID', dilations=[1, 1, 1, 1, 1]) for channel in channels ] degenerate = tf.concat(degenerates, -1) else: raise ValueError('Bad image rank: {}'.format(image.shape.rank)) degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0]) # For the borders of the resulting image, fill in the values of the # original image. mask = tf.ones_like(degenerate) paddings = [[0, 0]] * (orig_image.shape.rank - 3) padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]]) padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]]) result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image) # Blend the final result. return blend(result, orig_image, factor) def equalize(image: tf.Tensor) -> tf.Tensor: """Implements Equalize function from PIL using TF ops.""" def scale_channel(im, c): """Scale the data in the channel to implement equalize.""" im = tf.cast(im[..., c], tf.int32) # Compute the histogram of the image channel. histo = tf.histogram_fixed_width(im, [0, 255], nbins=256) # For the purposes of computing the step, filter out the nonzeros. nonzero = tf.where(tf.not_equal(histo, 0)) nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1]) step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255 def build_lut(histo, step): # Compute the cumulative sum, shifting by step // 2 # and then normalization by step. lut = (tf.cumsum(histo) + (step // 2)) // step # Shift lut, prepending with 0. lut = tf.concat([[0], lut[:-1]], 0) # Clip the counts to be in range. This is done # in the C code for image.point. return tf.clip_by_value(lut, 0, 255) # If step is zero, return the original image. Otherwise, build # lut from the full histogram and step and then index from it. result = tf.cond( tf.equal(step, 0), lambda: im, lambda: tf.gather(build_lut(histo, step), im)) return tf.cast(result, tf.uint8) # Assumes RGB for now. Scales each channel independently # and then stacks the result. s1 = scale_channel(image, 0) s2 = scale_channel(image, 1) s3 = scale_channel(image, 2) image = tf.stack([s1, s2, s3], -1) return image def invert(image: tf.Tensor) -> tf.Tensor: """Inverts the image pixels.""" image = tf.convert_to_tensor(image) return 255 - image def wrap(image: tf.Tensor) -> tf.Tensor: """Returns 'image' with an extra channel set to all 1s.""" shape = tf.shape(image) extended_channel = tf.expand_dims(tf.ones(shape[:-1], image.dtype), -1) extended = tf.concat([image, extended_channel], axis=-1) return extended def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor: """Unwraps an image produced by wrap. Where there is a 0 in the last channel for every spatial position, the rest of the three channels in that spatial dimension are grayed (set to 128). Operations like translate and shear on a wrapped Tensor will leave 0s in empty locations. Some transformations look at the intensity of values to do preprocessing, and we want these empty pixels to assume the 'average' value, rather than pure black. Args: image: A 3D Image Tensor with 4 channels. replace: A one or three value 1D tensor to fill empty pixels. Returns: image: A 3D image Tensor with 3 channels. """ image_shape = tf.shape(image) # Flatten the spatial dimensions. flattened_image = tf.reshape(image, [-1, image_shape[-1]]) # Find all pixels where the last channel is zero. alpha_channel = tf.expand_dims(flattened_image[..., 3], axis=-1) replace = tf.concat([replace, tf.ones([1], image.dtype)], 0) # Where they are zero, fill them in with 'replace'. flattened_image = tf.where( tf.equal(alpha_channel, 0), tf.ones_like(flattened_image, dtype=image.dtype) * replace, flattened_image) image = tf.reshape(flattened_image, image_shape) image = tf.slice( image, [0] * image.shape.rank, tf.concat([image_shape[:-1], [3]], -1)) return image def _scale_bbox_only_op_probability(prob): """Reduce the probability of the bbox-only operation. Probability is reduced so that we do not distort the content of too many bounding boxes that are close to each other. The value of 3.0 was a chosen hyper parameter when designing the autoaugment algorithm that we found empirically to work well. Args: prob: Float that is the probability of applying the bbox-only operation. Returns: Reduced probability. """ return prob / 3.0 def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): """Applies augmentation_func to the subsection of image indicated by bbox. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. augmentation_func: Augmentation function that will be applied to the subsection of image. *args: Additional parameters that will be passed into augmentation_func when it is called. Returns: A modified version of image, where the bbox location in the image will have `ugmentation_func applied to it. """ image_height = tf.cast(tf.shape(image)[0], tf.float32) image_width = tf.cast(tf.shape(image)[1], tf.float32) min_y = tf.cast(image_height * bbox[0], tf.int32) min_x = tf.cast(image_width * bbox[1], tf.int32) max_y = tf.cast(image_height * bbox[2], tf.int32) max_x = tf.cast(image_width * bbox[3], tf.int32) image_height = tf.cast(image_height, tf.int32) image_width = tf.cast(image_width, tf.int32) # Clip to be sure the max values do not fall out of range. max_y = tf.minimum(max_y, image_height - 1) max_x = tf.minimum(max_x, image_width - 1) # Get the sub-tensor that is the image within the bounding box region. bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :] # Apply the augmentation function to the bbox portion of the image. augmented_bbox_content = augmentation_func(bbox_content, *args) # Pad the augmented_bbox_content and the mask to match the shape of original # image. augmented_bbox_content = tf.pad(augmented_bbox_content, [[min_y, (image_height - 1) - max_y], [min_x, (image_width - 1) - max_x], [0, 0]]) # Create a mask that will be used to zero out a part of the original image. mask_tensor = tf.zeros_like(bbox_content) mask_tensor = tf.pad(mask_tensor, [[min_y, (image_height - 1) - max_y], [min_x, (image_width - 1) - max_x], [0, 0]], constant_values=1) # Replace the old bbox content with the new augmented content. image = image * mask_tensor + augmented_bbox_content return image def _concat_bbox(bbox, bboxes): """Helper function that concates bbox to bboxes along the first dimension.""" # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means # we discard bboxes and start the bboxes Tensor with the current bbox. bboxes_sum_check = tf.reduce_sum(bboxes) bbox = tf.expand_dims(bbox, 0) # This check will be true when it is an _INVALID_BOX bboxes = tf.cond(tf.equal(bboxes_sum_check, -4.0), lambda: bbox, lambda: tf.concat([bboxes, bbox], 0)) return bboxes def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob, augmentation_func, func_changes_bbox, *args): """Applies _apply_bbox_augmentation with probability prob. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. new_bboxes: 2D Tensor that is a list of the bboxes in the image after they have been altered by aug_func. These will only be changed when func_changes_bbox is set to true. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float that are the normalized bbox coordinates between 0 and 1. prob: Float that is the probability of applying _apply_bbox_augmentation. augmentation_func: Augmentation function that will be applied to the subsection of image. func_changes_bbox: Boolean. Does augmentation_func return bbox in addition to image. *args: Additional parameters that will be passed into augmentation_func when it is called. Returns: A tuple. Fist element is a modified version of image, where the bbox location in the image will have augmentation_func applied to it if it is chosen to be called with probability `prob`. The second element is a Tensor of Tensors of length 4 that will contain the altered bbox after applying augmentation_func. """ should_apply_op = tf.cast( tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool) if func_changes_bbox: augmented_image, bbox = tf.cond( should_apply_op, lambda: augmentation_func(image, bbox, *args), lambda: (image, bbox)) else: augmented_image = tf.cond( should_apply_op, lambda: _apply_bbox_augmentation(image, bbox, augmentation_func, *args), lambda: image) new_bboxes = _concat_bbox(bbox, new_bboxes) return augmented_image, new_bboxes def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Checks to be sure num bboxes > 0 before calling inner function.""" num_bboxes = tf.shape(bboxes)[0] image, bboxes = tf.cond( tf.equal(num_bboxes, 0), lambda: (image, bboxes), # pylint:disable=g-long-lambda lambda: _apply_multi_bbox_augmentation( image, bboxes, prob, aug_func, func_changes_bbox, *args)) # pylint:enable=g-long-lambda return image, bboxes # Represents an invalid bounding box that is used for checking for padding # lists of bounding box coordinates for a few augmentation operations _INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Applies aug_func to the image for each bbox in bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float. prob: Float that is the probability of applying aug_func to a specific bounding box within the image. aug_func: Augmentation function that will be applied to the subsections of image indicated by the bbox values in bboxes. func_changes_bbox: Boolean. Does augmentation_func return bbox in addition to image. *args: Additional parameters that will be passed into augmentation_func when it is called. Returns: A modified version of image, where each bbox location in the image will have augmentation_func applied to it if it is chosen to be called with probability prob independently across all bboxes. Also the final bboxes are returned that will be unchanged if func_changes_bbox is set to false and if true, the new altered ones will be returned. Raises: ValueError if applied to video. """ if image.shape.rank == 4: raise ValueError('Image rank 4 is not supported') # Will keep track of the new altered bboxes after aug_func is repeatedly # applied. The -1 values are a dummy value and this first Tensor will be # removed upon appending the first real bbox. new_bboxes = tf.constant(_INVALID_BOX) # If the bboxes are empty, then just give it _INVALID_BOX. The result # will be thrown away. bboxes = tf.cond(tf.equal(tf.size(bboxes), 0), lambda: tf.constant(_INVALID_BOX), lambda: bboxes) bboxes = tf.ensure_shape(bboxes, (None, 4)) # pylint:disable=g-long-lambda wrapped_aug_func = ( lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper( _image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)) # pylint:enable=g-long-lambda # Setup the while_loop. num_bboxes = tf.shape(bboxes)[0] # We loop until we go over all bboxes. idx = tf.constant(0) # Counter for the while loop. # Conditional function when to end the loop once we go over all bboxes # images_and_bboxes contain (_image, _new_bboxes) cond = lambda _idx, _images_and_bboxes: tf.less(_idx, num_bboxes) # Shuffle the bboxes so that the augmentation order is not deterministic if # we are not changing the bboxes with aug_func. if not func_changes_bbox: loop_bboxes = tf.random.shuffle(bboxes) else: loop_bboxes = bboxes # Main function of while_loop where we repeatedly apply augmentation on the # bboxes in the image. # pylint:disable=g-long-lambda body = lambda _idx, _images_and_bboxes: [ _idx + 1, wrapped_aug_func(_images_and_bboxes[0], loop_bboxes[_idx], _images_and_bboxes[1])] # pylint:enable=g-long-lambda _, (image, new_bboxes) = tf.while_loop( cond, body, [idx, (image, new_bboxes)], shape_invariants=[idx.get_shape(), (image.get_shape(), tf.TensorShape([None, 4]))]) # Either return the altered bboxes or the original ones depending on if # we altered them in anyway. if func_changes_bbox: final_bboxes = new_bboxes else: final_bboxes = bboxes return image, final_bboxes def _clip_bbox(min_y, min_x, max_y, max_x): """Clip bounding box coordinates between 0 and 1. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. Returns: Clipped coordinate values between 0 and 1. """ min_y = tf.clip_by_value(min_y, 0.0, 1.0) min_x = tf.clip_by_value(min_x, 0.0, 1.0) max_y = tf.clip_by_value(max_y, 0.0, 1.0) max_x = tf.clip_by_value(max_x, 0.0, 1.0) return min_y, min_x, max_y, max_x def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): """Adjusts bbox coordinates to make sure the area is > 0. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. delta: Float, this is used to create a gap of size 2 * delta between bbox min/max coordinates that are the same on the boundary. This prevents the bbox from having an area of zero. Returns: Tuple of new bbox coordinates between 0 and 1 that will now have a guaranteed area > 0. """ height = max_y - min_y width = max_x - min_x def _adjust_bbox_boundaries(min_coord, max_coord): # Make sure max is never 0 and min is never 1. max_coord = tf.maximum(max_coord, 0.0 + delta) min_coord = tf.minimum(min_coord, 1.0 - delta) return min_coord, max_coord min_y, max_y = tf.cond(tf.equal(height, 0.0), lambda: _adjust_bbox_boundaries(min_y, max_y), lambda: (min_y, max_y)) min_x, max_x = tf.cond(tf.equal(width, 0.0), lambda: _adjust_bbox_boundaries(min_x, max_x), lambda: (min_x, max_x)) return min_y, min_x, max_y, max_x def _rotate_bbox(bbox, image_height, image_width, degrees): """Rotates the bbox coordinated by degrees. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, height of the image. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. Returns: A tensor of the same shape as bbox, but now with the rotated coordinates. """ image_height, image_width = ( tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32)) # Convert from degrees to radians. degrees_to_radians = math.pi / 180.0 radians = degrees * degrees_to_radians # Translate the bbox to the center of the image and turn the normalized 0-1 # coordinates to absolute pixel locations. # Y coordinates are made negative as the y axis of images goes down with # increasing pixel values, so we negate to make sure x axis and y axis points # are in the traditionally positive direction. min_y = -tf.cast(image_height * (bbox[0] - 0.5), tf.int32) min_x = tf.cast(image_width * (bbox[1] - 0.5), tf.int32) max_y = -tf.cast(image_height * (bbox[2] - 0.5), tf.int32) max_x = tf.cast(image_width * (bbox[3] - 0.5), tf.int32) coordinates = tf.stack( [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) coordinates = tf.cast(coordinates, tf.float32) # Rotate the coordinates according to the rotation matrix clockwise if # radians is positive, else negative rotation_matrix = tf.stack( [[tf.cos(radians), tf.sin(radians)], [-tf.sin(radians), tf.cos(radians)]]) new_coords = tf.cast( tf.matmul(rotation_matrix, tf.transpose(coordinates)), tf.int32) # Find min/max values and convert them back to normalized 0-1 floats. min_y = -( tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height - 0.5) min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width + 0.5 max_y = -( tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height - 0.5) max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width + 0.5 # Clip the bboxes to be sure the fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return tf.stack([min_y, min_x, max_y, max_x]) def rotate_with_bboxes(image, bboxes, degrees, replace): """Equivalent of PIL Rotate that rotates the image and bbox. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. replace: A one or three value 1D tensor to fill empty pixels. Returns: A tuple containing a 3D uint8 Tensor that will be the result of rotating image by degrees. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the rotated image. Raises: ValueError: If applied to video. """ if image.shape.rank == 4: raise ValueError('Image rank 4 is not supported') # Rotate the image. image = wrapped_rotate(image, degrees, replace) # Convert bbox coordinates to pixel values. image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] # pylint:disable=g-long-lambda wrapped_rotate_bbox = lambda bbox: _rotate_bbox( bbox, image_height, image_width, degrees) # pylint:enable=g-long-lambda bboxes = tf.map_fn(wrapped_rotate_bbox, bboxes) return image, bboxes def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): """Shifts the bbox according to how the image was sheared. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, height of the image. level: Float. How much to shear the image. shear_horizontal: If true then shear in X dimension else shear in the Y dimension. Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ image_height, image_width = ( tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32)) # Change bbox coordinates to be pixels. min_y = tf.cast(image_height * bbox[0], tf.int32) min_x = tf.cast(image_width * bbox[1], tf.int32) max_y = tf.cast(image_height * bbox[2], tf.int32) max_x = tf.cast(image_width * bbox[3], tf.int32) coordinates = tf.stack( [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) coordinates = tf.cast(coordinates, tf.float32) # Shear the coordinates according to the translation matrix. if shear_horizontal: translation_matrix = tf.stack( [[1, 0], [-level, 1]]) else: translation_matrix = tf.stack( [[1, -level], [0, 1]]) translation_matrix = tf.cast(translation_matrix, tf.float32) new_coords = tf.cast( tf.matmul(translation_matrix, tf.transpose(coordinates)), tf.int32) # Find min/max values and convert them back to floats. min_y = tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width max_y = tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width # Clip the bboxes to be sure the fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return tf.stack([min_y, min_x, max_y, max_x]) def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): """Applies Shear Transformation to the image and shifts the bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. level: Float. How much to shear the image. This value will be between -0.3 to 0.3. replace: A one or three value 1D tensor to fill empty pixels. shear_horizontal: Boolean. If true then shear in X dimension else shear in the Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of shearing image by level. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the sheared image. Raises: ValueError: If applied to video. """ if image.shape.rank == 4: raise ValueError('Image rank 4 is not supported') if shear_horizontal: image = shear_x(image, level, replace) else: image = shear_y(image, level, replace) # Convert bbox coordinates to pixel values. image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] # pylint:disable=g-long-lambda wrapped_shear_bbox = lambda bbox: _shear_bbox( bbox, image_height, image_width, level, shear_horizontal) # pylint:enable=g-long-lambda bboxes = tf.map_fn(wrapped_shear_bbox, bboxes) return image, bboxes def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): """Shifts the bbox coordinates by pixels. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. pixels: An int. How many pixels to shift the bbox. shift_horizontal: Boolean. If true then shift in X dimension else shift in Y dimension. Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ pixels = tf.cast(pixels, tf.int32) # Convert bbox to integer pixel locations. min_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[0], tf.int32) min_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[1], tf.int32) max_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[2], tf.int32) max_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[3], tf.int32) if shift_horizontal: min_x = tf.maximum(0, min_x - pixels) max_x = tf.minimum(image_width, max_x - pixels) else: min_y = tf.maximum(0, min_y - pixels) max_y = tf.minimum(image_height, max_y - pixels) # Convert bbox back to floats. min_y = tf.cast(min_y, tf.float32) / tf.cast(image_height, tf.float32) min_x = tf.cast(min_x, tf.float32) / tf.cast(image_width, tf.float32) max_y = tf.cast(max_y, tf.float32) / tf.cast(image_height, tf.float32) max_x = tf.cast(max_x, tf.float32) / tf.cast(image_width, tf.float32) # Clip the bboxes to be sure the fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return tf.stack([min_y, min_x, max_y, max_x]) def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. pixels: An int. How many pixels to shift the image and bboxes replace: A one or three value 1D tensor to fill empty pixels. shift_horizontal: Boolean. If true then shift in X dimension else shift in Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of translating image by pixels. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the shifted image. Raises: ValueError if applied to video. """ if image.shape.rank == 4: raise ValueError('Image rank 4 is not supported') if shift_horizontal: image = translate_x(image, pixels, replace) else: image = translate_y(image, pixels, replace) # Convert bbox coordinates to pixel values. image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] # pylint:disable=g-long-lambda wrapped_shift_bbox = lambda bbox: _shift_bbox( bbox, image_height, image_width, pixels, shift_horizontal) # pylint:enable=g-long-lambda bboxes = tf.map_fn(wrapped_shift_bbox, bboxes) return image, bboxes def translate_y_only_bboxes( image: tf.Tensor, bboxes: tf.Tensor, prob: float, pixels: int, replace): """Apply translate_y to each bbox in the image with probability prob.""" if bboxes.shape.rank == 4: raise ValueError('translate_y_only_bboxes does not support rank 4 boxes') func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) def _randomly_negate_tensor(tensor): """With 50% prob turn the tensor negative.""" should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool) final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor) return final_tensor def _rotate_level_to_arg(level: float): level = (level / _MAX_LEVEL) * 30. level = _randomly_negate_tensor(level) return (level,) def _shrink_level_to_arg(level: float): """Converts level to ratio by which we shrink the image content.""" if level == 0: return (1.0,) # if level is zero, do not shrink the image # Maximum shrinking ratio is 2.9. level = 2. / (_MAX_LEVEL / level) + 0.9 return (level,) def _enhance_level_to_arg(level: float): return ((level / _MAX_LEVEL) * 1.8 + 0.1,) def _shear_level_to_arg(level: float): level = (level / _MAX_LEVEL) * 0.3 # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level,) def _translate_level_to_arg(level: float, translate_const: float): level = (level / _MAX_LEVEL) * float(translate_const) # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level,) def _gaussian_noise_level_to_arg(level: float, translate_const: float): low_std = (level / _MAX_LEVEL) high_std = translate_const * low_std return low_std, high_std def _mult_to_arg(level: float, multiplier: float = 1.): return (int((level / _MAX_LEVEL) * multiplier),) def _apply_func_with_prob(func: Any, image: tf.Tensor, bboxes: Optional[tf.Tensor], args: Any, prob: float): """Apply `func` to image w/ `args` as input with probability `prob`.""" assert isinstance(args, tuple) assert inspect.getfullargspec(func)[0][1] == 'bboxes' # Apply the function with probability `prob`. should_apply_op = tf.cast( tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool) augmented_image, augmented_bboxes = tf.cond( should_apply_op, lambda: func(image, bboxes, *args), lambda: (image, bboxes)) return augmented_image, augmented_bboxes def select_and_apply_random_policy( policies: Any, image: tf.Tensor, bboxes: Optional[tf.Tensor] = None ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: """Select a random policy from `policies` and apply it to `image`.""" policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32) # Note that using tf.case instead of tf.conds would result in significantly # larger graphs and would even break export for some larger policies. for (i, policy) in enumerate(policies): image, bboxes = tf.cond( tf.equal(i, policy_to_select), lambda selected_policy=policy: selected_policy(image, bboxes), lambda: (image, bboxes)) return image, bboxes NAME_TO_FUNC = { 'AutoContrast': autocontrast, 'Equalize': equalize, 'Invert': invert, 'Rotate': wrapped_rotate, 'Posterize': posterize, 'Solarize': solarize, 'SolarizeAdd': solarize_add, 'Color': color, 'Contrast': contrast, 'Brightness': brightness, 'Sharpness': sharpness, 'ShearX': shear_x, 'ShearY': shear_y, 'TranslateX': translate_x, 'TranslateY': translate_y, 'Cutout': cutout, 'Rotate_BBox': rotate_with_bboxes, 'Grayscale': grayscale, 'Gaussian_Noise': gaussian_noise, # pylint:disable=g-long-lambda 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=True), 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=False), 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=True), 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=False), # pylint:enable=g-long-lambda 'TranslateY_Only_BBoxes': translate_y_only_bboxes, } # Functions that require a `bboxes` parameter. REQUIRE_BOXES_FUNCS = frozenset({ 'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox', 'TranslateY_BBox', 'TranslateY_Only_BBoxes', }) # Functions that have a 'prob' parameter PROB_FUNCS = frozenset({ 'TranslateY_Only_BBoxes', }) # Functions that have a 'replace' parameter REPLACE_FUNCS = frozenset({ 'Rotate', 'TranslateX', 'ShearX', 'ShearY', 'TranslateY', 'Cutout', 'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox', 'TranslateY_BBox', 'TranslateY_Only_BBoxes', }) def level_to_arg(cutout_const: float, translate_const: float): """Creates a dict mapping image operation names to their arguments.""" no_arg = lambda level: () posterize_arg = lambda level: _mult_to_arg(level, 4) solarize_arg = lambda level: _mult_to_arg(level, 256) solarize_add_arg = lambda level: _mult_to_arg(level, 110) cutout_arg = lambda level: _mult_to_arg(level, cutout_const) translate_arg = lambda level: _translate_level_to_arg(level, translate_const) translate_bbox_arg = lambda level: _translate_level_to_arg(level, 120) args = { 'AutoContrast': no_arg, 'Equalize': no_arg, 'Invert': no_arg, 'Rotate': _rotate_level_to_arg, 'Posterize': posterize_arg, 'Solarize': solarize_arg, 'SolarizeAdd': solarize_add_arg, 'Color': _enhance_level_to_arg, 'Contrast': _enhance_level_to_arg, 'Brightness': _enhance_level_to_arg, 'Sharpness': _enhance_level_to_arg, 'ShearX': _shear_level_to_arg, 'ShearY': _shear_level_to_arg, 'Cutout': cutout_arg, 'TranslateX': translate_arg, 'TranslateY': translate_arg, 'Rotate_BBox': _rotate_level_to_arg, 'ShearX_BBox': _shear_level_to_arg, 'ShearY_BBox': _shear_level_to_arg, 'Grayscale': no_arg, # pylint:disable=g-long-lambda 'Gaussian_Noise': lambda level: _gaussian_noise_level_to_arg( level, translate_const), # pylint:disable=g-long-lambda 'TranslateX_BBox': lambda level: _translate_level_to_arg( level, translate_const), 'TranslateY_BBox': lambda level: _translate_level_to_arg( level, translate_const), # pylint:enable=g-long-lambda 'TranslateY_Only_BBoxes': translate_bbox_arg, } return args def bbox_wrapper(func): """Adds a bboxes function argument to func and returns unchanged bboxes.""" def wrapper(images, bboxes, *args, **kwargs): return (func(images, *args, **kwargs), bboxes) return wrapper def _parse_policy_info(name: str, prob: float, level: float, replace_value: List[int], cutout_const: float, translate_const: float, level_std: float = 0.) -> Tuple[Any, float, Any]: """Return the function that corresponds to `name` and update `level` param.""" func = NAME_TO_FUNC[name] if level_std > 0: level += tf.random.normal([], dtype=tf.float32) level = tf.clip_by_value(level, 0., _MAX_LEVEL) args = level_to_arg(cutout_const, translate_const)[name](level) if name in PROB_FUNCS: # Add in the prob arg if it is required for the function that is called. args = tuple([prob] + list(args)) if name in REPLACE_FUNCS: # Add in replace arg if it is required for the function that is called. args = tuple(list(args) + [replace_value]) # Add bboxes as the second positional argument for the function if it does # not already exist. if 'bboxes' not in inspect.getfullargspec(func)[0]: func = bbox_wrapper(func) return func, prob, args class ImageAugment(object): """Image augmentation class for applying image distortions.""" def distort( self, image: tf.Tensor ) -> tf.Tensor: """Given an image tensor, returns a distorted image with the same shape. Expect the image tensor values are in the range [0, 255]. Args: image: `Tensor` of shape [height, width, 3] or [num_frames, height, width, 3] representing an image or image sequence. Returns: The augmented version of `image`. """ raise NotImplementedError() def distort_with_boxes( self, image: tf.Tensor, bboxes: tf.Tensor ) -> Tuple[tf.Tensor, tf.Tensor]: """Distorts the image and bounding boxes. Expect the image tensor values are in the range [0, 255]. Args: image: `Tensor` of shape [height, width, 3] or [num_frames, height, width, 3] representing an image or image sequence. bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4] representing bounding boxes for an image or image sequence. Returns: The augmented version of `image` and `bboxes`. """ raise NotImplementedError class AutoAugment(ImageAugment): """Applies the AutoAugment policy to images. AutoAugment is from the paper: https://arxiv.org/abs/1805.09501. """ def __init__(self, augmentation_name: str = 'v0', policies: Optional[Iterable[Iterable[Tuple[str, float, float]]]] = None, cutout_const: float = 100, translate_const: float = 250): """Applies the AutoAugment policy to images. Args: augmentation_name: The name of the AutoAugment policy to use. The available options are `v0`, `test`, `reduced_cifar10`, `svhn` and `reduced_imagenet`. `v0` is the policy used for all of the results in the paper and was found to achieve the best results on the COCO dataset. `v1`, `v2` and `v3` are additional good policies found on the COCO dataset that have slight variation in what operations were used during the search procedure along with how many operations are applied in parallel to a single image (2 vs 3). Make sure to set `policies` to `None` (the default) if you want to set options using `augmentation_name`. policies: list of lists of tuples in the form `(func, prob, level)`, `func` is a string name of the augmentation function, `prob` is the probability of applying the `func` operation, `level` (or magnitude) is the input argument for `func`. For example: ``` [[('Equalize', 0.9, 3), ('Color', 0.7, 8)], [('Invert', 0.6, 5), ('Rotate', 0.2, 9), ('ShearX', 0.1, 2)], ...] ``` The outer-most list must be 3-d. The number of operations in a sub-policy can vary from one sub-policy to another. If you provide `policies` as input, any option set with `augmentation_name` will get overriden as they are mutually exclusive. cutout_const: multiplier for applying cutout. translate_const: multiplier for applying translation. Raises: ValueError if `augmentation_name` is unsupported. """ super(AutoAugment, self).__init__() self.augmentation_name = augmentation_name self.cutout_const = float(cutout_const) self.translate_const = float(translate_const) self.available_policies = { 'detection_v0': self.detection_policy_v0(), 'v0': self.policy_v0(), 'test': self.policy_test(), 'simple': self.policy_simple(), 'reduced_cifar10': self.policy_reduced_cifar10(), 'svhn': self.policy_svhn(), 'reduced_imagenet': self.policy_reduced_imagenet(), 'panoptic_deeplab_policy': self.panoptic_deeplab_policy(), 'vit': self.vit(), 'deit3_three_augment': self.deit3_three_augment(), } if not policies: if augmentation_name not in self.available_policies: raise ValueError( 'Invalid augmentation_name: {}'.format(augmentation_name)) self.policies = self.available_policies[augmentation_name] else: self._check_policy_shape(policies) self.policies = policies def _check_policy_shape(self, policies): """Checks dimension and shape of the custom policy. Args: policies: List of list of tuples in the form `(func, prob, level)`. Must have shape of `(:, :, 3)`. Raises: ValueError if the shape of `policies` is unexpected. """ in_shape = np.array(policies).shape if len(in_shape) != 3 or in_shape[-1:] != (3,): raise ValueError('Wrong shape detected for custom policy. Expected ' '(:, :, 3) but got {}.'.format(in_shape)) def _make_tf_policies(self): """Prepares the TF functions for augmentations based on the policies.""" replace_value = [128] * 3 # func is the string name of the augmentation function, prob is the # probability of applying the operation and level is the parameter # associated with the tf op. # tf_policies are functions that take in an image and return an augmented # image. tf_policies = [] for policy in self.policies: tf_policy = [] assert_ranges = [] # Link string name to the correct python function and make sure the # correct argument is passed into that function. for policy_info in policy: _, prob, level = policy_info assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob])) assert_ranges.append( tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level])) policy_info = list(policy_info) + [ replace_value, self.cutout_const, self.translate_const ] tf_policy.append(_parse_policy_info(*policy_info)) # Now build the tf policy that will apply the augmentation procedue # on image. def make_final_policy(tf_policy_): def final_policy(image_, bboxes_): for func, prob, args in tf_policy_: image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_, args, prob) return image_, bboxes_ return final_policy with tf.control_dependencies(assert_ranges): tf_policies.append(make_final_policy(tf_policy)) return tf_policies def distort(self, image: tf.Tensor) -> tf.Tensor: """See base class.""" input_image_type = image.dtype if input_image_type != tf.uint8: image = tf.clip_by_value(image, 0.0, 255.0) image = tf.cast(image, dtype=tf.uint8) tf_policies = self._make_tf_policies() image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None) image = tf.cast(image, dtype=input_image_type) return image def distort_with_boxes(self, image: tf.Tensor, bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: """See base class.""" input_image_type = image.dtype if input_image_type != tf.uint8: image = tf.clip_by_value(image, 0.0, 255.0) image = tf.cast(image, dtype=tf.uint8) tf_policies = self._make_tf_policies() image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes) image = tf.cast(image, dtype=input_image_type) assert bboxes is not None return image, bboxes @staticmethod def detection_policy_v0(): """Autoaugment policy that was used in AutoAugment Paper for Detection. https://arxiv.org/pdf/1906.11172 Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied sequentially on the image. Returns: the policy. """ policy = [ [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], ] return policy @staticmethod def policy_v0(): """Autoaugment policy that was used in AutoAugment Paper. Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied sequentially on the image. Returns: the policy. """ policy = [ [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], [('Color', 0.4, 9), ('Equalize', 0.6, 3)], [('Color', 0.4, 1), ('Rotate', 0.6, 8)], [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], [('Color', 0.2, 0), ('Equalize', 0.8, 8)], [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], [('Color', 0.6, 1), ('Equalize', 1.0, 2)], [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], [('Color', 0.4, 7), ('Equalize', 0.6, 0)], [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], [('Solarize', 0.6, 8), ('Color', 0.6, 9)], [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)], [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], [('ShearY', 0.8, 0), ('Color', 0.6, 4)], [('Color', 1.0, 0), ('Rotate', 0.6, 2)], [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], [('Color', 0.8, 6), ('Rotate', 0.4, 5)], ] return policy @staticmethod def policy_reduced_cifar10(): """Autoaugment policy for reduced CIFAR-10 dataset. Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied sequentially on the image. Returns: the policy. """ policy = [ [('Invert', 0.1, 7), ('Contrast', 0.2, 6)], [('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)], [('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)], [('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)], [('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)], [('ShearY', 0.2, 7), ('Posterize', 0.3, 7)], [('Color', 0.4, 3), ('Brightness', 0.6, 7)], [('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)], [('Equalize', 0.6, 5), ('Equalize', 0.5, 1)], [('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)], [('Color', 0.7, 7), ('TranslateX', 0.5, 8)], [('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)], [('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)], [('Brightness', 0.9, 6), ('Color', 0.2, 8)], [('Solarize', 0.5, 2), ('Invert', 0.0, 3)], [('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)], [('Equalize', 0.2, 8), ('Equalize', 0.6, 4)], [('Color', 0.9, 9), ('Equalize', 0.6, 6)], [('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)], [('Brightness', 0.1, 3), ('Color', 0.7, 0)], [('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)], [('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)], [('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)], [('Equalize', 0.8, 8), ('Invert', 0.1, 3)], [('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)], ] return policy @staticmethod def policy_svhn(): """Autoaugment policy for SVHN dataset. Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied sequentially on the image. Returns: the policy. """ policy = [ [('ShearX', 0.9, 4), ('Invert', 0.2, 3)], [('ShearY', 0.9, 8), ('Invert', 0.7, 5)], [('Equalize', 0.6, 5), ('Solarize', 0.6, 6)], [('Invert', 0.9, 3), ('Equalize', 0.6, 3)], [('Equalize', 0.6, 1), ('Rotate', 0.9, 3)], [('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)], [('ShearY', 0.9, 8), ('Invert', 0.4, 5)], [('ShearY', 0.9, 5), ('Solarize', 0.2, 6)], [('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)], [('Equalize', 0.6, 3), ('Rotate', 0.9, 3)], [('ShearX', 0.9, 4), ('Solarize', 0.3, 3)], [('ShearY', 0.8, 8), ('Invert', 0.7, 4)], [('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)], [('Invert', 0.9, 4), ('Equalize', 0.6, 7)], [('Contrast', 0.3, 3), ('Rotate', 0.8, 4)], [('Invert', 0.8, 5), ('TranslateY', 0.0, 2)], [('ShearY', 0.7, 6), ('Solarize', 0.4, 8)], [('Invert', 0.6, 4), ('Rotate', 0.8, 4)], [('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)], [('ShearX', 0.1, 6), ('Invert', 0.6, 5)], [('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)], [('ShearY', 0.8, 4), ('Invert', 0.8, 8)], [('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)], [('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)], [('ShearX', 0.7, 2), ('Invert', 0.1, 5)], ] return policy @staticmethod def policy_reduced_imagenet(): """Autoaugment policy for reduced ImageNet dataset. Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied sequentially on the image. Returns: the policy. """ policy = [ [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], [('Rotate', 0.8, 8), ('Color', 0.4, 0)], [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], [('Color', 0.6, 4), ('Contrast', 1.0, 8)], [('Rotate', 0.8, 8), ('Color', 1.0, 2)], [('Color', 0.8, 8), ('Solarize', 0.8, 7)], [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], [('Color', 0.4, 0), ('Equalize', 0.6, 3)], [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], [('Color', 0.6, 4), ('Contrast', 1.0, 8)], [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)] ] return policy @staticmethod def policy_simple(): """Same as `policy_v0`, except with custom ops removed.""" policy = [ [('Color', 0.4, 9), ('Equalize', 0.6, 3)], [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], [('Color', 0.2, 0), ('Equalize', 0.8, 8)], [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], [('Color', 0.6, 1), ('Equalize', 1.0, 2)], [('Color', 0.4, 7), ('Equalize', 0.6, 0)], [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], [('Solarize', 0.6, 8), ('Color', 0.6, 9)], [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], ] return policy @staticmethod def panoptic_deeplab_policy(): policy = [ [('Sharpness', 0.4, 1.4), ('Brightness', 0.2, 2.0)], [('Equalize', 0.0, 1.8), ('Contrast', 0.2, 2.0)], [('Sharpness', 0.2, 1.8), ('Color', 0.2, 1.8)], [('Solarize', 0.2, 1.4), ('Equalize', 0.6, 1.8)], [('Sharpness', 0.2, 0.2), ('Equalize', 0.2, 1.4)]] return policy @staticmethod def vit(): """Autoaugment policy for a generic ViT.""" policy = [ [('Sharpness', 0.4, 1.4), ('Brightness', 0.2, 2.0), ('Cutout', 0.8, 8)], [('Equalize', 0.0, 1.8), ('Contrast', 0.2, 2.0), ('Cutout', 0.8, 8)], [('Sharpness', 0.2, 1.8), ('Color', 0.2, 1.8), ('Cutout', 0.8, 8)], [('Solarize', 0.2, 1.4), ('Equalize', 0.6, 1.8), ('Cutout', 0.8, 8)], [('Sharpness', 0.2, 0.2), ('Equalize', 0.2, 1.4), ('Cutout', 0.8, 8)], [('Sharpness', 0.4, 7), ('Invert', 0.6, 8), ('Cutout', 0.8, 8)], [('Invert', 0.6, 4), ('Equalize', 1.0, 8), ('Cutout', 0.8, 8)], [('Posterize', 0.6, 7), ('Posterize', 0.6, 6), ('Cutout', 0.8, 8)], [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5), ('Cutout', 0.8, 8)], ] return policy @staticmethod def deit3_three_augment(): """Autoaugment policy for three augmentations. Proposed in paper: https://arxiv.org/abs/2204.07118. Each tuple is an augmentation operation of the form (operation, probability, magnitude). Each element in policy is a sub-policy that will be applied on the image. Randomly chooses one of the three augmentation to apply on image. Returns: the policy. """ policy = [ [('Grayscale', 1.0, 0)], [('Solarize', 1.0, 5)], # to have threshold as 128 [('Gaussian_Noise', 1.0, 1)], # to have low_std as 0.1 ] return policy @staticmethod def policy_test(): """Autoaugment test policy for debugging.""" policy = [ [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)], ] return policy def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]: return tf.identity(x) if x is not None else None class RandAugment(ImageAugment): """Applies the RandAugment policy to images. RandAugment is from the paper https://arxiv.org/abs/1909.13719. """ def __init__(self, num_layers: int = 2, magnitude: float = 10., cutout_const: float = 40., translate_const: float = 100., magnitude_std: float = 0.0, prob_to_apply: Optional[float] = None, exclude_ops: Optional[List[str]] = None): """Applies the RandAugment policy to images. Args: num_layers: Integer, the number of augmentation transformations to apply sequentially to an image. Represented as (N) in the paper. Usually best values will be in the range [1, 3]. magnitude: Integer, shared magnitude across all augmentation operations. Represented as (M) in the paper. Usually best values are in the range [5, 10]. cutout_const: multiplier for applying cutout. translate_const: multiplier for applying translation. magnitude_std: randomness of the severity as proposed by the authors of the timm library. prob_to_apply: The probability to apply the selected augmentation at each layer. exclude_ops: exclude selected operations. """ super(RandAugment, self).__init__() self.num_layers = num_layers self.magnitude = float(magnitude) self.cutout_const = float(cutout_const) self.translate_const = float(translate_const) self.prob_to_apply = ( float(prob_to_apply) if prob_to_apply is not None else None) self.available_ops = [ 'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd' ] self.magnitude_std = magnitude_std if exclude_ops: self.available_ops = [ op for op in self.available_ops if op not in exclude_ops ] @classmethod def build_for_detection(cls, num_layers: int = 2, magnitude: float = 10., cutout_const: float = 40., translate_const: float = 100., magnitude_std: float = 0.0, prob_to_apply: Optional[float] = None, exclude_ops: Optional[List[str]] = None): """Builds a RandAugment that modifies bboxes for geometric transforms.""" augmenter = cls( num_layers=num_layers, magnitude=magnitude, cutout_const=cutout_const, translate_const=translate_const, magnitude_std=magnitude_std, prob_to_apply=prob_to_apply, exclude_ops=exclude_ops) box_aware_ops_by_base_name = { 'Rotate': 'Rotate_BBox', 'ShearX': 'ShearX_BBox', 'ShearY': 'ShearY_BBox', 'TranslateX': 'TranslateX_BBox', 'TranslateY': 'TranslateY_BBox', } augmenter.available_ops = [ box_aware_ops_by_base_name.get(op_name) or op_name for op_name in augmenter.available_ops ] return augmenter def _distort_common( self, image: tf.Tensor, bboxes: Optional[tf.Tensor] = None ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: """Distorts the image and optionally bounding boxes.""" input_image_type = image.dtype if input_image_type != tf.uint8: image = tf.clip_by_value(image, 0.0, 255.0) image = tf.cast(image, dtype=tf.uint8) replace_value = [128] * 3 min_prob, max_prob = 0.2, 0.8 aug_image = image aug_bboxes = bboxes for _ in range(self.num_layers): op_to_select = tf.random.uniform([], maxval=len(self.available_ops) + 1, dtype=tf.int32) branch_fns = [] for (i, op_name) in enumerate(self.available_ops): prob = tf.random.uniform([], minval=min_prob, maxval=max_prob, dtype=tf.float32) func, _, args = _parse_policy_info(op_name, prob, self.magnitude, replace_value, self.cutout_const, self.translate_const, self.magnitude_std) branch_fns.append(( i, # pylint:disable=g-long-lambda lambda selected_func=func, selected_args=args: selected_func( image, bboxes, *selected_args))) # pylint:enable=g-long-lambda aug_image, aug_bboxes = tf.switch_case( branch_index=op_to_select, branch_fns=branch_fns, default=lambda: (tf.identity(image), _maybe_identity(bboxes))) # pylint: disable=cell-var-from-loop if self.prob_to_apply is not None: aug_image, aug_bboxes = tf.cond( tf.random.uniform(shape=[], dtype=tf.float32) < self.prob_to_apply, lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)), lambda: (tf.identity(image), _maybe_identity(bboxes))) image = aug_image bboxes = aug_bboxes image = tf.cast(image, dtype=input_image_type) return image, bboxes def distort(self, image: tf.Tensor) -> tf.Tensor: """See base class.""" image, _ = self._distort_common(image) return image def distort_with_boxes(self, image: tf.Tensor, bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: """See base class.""" image, bboxes = self._distort_common(image, bboxes) assert bboxes is not None return image, bboxes class RandomErasing(ImageAugment): """Applies RandomErasing to a single image. Reference: https://arxiv.org/abs/1708.04896 Implementation is inspired by https://github.com/rwightman/pytorch-image-models. """ def __init__(self, probability: float = 0.25, min_area: float = 0.02, max_area: float = 1 / 3, min_aspect: float = 0.3, max_aspect: Optional[float] = None, min_count=1, max_count=1, trials=10): """Applies RandomErasing to a single image. Args: probability: Probability of augmenting the image. Defaults to `0.25`. min_area: Minimum area of the random erasing rectangle. Defaults to `0.02`. max_area: Maximum area of the random erasing rectangle. Defaults to `1/3`. min_aspect: Minimum aspect rate of the random erasing rectangle. Defaults to `0.3`. max_aspect: Maximum aspect rate of the random erasing rectangle. Defaults to `None`. min_count: Minimum number of erased rectangles. Defaults to `1`. max_count: Maximum number of erased rectangles. Defaults to `1`. trials: Maximum number of trials to randomly sample a rectangle that fulfills constraint. Defaults to `10`. """ self._probability = probability self._min_area = float(min_area) self._max_area = float(max_area) self._min_log_aspect = math.log(min_aspect) self._max_log_aspect = math.log(max_aspect or 1 / min_aspect) self._min_count = min_count self._max_count = max_count self._trials = trials def distort(self, image: tf.Tensor) -> tf.Tensor: """Applies RandomErasing to single `image`. Args: image (tf.Tensor): Of shape [height, width, 3] representing an image. Returns: tf.Tensor: The augmented version of `image`. """ uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0) mirror_cond = tf.less(uniform_random, self._probability) image = tf.cond(mirror_cond, lambda: self._erase(image), lambda: image) return image @tf.function def _erase(self, image: tf.Tensor) -> tf.Tensor: """Erase an area.""" if self._min_count == self._max_count: count = self._min_count else: count = tf.random.uniform( shape=[], minval=int(self._min_count), maxval=int(self._max_count - self._min_count + 1), dtype=tf.int32) image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] area = tf.cast(image_width * image_height, tf.float32) for _ in range(count): # Work around since break is not supported in tf.function is_trial_successfull = False for _ in range(self._trials): if not is_trial_successfull: erase_area = tf.random.uniform( shape=[], minval=area * self._min_area, maxval=area * self._max_area) aspect_ratio = tf.math.exp( tf.random.uniform( shape=[], minval=self._min_log_aspect, maxval=self._max_log_aspect)) half_height = tf.cast( tf.math.round(tf.math.sqrt(erase_area * aspect_ratio) / 2), dtype=tf.int32) half_width = tf.cast( tf.math.round(tf.math.sqrt(erase_area / aspect_ratio) / 2), dtype=tf.int32) if 2 * half_height < image_height and 2 * half_width < image_width: center_height = tf.random.uniform( shape=[], minval=0, maxval=int(image_height - 2 * half_height), dtype=tf.int32) center_width = tf.random.uniform( shape=[], minval=0, maxval=int(image_width - 2 * half_width), dtype=tf.int32) image = _fill_rectangle( image, center_width, center_height, half_width, half_height, replace=None) is_trial_successfull = True return image class MixupAndCutmix: """Applies Mixup and/or Cutmix to a batch of images. - Mixup: https://arxiv.org/abs/1710.09412 - Cutmix: https://arxiv.org/abs/1905.04899 Implementaion is inspired by https://github.com/rwightman/pytorch-image-models """ def __init__(self, num_classes: int, mixup_alpha: float = .8, cutmix_alpha: float = 1., prob: float = 1.0, switch_prob: float = 0.5, label_smoothing: float = 0.1): """Applies Mixup and/or Cutmix to a batch of images. Args: num_classes (int): Number of classes. mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a beta distribution (for each image). If zero Mixup is deactivated. Defaults to .8. cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from a beta distribution (for each image). If zero Cutmix is deactivated. Defaults to 1.. prob (float, optional): Of augmenting the batch. Defaults to 1.0. switch_prob (float, optional): Probability of applying Cutmix for the batch. Defaults to 0.5. label_smoothing (float, optional): Constant for label smoothing. Defaults to 0.1. """ self.mixup_alpha = mixup_alpha self.cutmix_alpha = cutmix_alpha self.mix_prob = prob self.switch_prob = switch_prob self.label_smoothing = label_smoothing self.num_classes = num_classes self.mode = 'batch' self.mixup_enabled = True if self.mixup_alpha and not self.cutmix_alpha: self.switch_prob = -1 elif not self.mixup_alpha and self.cutmix_alpha: self.switch_prob = 1 def __call__(self, images: tf.Tensor, labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: return self.distort(images, labels) def distort(self, images: tf.Tensor, labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: """Applies Mixup and/or Cutmix to batch of images and transforms labels. Args: images (tf.Tensor): Of shape [batch_size, height, width, 3] representing a batch of image, or [batch_size, time, height, width, 3] representing a batch of video. labels (tf.Tensor): Of shape [batch_size, ] representing the class id for each image of the batch. Returns: Tuple[tf.Tensor, tf.Tensor]: The augmented version of `image` and `labels`. """ labels = tf.reshape(labels, [-1]) augment_cond = tf.less( tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob) # pylint: disable=g-long-lambda augment_a = lambda: self._update_labels(*tf.cond( tf.less( tf.random.uniform(shape=[], minval=0., maxval=1.0), self.switch_prob ), lambda: self._cutmix(images, labels), lambda: self._mixup( images, labels))) augment_b = lambda: (images, self._smooth_labels(labels)) # pylint: enable=g-long-lambda return tf.cond(augment_cond, augment_a, augment_b) @staticmethod def _sample_from_beta(alpha, beta, shape): sample_alpha = tf.random.gamma(shape, 1., beta=alpha) sample_beta = tf.random.gamma(shape, 1., beta=beta) return sample_alpha / (sample_alpha + sample_beta) def _cutmix(self, images: tf.Tensor, labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Applies cutmix.""" lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha, self.cutmix_alpha, tf.shape(labels)) ratio = tf.math.sqrt(1 - lam) batch_size = tf.shape(images)[0] if images.shape.rank == 4: image_height, image_width = tf.shape(images)[1], tf.shape(images)[2] fill_fn = _fill_rectangle elif images.shape.rank == 5: image_height, image_width = tf.shape(images)[2], tf.shape(images)[3] fill_fn = _fill_rectangle_video else: raise ValueError('Bad image rank: {}'.format(images.shape.rank)) cut_height = tf.cast( ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32) cut_width = tf.cast( ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32) random_center_height = tf.random.uniform( shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32) random_center_width = tf.random.uniform( shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32) bbox_area = cut_height * cut_width lam = 1. - bbox_area / (image_height * image_width) lam = tf.cast(lam, dtype=tf.float32) images = tf.map_fn( lambda x: fill_fn(*x), (images, random_center_width, random_center_height, cut_width // 2, cut_height // 2, tf.reverse(images, [0])), dtype=( images.dtype, tf.int32, tf.int32, tf.int32, tf.int32, images.dtype), fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=images.dtype)) return images, labels, lam def _mixup(self, images: tf.Tensor, labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Applies mixup.""" lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha, tf.shape(labels)) if images.shape.rank == 4: lam = tf.reshape(lam, [-1, 1, 1, 1]) elif images.shape.rank == 5: lam = tf.reshape(lam, [-1, 1, 1, 1, 1]) else: raise ValueError('Bad image rank: {}'.format(images.shape.rank)) lam_cast = tf.cast(lam, dtype=images.dtype) images = lam_cast * images + (1. - lam_cast) * tf.reverse(images, [0]) return images, labels, tf.squeeze(lam) def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor: off_value = self.label_smoothing / self.num_classes on_value = 1. - self.label_smoothing + off_value smooth_labels = tf.one_hot( labels, self.num_classes, on_value=on_value, off_value=off_value) return smooth_labels def _update_labels(self, images: tf.Tensor, labels: tf.Tensor, lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: labels_1 = self._smooth_labels(labels) labels_2 = tf.reverse(labels_1, [0]) lam = tf.reshape(lam, [-1, 1]) labels = lam * labels_1 + (1. - lam) * labels_2 return images, labels