# NOTE: scraped page banner, not part of the module: "Spaces: Runtime error".
# Copyright 2023 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Augmentation policies for enhanced image/video preprocessing.

AutoAugment References:
  - AutoAugment: https://arxiv.org/abs/1805.09501
  - AutoAugment for Object Detection: https://arxiv.org/abs/1906.11172
RandAugment Reference: https://arxiv.org/abs/1909.13719
RandomErasing Reference: https://arxiv.org/abs/1708.04896
MixupAndCutmix:
  - Mixup: https://arxiv.org/abs/1710.09412
  - Cutmix: https://arxiv.org/abs/1905.04899

RandomErasing, Mixup and Cutmix are inspired by
https://github.com/rwightman/pytorch-image-models
"""
import inspect | |
import math | |
from typing import Any, List, Iterable, Optional, Tuple, Union | |
import numpy as np | |
import tensorflow as tf, tf_keras | |
# This signifies the max integer that the controller RNN could predict for the | |
# augmentation scheme. | |
_MAX_LEVEL = 10. | |
def to_4d(image: tf.Tensor) -> tf.Tensor:
  """Reshapes a 2-D, 3-D or 4-D tensor into a 4-D tensor.

  A unit axis is prepended when the rank is at most 3 and a unit axis is
  appended when the rank is exactly 2:

    4D image => [N, H, W, C] or [N, C, H, W]
    3D image => [1, H, W, C] or [1, C, H, W]
    2D image => [1, H, W, 1]

  This function does not validate the rank itself; the rank is only inspected
  through TensorFlow ops.

  Args:
    image: The 2/3/4D input tensor.

  Returns:
    A 4D image tensor.
  """
  rank = tf.rank(image)
  # Each flag is 0 or 1 and doubles as the length of a vector of ones that is
  # spliced before / after the original shape.
  leading_axes = tf.cast(rank <= 3, dtype=tf.int32)
  trailing_axes = tf.cast(tf.equal(rank, 2), dtype=tf.int32)
  shape_parts = [
      tf.ones(shape=leading_axes, dtype=tf.int32),
      tf.shape(image),
      tf.ones(shape=trailing_axes, dtype=tf.int32),
  ]
  return tf.reshape(image, tf.concat(shape_parts, axis=0))
def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
  """Reshapes a 4-D image tensor back to rank `ndims`.

  Args:
    image: A 4-D tensor.
    ndims: The target rank. The first axis is dropped when `ndims <= 3` and
      the last axis is additionally dropped when `ndims == 2`.

  Returns:
    `image` reshaped to the selected slice of its 4-D shape.
  """
  full_shape = tf.shape(image)
  drop_leading = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
  drop_trailing = tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
  target_shape = full_shape[drop_leading:4 - drop_trailing]
  return tf.reshape(image, target_shape)
def _pad(
    image: tf.Tensor,
    filter_shape: Union[List[int], Tuple[int, ...]],
    mode: str = 'CONSTANT',
    constant_values: Union[int, tf.Tensor] = 0,
) -> tf.Tensor:
  """Explicitly pads the two middle axes of a 4-D image for a 2-D filter.

  Each spatial axis receives `filter_size - 1` padded values in total. For an
  even-sized filter the extra value goes to the bottom / right side.

  Args:
    image: A 4-D `Tensor` of shape `[batch_size, height, width, channels]`.
    filter_shape: A `tuple`/`list` of 2 integers, the height and width of the
      2-D filter.
    mode: A `string`, one of "REFLECT", "CONSTANT", or "SYMMETRIC" (checked
      case-insensitively) and forwarded to `tf.pad`. See
      https://www.tensorflow.org/api_docs/python/tf/pad.
    constant_values: A scalar, the pad value used in "CONSTANT" mode.

  Returns:
    The padded image.

  Raises:
    ValueError: If `mode` is not one of the supported padding modes.
  """
  supported_modes = ('REFLECT', 'CONSTANT', 'SYMMETRIC')
  if mode.upper() not in supported_modes:
    raise ValueError(
        'padding should be one of "REFLECT", "CONSTANT", or "SYMMETRIC".'
    )
  constant_values = tf.convert_to_tensor(constant_values, image.dtype)

  def _split(size):
    # Returns [before, after] so that before + after == size - 1.
    before = (size - 1) // 2
    return [before, size - 1 - before]

  filter_height, filter_width = filter_shape
  paddings = [[0, 0], _split(filter_height), _split(filter_width), [0, 0]]
  return tf.pad(image, paddings, mode=mode, constant_values=constant_values)
def _get_gaussian_kernel(sigma, filter_shape):
  """Computes a normalized 1-D Gaussian kernel.

  Args:
    sigma: Standard deviation of the Gaussian.
    filter_shape: Integer number of taps in the kernel.

  Returns:
    A 1-D tensor with `filter_shape` entries; the softmax makes it sum to 1.
  """
  sigma = tf.convert_to_tensor(sigma)
  first_tap = -filter_shape // 2 + 1
  end_tap = filter_shape // 2 + 1
  taps = tf.range(first_tap, end_tap)
  squared_taps = tf.cast(taps**2, sigma.dtype)
  # softmax(-x^2 / (2 sigma^2)) == exp(-x^2 / (2 sigma^2)) / sum(exp(...)).
  return tf.nn.softmax(-squared_taps / (2.0 * (sigma**2)))
def _get_gaussian_kernel_2d(gaussian_filter_x, gaussian_filter_y):
  """Builds a 2-D kernel as the matrix product of the two given kernels.

  Args:
    gaussian_filter_x: Left operand of the matrix product.
    gaussian_filter_y: Right operand of the matrix product.

  Returns:
    `tf.matmul(gaussian_filter_x, gaussian_filter_y)`.
  """
  return tf.matmul(gaussian_filter_x, gaussian_filter_y)
def _normalize_tuple(value, n, name): | |
"""Transforms an integer or iterable of integers into an integer tuple. | |
Args: | |
value: The value to validate and convert. Could an int, or any iterable of | |
ints. | |
n: The size of the tuple to be returned. | |
name: The name of the argument being validated, e.g. "strides" or | |
"kernel_size". This is only used to format error messages. | |
Returns: | |
A tuple of n integers. | |
Raises: | |
ValueError: If something else than an int/long or iterable thereof was | |
passed. | |
""" | |
if isinstance(value, int): | |
return (value,) * n | |
else: | |
try: | |
value_tuple = tuple(value) | |
except TypeError as exc: | |
raise TypeError( | |
f'The {name} argument must be a tuple of {n} integers. ' | |
f'Received: {value}' | |
) from exc | |
if len(value_tuple) != n: | |
raise ValueError( | |
f'The {name} argument must be a tuple of {n} integers. ' | |
f'Received: {value}' | |
) | |
for single_value in value_tuple: | |
try: | |
int(single_value) | |
except (ValueError, TypeError) as exc: | |
raise ValueError( | |
f'The {name} argument must be a tuple of {n} integers. Received:' | |
f' {value} including element {single_value} of type' | |
f' {type(single_value)}.' | |
) from exc | |
return value_tuple | |
def gaussian_filter2d(
    image: tf.Tensor,
    filter_shape: Union[List[int], Tuple[int, ...], int],
    sigma: Union[List[float], Tuple[float], float] = 1.0,
    padding: str = 'REFLECT',
    constant_values: Union[int, tf.Tensor] = 0,
    name: Optional[str] = None,
) -> tf.Tensor:
  """Applies a Gaussian blur to image(s).

  Args:
    image: Either a 2-D `Tensor` of shape `[height, width]`, a 3-D `Tensor` of
      shape `[height, width, channels]`, or a 4-D `Tensor` of shape
      `[batch_size, height, width, channels]`.
    filter_shape: An `integer` or `tuple`/`list` of 2 integers giving the
      height and width of the 2-D Gaussian filter. A single integer is used
      for both spatial dimensions.
    sigma: A `float` or `tuple`/`list` of 2 floats; `sigma[0]` is used with
      `filter_shape[0]` and `sigma[1]` with `filter_shape[1]`. A single float
      is used for both.
    padding: A `string`, one of "REFLECT", "CONSTANT", or "SYMMETRIC",
      compatible with the `mode` argument of `tf.pad`. See
      https://www.tensorflow.org/api_docs/python/tf/pad.
    constant_values: A `scalar`, the pad value used in "CONSTANT" padding
      mode.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the rank and dtype of the input.

  Raises:
    ValueError: If `sigma` is a list/tuple whose length is not 2, if any
      `sigma` is negative, if `padding` is not a supported mode, or if
      `filter_shape` does not have exactly 2 valid entries.
    TypeError: If `filter_shape` is neither an int nor iterable.
  """
  with tf.name_scope(name or 'gaussian_filter2d'):
    if not isinstance(sigma, (list, tuple)):
      sigma = (sigma,) * 2
    elif len(sigma) != 2:
      raise ValueError('sigma should be a float or a tuple/list of 2 floats')
    if any(s < 0 for s in sigma):
      raise ValueError('sigma should be greater than or equal to 0.')

    image = tf.convert_to_tensor(image, name='image')
    sigma = tf.convert_to_tensor(sigma, name='sigma')
    input_rank = tf.rank(image)
    image = to_4d(image)

    # Floating inputs keep their precision; everything else is computed in
    # float32 and cast back at the end.
    input_dtype = image.dtype
    if not input_dtype.is_floating:
      image = tf.cast(image, tf.float32)

    num_channels = tf.shape(image)[3]
    filter_shape = _normalize_tuple(filter_shape, 2, 'filter_shape')
    sigma = tf.cast(sigma, image.dtype)

    # Row vector along the width and column vector along the height; their
    # product is the separable 2-D kernel.
    row_kernel = _get_gaussian_kernel(sigma[1], filter_shape[1])
    row_kernel = row_kernel[tf.newaxis, :]
    column_kernel = _get_gaussian_kernel(sigma[0], filter_shape[0])
    column_kernel = column_kernel[:, tf.newaxis]
    kernel = _get_gaussian_kernel_2d(column_kernel, row_kernel)

    # The same kernel is applied to every channel independently.
    kernel = kernel[:, :, tf.newaxis, tf.newaxis]
    kernel = tf.tile(kernel, [1, 1, num_channels, 1])

    padded = _pad(
        image, filter_shape, mode=padding, constant_values=constant_values
    )
    blurred = tf.nn.depthwise_conv2d(
        input=padded,
        filter=kernel,
        strides=(1, 1, 1, 1),
        padding='VALID',
    )
    return tf.cast(from_4d(blurred, input_rank), input_dtype)
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
  """Builds flattened projective transforms for the given translations.

  Each row encodes the matrix

     [[1 0 -dx]
      [0 1 -dy]
      [0 0 1]]

  as the 8 values `[1, 0, -dx, 0, 1, -dy, 0, 0]`.

  Args:
    translations: The 2-element list representing [dx, dy], or a matrix of
      2-element lists representing [dx dy] to translate for each image. The
      rank must be static.

  Returns:
    The transformation matrix of shape (num_images, 8).

  Raises:
    TypeError: If the rank of `translations` is unknown or is not 1 or 2.
  """
  translations = tf.convert_to_tensor(translations, dtype=tf.float32)
  static_rank = translations.get_shape().ndims
  if static_rank is None:
    raise TypeError('translations rank must be statically known')
  if static_rank == 1:
    # A single [dx, dy] pair becomes a batch of one.
    translations = translations[None]
  elif static_rank != 2:
    raise TypeError('translations should have rank 1 or 2.')

  count = tf.shape(translations)[0]
  one_column = tf.ones((count, 1), tf.dtypes.float32)
  zero_column = tf.zeros((count, 1), tf.dtypes.float32)
  columns = [
      one_column,
      zero_column,
      -translations[:, 0, None],
      zero_column,
      one_column,
      -translations[:, 1, None],
      tf.zeros((count, 2), tf.dtypes.float32),
  ]
  return tf.concat(values=columns, axis=1)
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
                                 image_height: tf.Tensor) -> tf.Tensor:
  """Builds flattened projective transforms that rotate about the center.

  Args:
    angles: A scalar angle to rotate all images by, or a vector of angles, one
      per image. The trigonometric ops treat the values as radians.
    image_width: The width of the image(s) to be transformed.
    image_height: The height of the image(s) to be transformed.

  Returns:
    A tensor of shape (num_images, 8).

  Raises:
    TypeError: If `angles` is not rank 0 or 1.
  """
  angles = tf.convert_to_tensor(angles, dtype=tf.float32)
  angle_rank = len(angles.get_shape())
  if angle_rank == 0:
    angles = angles[None]
  elif angle_rank != 1:
    raise TypeError('Angles should have a rank 0 or 1.')

  cos_a = tf.math.cos(angles)
  sin_a = tf.math.sin(angles)
  last_x = image_width - 1
  last_y = image_height - 1
  # Offsets that keep the image center fixed under the rotation.
  x_offset = (last_x - (cos_a * last_x - sin_a * last_y)) / 2.0
  y_offset = (last_y - (sin_a * last_x + cos_a * last_y)) / 2.0

  num_angles = tf.shape(angles)[0]
  columns = [
      cos_a[:, None],
      -sin_a[:, None],
      x_offset[:, None],
      sin_a[:, None],
      cos_a[:, None],
      y_offset[:, None],
      tf.zeros((num_angles, 2), tf.dtypes.float32),
  ]
  return tf.concat(values=columns, axis=1)
def _apply_transform_to_images(
    images,
    transforms,
    fill_mode='reflect',
    fill_value=0.0,
    interpolation='bilinear',
    output_shape=None,
    name=None,
):
  """Applies the given projective transform(s) to the image(s).

  Fill mode behavior for each valid value is as follows:

  - `"reflect"`: `(d c b a | a b c d | d c b a)` The input is extended by
    reflecting about the edge of the last pixel.
  - `"constant"`: `(k k k k | a b c d | k k k k)` The input is extended by
    filling all values beyond the edge with the constant `fill_value`.
  - `"wrap"`: `(a b c d | a b c d | a b c d)` The input is extended by
    wrapping around to the opposite edge.
  - `"nearest"`: `(a a a a | a b c d | d d d d)` The input is extended by
    the nearest pixel.

  Args:
    images: A tensor of shape `(num_images, num_rows, num_columns,
      num_channels)` (NHWC). The rank must be statically known (the shape is
      not `TensorShape(None)`).
    transforms: Projective transform matrix/matrices. A vector of length 8 or
      tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1,
      b2, c0, c1], then it maps the *output* point `(x, y)` to a transformed
      *input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) /
      k)`, where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared
      to the transform mapping input points to output points. Note that
      gradients are not backpropagated into transformation parameters.
    fill_mode: Points outside the boundaries of the input are filled according
      to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`).
    fill_value: A float, the value used outside the boundaries when
      `fill_mode="constant"`.
    interpolation: Interpolation mode. Supported values: `"nearest"`,
      `"bilinear"`.
    output_shape: Output dimension after the transform, `[height, width]`. If
      `None`, output is the same size as the input image.
    name: The name of the op.

  Returns:
    The transformed image(s): a 4D tensor `(samples, height, width, channels)`
    in `"channels_last"` format, with out-of-bounds points filled according to
    `fill_mode`.

  Raises:
    ValueError: If `output_shape` is not compatible with a 1-D tensor of 2
      elements.
  """
  with tf.name_scope(name or 'transform'):
    if output_shape is None:
      output_shape = tf.shape(images)[1:3]
      if not tf.executing_eagerly():
        # Prefer the statically known size when building a graph.
        static_shape = tf.get_static_value(output_shape)
        output_shape = output_shape if static_shape is None else static_shape

    output_shape = tf.convert_to_tensor(
        output_shape, tf.int32, name='output_shape'
    )
    if not output_shape.get_shape().is_compatible_with([2]):
      raise ValueError(
          'output_shape must be a 1-D Tensor of 2 elements: '
          'new_height, new_width, instead got '
          f'output_shape={output_shape}'
      )

    fill_value = tf.convert_to_tensor(fill_value, tf.float32, name='fill_value')

    # The raw op expects upper-case mode names.
    return tf.raw_ops.ImageProjectiveTransformV3(
        images=images,
        output_shape=output_shape,
        fill_value=fill_value,
        transforms=transforms,
        fill_mode=fill_mode.upper(),
        interpolation=interpolation.upper(),
    )
def transform(
    image: tf.Tensor,
    transforms: Any,
    interpolation: str = 'nearest',
    output_shape=None,
    fill_mode: str = 'reflect',
    fill_value: float = 0.0,
) -> tf.Tensor:
  """Applies projective transform(s) to an image of rank 2, 3 or 4.

  Args:
    image: The image tensor; it is expanded to 4-D for the op and reshaped
      back to its original rank afterwards.
    transforms: A vector of 8 values or an N x 8 matrix describing the
      projective transform(s). A rank-1 input is treated as a batch of one.
    interpolation: Interpolation mode, `"nearest"` or `"bilinear"`.
    output_shape: Optional `[height, width]` of the output.
    fill_mode: How points outside the input boundaries are filled.
    fill_value: The fill value used when `fill_mode="constant"`.

  Returns:
    The transformed image with the rank of the input.
  """
  input_rank = tf.rank(image)
  transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
  if transforms.shape.rank == 1:
    transforms = transforms[None]
  transformed = _apply_transform_to_images(
      images=to_4d(image),
      transforms=transforms,
      interpolation=interpolation,
      fill_mode=fill_mode,
      fill_value=fill_value,
      output_shape=output_shape,
  )
  return from_4d(transformed, input_rank)
def translate(
    image: tf.Tensor,
    translations,
    fill_value: float = 0.0,
    fill_mode: str = 'reflect',
    interpolation: str = 'nearest',
) -> tf.Tensor:
  """Translates image(s) by the provided vectors.

  Args:
    image: An image Tensor of type uint8.
    translations: A vector or matrix representing [dx dy].
    fill_value: A float, the value used outside the boundaries when
      `fill_mode="constant"`.
    fill_mode: Points outside the boundaries of the input are filled according
      to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`).
    interpolation: Interpolation mode. Supported values: `"nearest"`,
      `"bilinear"`.

  Returns:
    The translated version of the image.
  """
  translation_transforms = _convert_translation_to_transform(translations)  # pytype: disable=wrong-arg-types  # always-use-return-annotations
  return transform(
      image,
      transforms=translation_transforms,
      interpolation=interpolation,
      fill_value=fill_value,
      fill_mode=fill_mode,
  )
def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
  """Rotates the image by `degrees` about its center.

  Args:
    image: An image Tensor of type uint8.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.

  Returns:
    The rotated version of image, with the rank of the input.
  """
  radians = tf.cast(degrees * (math.pi / 180.0), tf.float32)

  input_rank = tf.rank(image)
  batched = to_4d(image)
  batched_shape = tf.shape(batched)
  height = tf.cast(batched_shape[1], tf.float32)
  width = tf.cast(batched_shape[2], tf.float32)

  rotation = _convert_angles_to_transform(
      angles=radians, image_width=width, image_height=height)
  # Randomly negating the angle, if wanted, is the caller's job and is done on
  # `degrees` before this function is called.
  rotated = transform(batched, transforms=rotation)
  return from_4d(rotated, input_rank)
def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
  """Blends `image1` and `image2` using `factor`.

  A factor of 0.0 returns `image1` and a factor of 1.0 returns `image2`.
  Values in between interpolate linearly between the two images. Any other
  factor extrapolates the difference between the images, and the result is
  clipped to [0, 255].

  Args:
    image1: An image Tensor of type uint8.
    image2: An image Tensor of type uint8.
    factor: A floating point value above 0.0.

  Returns:
    A blended image Tensor of type uint8 (for factors 0.0 and 1.0 the chosen
    input is returned as a tensor without a cast).
  """
  if factor == 0.0:
    return tf.convert_to_tensor(image1)
  if factor == 1.0:
    return tf.convert_to_tensor(image2)

  # All arithmetic happens in float32.
  image1 = tf.cast(image1, tf.float32)
  image2 = tf.cast(image2, tf.float32)
  blended = image1 + factor * (image2 - image1)

  if 0.0 < factor < 1.0:
    # Interpolation of values in [0, 255] stays in range, so no clipping.
    return tf.cast(blended, tf.uint8)

  # Extrapolation can leave the range: clip first, then cast.
  return tf.cast(tf.clip_by_value(blended, 0.0, 255.0), tf.uint8)
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
  """Applies cutout (https://arxiv.org/abs/1708.04552) to image.

  For a rank-3 image, a rectangle extending `pad_size` pixels on each side of
  a random center (so up to 2*pad_size x 2*pad_size, cropped at the borders)
  is filled with `replace`. The center is sampled uniformly over the image.

  A rank-4 input is forwarded to `cutout_video`; `pad_size` is not used on
  that path.

  Args:
    image: An image Tensor of type uint8.
    pad_size: Half of the side length of the filled square.
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    An image Tensor with the cutout applied.

  Raises:
    ValueError: If the static rank of `image` is neither 3 nor 4.
  """
  rank = image.shape.rank
  if rank not in [3, 4]:
    raise ValueError('Bad image rank: {}'.format(image.shape.rank))
  if rank == 4:
    return cutout_video(image, replace=replace)

  height = tf.shape(image)[0]
  width = tf.shape(image)[1]

  # Sample the center of the filled rectangle (row first, then column).
  center_row = tf.random.uniform(
      shape=[], minval=0, maxval=height, dtype=tf.int32)
  center_column = tf.random.uniform(
      shape=[], minval=0, maxval=width, dtype=tf.int32)

  return _fill_rectangle(image, center_column, center_row, pad_size, pad_size,
                         replace)
def _fill_rectangle(image,
                    center_width,
                    center_height,
                    half_width,
                    half_height,
                    replace=None):
  """Fills a rectangle of a `[height, width, channels]` image.

  The rectangle spans `half_height` rows and `half_width` columns on each side
  of the center and is cropped at the image borders.

  Args:
    image: A rank-3 image tensor of shape `[height, width, channels]`.
    center_width: Column index of the rectangle center.
    center_height: Row index of the rectangle center.
    half_width: Half of the rectangle width.
    half_height: Half of the rectangle height.
    replace: Fill content. `None` fills with `tf.random.normal` values of the
      image dtype, a `tf.Tensor` is used as given, and any other value is
      broadcast as a constant.

  Returns:
    The image with the rectangle replaced by the fill content.
  """
  image_shape = tf.shape(image)
  image_height = image_shape[0]
  image_width = image_shape[1]
  # Use the image's own channel count instead of assuming RGB so that
  # single-channel or multi-spectral images keep their shape.
  num_channels = image_shape[2]

  lower_pad = tf.maximum(0, center_height - half_height)
  upper_pad = tf.maximum(0, image_height - center_height - half_height)
  left_pad = tf.maximum(0, center_width - half_width)
  right_pad = tf.maximum(0, image_width - center_width - half_width)

  cutout_shape = [
      image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad)
  ]
  padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
  # Zeros mark the rectangle; the surrounding padding is filled with ones.
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, num_channels])

  if replace is None:
    fill = tf.random.normal(tf.shape(image), dtype=image.dtype)
  elif isinstance(replace, tf.Tensor):
    fill = replace
  else:
    fill = tf.ones_like(image, dtype=image.dtype) * replace
  image = tf.where(tf.equal(mask, 0), fill, image)

  return image
def _fill_rectangle_video(image,
                          center_width,
                          center_height,
                          half_width,
                          half_height,
                          replace=None):
  """Fills the same rectangle in every frame of a video.

  The rectangle spans `half_height` rows and `half_width` columns on each side
  of the center and is cropped at the frame borders.

  Args:
    image: A rank-4 video tensor of shape `[time, height, width, channels]`.
    center_width: Column index of the rectangle center.
    center_height: Row index of the rectangle center.
    half_width: Half of the rectangle width.
    half_height: Half of the rectangle height.
    replace: Fill content. `None` fills with `tf.random.normal` values of the
      video dtype, a `tf.Tensor` is used as given, and any other value is
      broadcast as a constant.

  Returns:
    The video with the rectangle replaced by the fill content in all frames.
  """
  image_shape = tf.shape(image)
  image_time = image_shape[0]
  image_height = image_shape[1]
  image_width = image_shape[2]
  # Use the video's own channel count instead of assuming RGB so that
  # non-3-channel inputs keep their shape.
  num_channels = image_shape[3]

  lower_pad = tf.maximum(0, center_height - half_height)
  upper_pad = tf.maximum(0, image_height - center_height - half_height)
  left_pad = tf.maximum(0, center_width - half_width)
  right_pad = tf.maximum(0, image_width - center_width - half_width)

  cutout_shape = [
      image_time, image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad)
  ]
  # The time axis is never padded: the rectangle covers every frame.
  padding_dims = [[0, 0], [lower_pad, upper_pad], [left_pad, right_pad]]
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 1, num_channels])

  if replace is None:
    fill = tf.random.normal(tf.shape(image), dtype=image.dtype)
  elif isinstance(replace, tf.Tensor):
    fill = replace
  else:
    fill = tf.ones_like(image, dtype=image.dtype) * replace
  image = tf.where(tf.equal(mask, 0), fill, image)

  return image
def cutout_video(
    video: tf.Tensor,
    mask_shape: Optional[tf.Tensor] = None,
    replace: int = 0,
) -> tf.Tensor:
  """Applies cutout (https://arxiv.org/abs/1708.04552) to a video.

  A 3D box centered at a uniformly sampled (frame, row, column) location is
  filled with `replace`. The box extends a "half size" on each side of the
  center along every axis and is cropped at the video borders.

  When `mask_shape` is given, the half sizes are `max(1, mask_shape // 2)`.
  Otherwise they are sampled uniformly: height and width from
  `[size / 4, size / 2)` and depth from `[1, depth / 4)`, with lower bounds
  that keep each range non-empty.

  Args:
    video: A video Tensor of shape [T, H, W, C].
    mask_shape: An optional integer tensor that specifies the depth, height and
      width of the mask to cut. If it is not set, the shape is randomly sampled
      as described above. Odd dimensions are rounded down when halved.
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    A video Tensor with cutout applied.
  """
  tf.debugging.assert_shapes([
      (video, ('T', 'H', 'W', 'C')),
  ])

  video_shape = tf.shape(video)
  depth = video_shape[0]
  height = video_shape[1]
  width = video_shape[2]

  # Sample the center of the box (height, width, depth - in that order).
  center_row = tf.random.uniform(
      shape=[], minval=0, maxval=height, dtype=tf.int32
  )
  center_column = tf.random.uniform(
      shape=[], minval=0, maxval=width, dtype=tf.int32
  )
  center_frame = tf.random.uniform(
      shape=[], minval=0, maxval=depth, dtype=tf.int32
  )

  if mask_shape is not None:
    half_sizes = tf.maximum(1, mask_shape // 2)
    half_depth = half_sizes[0]
    half_height = half_sizes[1]
    half_width = half_sizes[2]
  else:
    half_height = tf.random.uniform(
        shape=[],
        minval=tf.maximum(1, tf.cast(height / 4, tf.int32)),
        maxval=tf.maximum(2, tf.cast(height / 2, tf.int32)),
        dtype=tf.int32,
    )
    half_width = tf.random.uniform(
        shape=[],
        minval=tf.maximum(1, tf.cast(width / 4, tf.int32)),
        maxval=tf.maximum(2, tf.cast(width / 2, tf.int32)),
        dtype=tf.int32,
    )
    half_depth = tf.random.uniform(
        shape=[],
        minval=1,
        maxval=tf.maximum(2, tf.cast(depth / 4, tf.int32)),
        dtype=tf.int32,
    )

  def _margins(center, half_size, size):
    # Untouched extent before and after the box along one axis.
    before = tf.maximum(0, center - half_size)
    after = tf.maximum(0, size - center - half_size)
    return before, after

  lower_pad, upper_pad = _margins(center_row, half_height, height)
  left_pad, right_pad = _margins(center_column, half_width, width)
  back_pad, forward_pad = _margins(center_frame, half_depth, depth)

  box_shape = [
      depth - (back_pad + forward_pad),
      height - (lower_pad + upper_pad),
      width - (left_pad + right_pad),
  ]
  margins = [
      [back_pad, forward_pad],
      [lower_pad, upper_pad],
      [left_pad, right_pad],
  ]
  # Zeros mark the box; everything around it is padded with ones.
  mask = tf.pad(
      tf.zeros(box_shape, dtype=video.dtype), margins, constant_values=1
  )
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 1, tf.shape(video)[-1]])

  fill = tf.ones_like(video, dtype=video.dtype) * replace
  return tf.where(tf.equal(mask, 0), fill, video)
def gaussian_noise(
    image: tf.Tensor, low: float = 0.1, high: float = 2.0) -> tf.Tensor:
  """Blurs image(s) with a 3x3 Gaussian filter of random strength.

  Despite its name, this function calls `gaussian_filter2d`; it does not add
  noise. The filter's sigma is drawn with `np.random.uniform(low, high)` each
  time this Python function runs.

  Args:
    image: The image tensor to blur.
    low: Lower bound of the sampled sigma.
    high: Upper bound of the sampled sigma.

  Returns:
    The blurred image(s).
  """
  sigma = np.random.uniform(low=low, high=high)
  return gaussian_filter2d(image, filter_shape=[3, 3], sigma=sigma)
def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
  """Solarizes the input image(s).

  Pixels below `threshold` are kept; every other pixel `p` becomes `255 - p`.

  Args:
    image: The image tensor.
    threshold: Pixels at or above this value are inverted.

  Returns:
    The solarized image(s).
  """
  keep_original = image < threshold
  inverted = 255 - image
  return tf.where(keep_original, image, inverted)
def solarize_add(image: tf.Tensor,
                 addition: int = 0,
                 threshold: int = 128) -> tf.Tensor:
  """Applies additive solarize to the input image(s).

  Pixels below `threshold` get `addition` added and are clipped to [0, 255];
  all other pixels are left unchanged.

  Args:
    image: The image tensor.
    addition: The amount to add; the original comment gives its range as -128
      to 128.
    threshold: Only pixels below this value are modified.

  Returns:
    The image(s) with additive solarize applied.
  """
  # Add in int64 so the sum can leave the uint8 range before clipping.
  shifted = tf.cast(image, tf.int64) + addition
  shifted = tf.clip_by_value(shifted, 0, 255)
  shifted = tf.cast(shifted, tf.uint8)
  return tf.where(image < threshold, shifted, image)
def grayscale(image: tf.Tensor) -> tf.Tensor:
  """Converts an RGB image to grayscale, kept in 3-channel RGB layout."""
  single_channel = tf.image.rgb_to_grayscale(image)
  return tf.image.grayscale_to_rgb(single_channel)
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Color: blends the grayscale image with the original."""
  return blend(grayscale(image), image, factor)
def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Contrast.

  The degenerate image is a constant gray image whose value is the mean of
  the grayscale version of `image`; it is blended with the original using
  `factor`.

  Args:
    image: An RGB image tensor of type uint8.
    factor: The blend factor passed to `blend`; 0.0 yields the constant gray
      image and 1.0 the original image.

  Returns:
    The contrast-adjusted image.
  """
  gray = tf.image.rgb_to_grayscale(image)
  # The mean must be taken over the pixel values. Summing a histogram only
  # yields the pixel count, which made the previous "mean" independent of the
  # image content.
  mean = tf.reduce_mean(tf.cast(gray, tf.float32))
  # PIL rounds the mean to the nearest integer (`int(mean + 0.5)`).
  mean = tf.clip_by_value(tf.math.floor(mean + 0.5), 0.0, 255.0)
  degenerate = tf.ones_like(gray, dtype=tf.float32) * mean
  degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
  return blend(degenerate, image, factor)
def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Brightness: blends an all-zero image with `image`."""
  black = tf.zeros_like(image)
  return blend(black, image, factor)
def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
  """Equivalent of PIL Posterize: zeroes the lowest `8 - bits` bits."""
  dropped_bits = 8 - bits
  # Shifting right then left by the same amount clears the low bits.
  truncated = tf.bitwise.right_shift(image, dropped_bits)
  return tf.bitwise.left_shift(truncated, dropped_bits)
def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor:
  """Rotates `wrap(image)` by `degrees` and passes it to `unwrap`."""
  wrapped = wrap(image)
  rotated = rotate(wrapped, degrees=degrees)
  return unwrap(rotated, replace)
def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
  """Equivalent of PIL Translate in X dimension."""
  wrapped = wrap(image)
  shifted = translate(wrapped, [-pixels, 0])
  return unwrap(shifted, replace)
def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
  """Equivalent of PIL Translate in Y dimension."""
  wrapped = wrap(image)
  shifted = translate(wrapped, [0, -pixels])
  return unwrap(shifted, replace)
def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
  """Equivalent of PIL Shearing in X dimension."""
  # A shear parallel to the x axis is the projective transform
  #   [1  level
  #    0  1],
  # written here in the flattened 8-value form.
  shear_transform = [1., level, 0., 0., 1., 0., 0., 0.]
  sheared = transform(image=wrap(image), transforms=shear_transform)
  return unwrap(sheared, replace)
def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
  """Equivalent of PIL Shearing in Y dimension."""
  # A shear parallel to the y axis is the projective transform
  #   [1      0
  #    level  1],
  # written here in the flattened 8-value form.
  shear_transform = [1., 0., 0., level, 1., 0., 0., 0.]
  sheared = transform(image=wrap(image), transforms=shear_transform)
  return unwrap(sheared, replace)
def autocontrast(image: tf.Tensor) -> tf.Tensor:
  """Implements Autocontrast function from PIL using TF ops.

  Each of the first three channels is stretched independently so that its
  smallest value maps to 0 and its largest to 255. A channel whose values are
  all equal is returned unchanged.

  Args:
    image: A 3D uint8 tensor.

  Returns:
    The image after it has had autocontrast applied to it and will be of type
    uint8.
  """

  def _stretch_channel(channel: tf.Tensor) -> tf.Tensor:
    """Rescales one channel to span the full [0, 255] range."""
    # A possibly cheaper version could use cumsum/unique_with_counts over the
    # histogram values instead of reducing over the entire image.
    lowest = tf.cast(tf.reduce_min(channel), tf.float32)
    highest = tf.cast(tf.reduce_max(channel), tf.float32)

    def _rescaled():
      scale = 255.0 / (highest - lowest)
      offset = -lowest * scale
      values = tf.cast(channel, tf.float32) * scale + offset
      return tf.cast(tf.clip_by_value(values, 0.0, 255.0), tf.uint8)

    # Only rescale when the channel is not constant (avoids dividing by 0).
    return tf.cond(highest > lowest, _rescaled, lambda: channel)

  # Assumes RGB for now: the first three channels are scaled independently
  # and stacked back together.
  stretched = [_stretch_channel(image[..., index]) for index in range(3)]
  return tf.stack(stretched, -1)
def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Implements PIL's Sharpness with TF ops.

  A smoothed ("degenerate") version of the image is produced with PIL's
  SMOOTH kernel and then blended with the original through `blend`.

  Args:
    image: A rank-3 or rank-4 image tensor whose last dimension holds three
      channels (the kernel tiling / channel split below assume 3).
    factor: Blend factor forwarded to `blend`.

  Returns:
    The result of `blend(smoothed, image, factor)`.

  Raises:
    ValueError: If `image` is neither rank 3 nor rank 4.
  """
  orig_image = image
  rank = orig_image.shape.rank
  # Validate against the rank of the *input*; after the `expand_dims` below
  # the working tensor has one extra dimension, which previously made the
  # error message report a rank that was off by one.
  if rank not in (3, 4):
    raise ValueError('Bad image rank: {}'.format(rank))

  image = tf.cast(image, tf.float32)
  # Add a leading dimension so the conv ops receive the layout they expect
  # (4D for depthwise_conv2d, 5D for conv3d).
  image = tf.expand_dims(image, 0)
  # SMOOTH PIL Kernel.
  if rank == 3:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[3, 3, 1, 1]) / 13.
    # Tile across channel dimension.
    kernel = tf.tile(kernel, [1, 1, 3, 1])
    strides = [1, 1, 1, 1]
    degenerate = tf.nn.depthwise_conv2d(
        image, kernel, strides, padding='VALID', dilations=[1, 1])
  else:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[1, 3, 3, 1, 1]) / 13.
    strides = [1, 1, 1, 1, 1]
    # Run the kernel across each channel.
    channels = tf.split(image, 3, axis=-1)
    degenerates = [
        tf.nn.conv3d(channel, kernel, strides, padding='VALID',
                     dilations=[1, 1, 1, 1, 1])
        for channel in channels
    ]
    degenerate = tf.concat(degenerates, -1)
  degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
  degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])

  # The VALID convolution drops a one-pixel border. Pad both the result and
  # a mask of ones back to the input size; wherever the padded mask is not 1
  # (the border) the original pixel is used instead.
  mask = tf.ones_like(degenerate)
  paddings = [[0, 0]] * (rank - 3)
  padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]])
  padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]])
  result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)

  # Blend the final result.
  return blend(result, orig_image, factor)
def equalize(image: tf.Tensor) -> tf.Tensor:
  """Implements PIL's Equalize with TF ops.

  Each of the first three channels is histogram-equalized independently and
  the results are stacked back along the last axis.

  Args:
    image: An image tensor whose last dimension holds at least three
      channels with values in [0, 255].

  Returns:
    The equalized image, cast to uint8.
  """

  def _equalize_channel(im, channel_index):
    """Equalizes the channel `channel_index` of `im`."""
    values = tf.cast(im[..., channel_index], tf.int32)
    # 256-bin histogram of the channel.
    histogram = tf.histogram_fixed_width(values, [0, 255], nbins=256)

    # The step is derived from the non-empty bins only: the total count
    # minus the count of the last non-empty bin, divided by 255.
    occupied_bins = tf.where(tf.not_equal(histogram, 0))
    occupied_counts = tf.reshape(tf.gather(histogram, occupied_bins), [-1])
    step = (tf.reduce_sum(occupied_counts) - occupied_counts[-1]) // 255

    def _lookup_table():
      # Cumulative sum shifted by step // 2 and normalized by step ...
      table = (tf.cumsum(histogram) + (step // 2)) // step
      # ... then shifted by one entry, with a 0 prepended ...
      table = tf.concat([[0], table[:-1]], 0)
      # ... and clipped into range, as the C code for image.point does.
      return tf.clip_by_value(table, 0, 255)

    # A zero step leaves the channel unchanged; otherwise every value is
    # looked up in the table built from the full histogram.
    remapped = tf.cond(
        tf.equal(step, 0),
        lambda: values,
        lambda: tf.gather(_lookup_table(), values))
    return tf.cast(remapped, tf.uint8)

  # RGB is assumed for now.
  equalized_channels = [_equalize_channel(image, c) for c in range(3)]
  return tf.stack(equalized_channels, -1)
def invert(image: tf.Tensor) -> tf.Tensor:
  """Inverts the image by subtracting every pixel value from 255.

  Args:
    image: The image; it is converted to a tensor first.

  Returns:
    `255 - image` as a tensor.
  """
  pixels = tf.convert_to_tensor(image)
  return 255 - pixels
def wrap(image: tf.Tensor) -> tf.Tensor:
  """Appends an extra all-ones channel to `image`.

  Args:
    image: The image tensor; the new channel is added along its last axis
      and shares its dtype.

  Returns:
    `image` with one additional trailing channel filled with 1s.
  """
  ones_plane = tf.ones(tf.shape(image)[:-1], image.dtype)
  ones_channel = tf.expand_dims(ones_plane, -1)
  return tf.concat([image, ones_channel], axis=-1)
def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
  """Unwraps an image produced by `wrap`.

  Operations such as translate and shear applied to a wrapped tensor leave
  0s in the positions they empty, including in the extra channel. Every
  position whose extra channel (index 3) is 0 has its values overwritten
  with `replace`, so that those empty pixels take a chosen fill value
  instead of pure black. The extra channel is then dropped.

  Args:
    image: A 3D image tensor with 4 channels.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    A 3D image tensor with 3 channels.
  """
  wrapped_shape = tf.shape(image)
  # Collapse all leading dimensions so each row is one pixel.
  pixels = tf.reshape(image, [-1, wrapped_shape[-1]])

  # Channel 3 is the marker added by `wrap`.
  coverage = tf.expand_dims(pixels[..., 3], axis=-1)
  # Extend the fill value with a 1 for the marker channel itself.
  fill = tf.concat([replace, tf.ones([1], image.dtype)], 0)

  # Overwrite the pixels whose marker is zero with the fill value.
  pixels = tf.where(
      tf.equal(coverage, 0),
      tf.ones_like(pixels, dtype=image.dtype) * fill,
      pixels)

  restored = tf.reshape(pixels, wrapped_shape)
  # Keep everything except the marker channel.
  begin = [0] * restored.shape.rank
  size = tf.concat([wrapped_shape[:-1], [3]], -1)
  return tf.slice(restored, begin, size)
def _scale_bbox_only_op_probability(prob): | |
"""Reduce the probability of the bbox-only operation. | |
Probability is reduced so that we do not distort the content of too many | |
bounding boxes that are close to each other. The value of 3.0 was a chosen | |
hyper parameter when designing the autoaugment algorithm that we found | |
empirically to work well. | |
Args: | |
prob: Float that is the probability of applying the bbox-only operation. | |
Returns: | |
Reduced probability. | |
""" | |
return prob / 3.0 | |
def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
  """Applies `augmentation_func` to the part of `image` covered by `bbox`.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and
      1.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A modified version of image, where the bbox location in the image has
    had `augmentation_func` applied to it.
  """
  height_f = tf.cast(tf.shape(image)[0], tf.float32)
  width_f = tf.cast(tf.shape(image)[1], tf.float32)

  # Normalized coordinates -> integer pixel positions.
  top = tf.cast(height_f * bbox[0], tf.int32)
  left = tf.cast(width_f * bbox[1], tf.int32)
  bottom = tf.cast(height_f * bbox[2], tf.int32)
  right = tf.cast(width_f * bbox[3], tf.int32)

  height = tf.cast(height_f, tf.int32)
  width = tf.cast(width_f, tf.int32)

  # Keep the max coordinates on the last valid row / column.
  bottom = tf.minimum(bottom, height - 1)
  right = tf.minimum(right, width - 1)

  # Cut out the box (both ends inclusive) and augment just that region.
  region = image[top:bottom + 1, left:right + 1, :]
  augmented_region = augmentation_func(region, *args)

  # The same paddings bring both the augmented region and the mask back to
  # the size of the full image.
  paddings = [[top, (height - 1) - bottom],
              [left, (width - 1) - right],
              [0, 0]]
  augmented_canvas = tf.pad(augmented_region, paddings)

  # The mask is 0 inside the box and 1 elsewhere, so multiplying by it zeroes
  # out the old box content while keeping the rest of the image.
  keep_mask = tf.pad(tf.zeros_like(region), paddings, constant_values=1)

  # Old content outside the box + new content inside the box.
  return image * keep_mask + augmented_canvas
def _concat_bbox(bbox, bboxes):
  """Appends `bbox` to `bboxes` along the first dimension.

  If the elements of `bboxes` sum to -4.0, `bboxes` is taken to be the
  placeholder `_INVALID_BOX` (all -1); it is then discarded and the result
  starts with the current `bbox`.

  Args:
    bbox: 1D Tensor with the 4 coordinates of the box to add.
    bboxes: 2D Tensor of the boxes collected so far.

  Returns:
    A 2D Tensor holding either just `bbox` or `bboxes` followed by `bbox`.
  """
  coordinate_total = tf.reduce_sum(bboxes)
  bbox_row = tf.expand_dims(bbox, 0)
  # True when `bboxes` still is the _INVALID_BOX placeholder.
  is_placeholder = tf.equal(coordinate_total, -4.0)
  return tf.cond(is_placeholder,
                 lambda: bbox_row,
                 lambda: tf.concat([bboxes, bbox_row], 0))
def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
                                     augmentation_func, func_changes_bbox,
                                     *args):
  """Applies the bbox augmentation with probability `prob`.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and
      1.
    new_bboxes: 2D Tensor that is a list of the bboxes in the image after
      they have been altered by aug_func. These will only be changed when
      func_changes_bbox is set to true. Each bbox has 4 elements
      (min_y, min_x, max_y, max_x) of type float that are the normalized
      bbox coordinates between 0 and 1.
    prob: Float that is the probability of applying the augmentation.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in
      addition to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A tuple. The first element is the image, with the augmentation applied
    at the bbox location if it was selected with probability `prob`. The
    second element is `new_bboxes` with the (possibly altered) bbox appended
    through `_concat_bbox`.
  """
  # floor(u + prob) with u drawn uniformly is 1 exactly when u >= 1 - prob.
  coin = tf.random.uniform([], dtype=tf.float32)
  should_apply_op = tf.cast(tf.floor(coin + prob), tf.bool)

  if func_changes_bbox:
    # The function receives the full image and the bbox, and returns both.
    augmented_image, bbox = tf.cond(
        should_apply_op,
        lambda: augmentation_func(image, bbox, *args),
        lambda: (image, bbox))
  else:
    # The function only sees the image region inside the bbox.
    augmented_image = tf.cond(
        should_apply_op,
        lambda: _apply_bbox_augmentation(image, bbox, augmentation_func, *args),
        lambda: image)

  return augmented_image, _concat_bbox(bbox, new_bboxes)
def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
                                           func_changes_bbox, *args):
  """Runs `_apply_multi_bbox_augmentation` only if there is at least one bbox.

  When the first dimension of `bboxes` is 0, `image` and `bboxes` are
  returned unchanged.
  """
  has_no_boxes = tf.equal(tf.shape(bboxes)[0], 0)

  def _augment_all():
    return _apply_multi_bbox_augmentation(
        image, bboxes, prob, aug_func, func_changes_bbox, *args)

  result_image, result_bboxes = tf.cond(
      has_no_boxes, lambda: (image, bboxes), _augment_all)
  return result_image, result_bboxes
# Represents an invalid bounding box that is used for checking for padding | |
# lists of bounding box coordinates for a few augmentation operations | |
_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] | |
def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
                                   func_changes_bbox, *args):
  """Applies `aug_func` to the image once for every bbox in `bboxes`.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    prob: Float that is the probability of applying aug_func to a specific
      bounding box within the image.
    aug_func: Augmentation function that will be applied to the
      subsections of image indicated by the bbox values in bboxes.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in
      addition to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    The image, where each bbox location has had `aug_func` applied if it was
    selected (independently per bbox, with probability `prob`), together
    with the final bboxes: the altered ones when `func_changes_bbox` is
    true, otherwise the input ones.

  Raises:
    ValueError if applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  # Accumulator for the altered bboxes. It starts as a dummy row of -1s that
  # is dropped as soon as the first real bbox is appended.
  collected_bboxes = tf.constant(_INVALID_BOX)

  # Empty input gets the dummy box instead; that result is thrown away.
  bboxes = tf.cond(tf.equal(tf.size(bboxes), 0),
                   lambda: tf.constant(_INVALID_BOX),
                   lambda: bboxes)
  bboxes = tf.ensure_shape(bboxes, (None, 4))

  def _augment_one(current_image, bbox, collected):
    return _apply_bbox_augmentation_wrapper(
        current_image, bbox, collected, prob, aug_func, func_changes_bbox,
        *args)

  # While-loop setup: iterate once per bbox.
  box_count = tf.shape(bboxes)[0]
  counter = tf.constant(0)

  def _has_more(step, _state):
    # `_state` carries (image, collected bboxes) and is not needed here.
    return tf.less(step, box_count)

  # When aug_func leaves the bboxes alone, visit them in shuffled order so
  # that the augmentation order is not deterministic.
  if func_changes_bbox:
    ordered_bboxes = bboxes
  else:
    ordered_bboxes = tf.random.shuffle(bboxes)

  def _step(step, state):
    # Augment for the current bbox and advance the counter.
    return [step + 1,
            _augment_one(state[0], ordered_bboxes[step], state[1])]

  _, (image, collected_bboxes) = tf.while_loop(
      _has_more, _step, [counter, (image, collected_bboxes)],
      shape_invariants=[counter.get_shape(),
                        (image.get_shape(), tf.TensorShape([None, 4]))])

  # Hand back the altered bboxes only if aug_func could have changed them.
  final_bboxes = collected_bboxes if func_changes_bbox else bboxes
  return image, final_bboxes
def _clip_bbox(min_y, min_x, max_y, max_x):
  """Clips bounding box coordinates to the range [0, 1].

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.

  Returns:
    The four coordinates, in the same order, each clipped to [0, 1].
  """
  coordinates = (min_y, min_x, max_y, max_x)
  return tuple(tf.clip_by_value(value, 0.0, 1.0) for value in coordinates)
def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
  """Adjusts bbox coordinates so that a degenerate box regains some area.

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.
    delta: Float margin. When the min and max coordinates of a dimension
      coincide, max is raised to at least `delta` and min is lowered to at
      most `1 - delta`, which keeps a box collapsed onto the image boundary
      from having zero area.

  Returns:
    Tuple (min_y, min_x, max_y, max_x) of the adjusted coordinates.
  """

  def _separate(low, high):
    # Keep max from being 0 and min from being 1.
    high = tf.maximum(high, 0.0 + delta)
    low = tf.minimum(low, 1.0 - delta)
    return low, high

  has_zero_height = tf.equal(max_y - min_y, 0.0)
  has_zero_width = tf.equal(max_x - min_x, 0.0)

  # Only the dimensions that collapsed are touched.
  min_y, max_y = tf.cond(has_zero_height,
                         lambda: _separate(min_y, max_y),
                         lambda: (min_y, max_y))
  min_x, max_x = tf.cond(has_zero_width,
                         lambda: _separate(min_x, max_x),
                         lambda: (min_x, max_x))
  return min_y, min_x, max_y, max_x
def _rotate_bbox(bbox, image_height, image_width, degrees):
  """Rotates the bbox coordinates by `degrees`.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and
      1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it
      will be rotated counterclockwise.

  Returns:
    A tensor of the same shape as bbox, but now with the rotated coordinates.
  """
  height = tf.cast(image_height, tf.float32)
  width = tf.cast(image_width, tf.float32)

  # Degrees -> radians.
  radians = degrees * (math.pi / 180.0)

  # Move the origin to the image center and switch from normalized 0-1
  # coordinates to absolute pixel offsets. Image rows grow downwards, so the
  # y values are negated to get the conventional "y points up" orientation.
  top = -tf.cast(height * (bbox[0] - 0.5), tf.int32)
  left = tf.cast(width * (bbox[1] - 0.5), tf.int32)
  bottom = -tf.cast(height * (bbox[2] - 0.5), tf.int32)
  right = tf.cast(width * (bbox[3] - 0.5), tf.int32)
  corners = tf.stack(
      [[top, left], [top, right], [bottom, left], [bottom, right]])
  corners = tf.cast(corners, tf.float32)

  # Rotation matrix: clockwise for positive radians, counterclockwise for
  # negative ones.
  cos_r = tf.cos(radians)
  sin_r = tf.sin(radians)
  rotation = tf.stack([[cos_r, sin_r], [-sin_r, cos_r]])
  rotated = tf.cast(tf.matmul(rotation, tf.transpose(corners)), tf.int32)
  rotated_y = rotated[0, :]
  rotated_x = rotated[1, :]

  # Axis-aligned extent of the rotated corners, converted back to normalized
  # 0-1 floats (undoing the centering and the y negation).
  min_y = -(tf.cast(tf.reduce_max(rotated_y), tf.float32) / height - 0.5)
  min_x = tf.cast(tf.reduce_min(rotated_x), tf.float32) / width + 0.5
  max_y = -(tf.cast(tf.reduce_min(rotated_y), tf.float32) / height - 0.5)
  max_x = tf.cast(tf.reduce_max(rotated_x), tf.float32) / width + 0.5

  # Clip into [0, 1] and repair boxes that collapsed.
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])
def rotate_with_bboxes(image, bboxes, degrees, replace):
  """Rotates the image (PIL Rotate equivalent) together with its bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it
      will be rotated counterclockwise.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    A tuple whose first element is the image rotated by `degrees` and whose
    second element is `bboxes` with every box passed through `_rotate_bbox`.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  rotated_image = wrapped_rotate(image, degrees, replace)

  # The per-box rotation works in pixel units and needs the image size.
  height = tf.shape(rotated_image)[0]
  width = tf.shape(rotated_image)[1]

  def _rotate_one(bbox):
    return _rotate_bbox(bbox, height, width, degrees)

  return rotated_image, tf.map_fn(_rotate_one, bboxes)
def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
  """Shifts the bbox according to how the image was sheared.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and
      1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    level: Float. How much to shear the image.
    shear_horizontal: If true then shear in X dimension else shear in
      the Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  height = tf.cast(image_height, tf.float32)
  width = tf.cast(image_width, tf.float32)

  # Normalized coordinates -> integer pixel positions.
  top = tf.cast(height * bbox[0], tf.int32)
  left = tf.cast(width * bbox[1], tf.int32)
  bottom = tf.cast(height * bbox[2], tf.int32)
  right = tf.cast(width * bbox[3], tf.int32)
  corners = tf.stack(
      [[top, left], [top, right], [bottom, left], [bottom, right]])
  corners = tf.cast(corners, tf.float32)

  # The matrix is applied to (y, x) column vectors, one per corner.
  if shear_horizontal:
    shear_matrix = tf.stack([[1, 0], [-level, 1]])
  else:
    shear_matrix = tf.stack([[1, -level], [0, 1]])
  shear_matrix = tf.cast(shear_matrix, tf.float32)
  sheared = tf.cast(
      tf.matmul(shear_matrix, tf.transpose(corners)), tf.int32)
  sheared_y = sheared[0, :]
  sheared_x = sheared[1, :]

  # Axis-aligned extent of the sheared corners, back in normalized floats.
  min_y = tf.cast(tf.reduce_min(sheared_y), tf.float32) / height
  min_x = tf.cast(tf.reduce_min(sheared_x), tf.float32) / width
  max_y = tf.cast(tf.reduce_max(sheared_y), tf.float32) / height
  max_x = tf.cast(tf.reduce_max(sheared_x), tf.float32) / width

  # Clip into [0, 1] and repair boxes that collapsed.
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])
def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
  """Shears the image and shifts the bboxes accordingly.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    level: Float. How much to shear the image. This value will be between
      -0.3 to 0.3.
    replace: A one or three value 1D tensor to fill empty pixels.
    shear_horizontal: Boolean. If true then shear in X dimension else shear
      in the Y dimension.

  Returns:
    A tuple whose first element is the image sheared by `level` and whose
    second element is `bboxes` with every box passed through `_shear_bbox`.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  shear_image = shear_x if shear_horizontal else shear_y
  sheared_image = shear_image(image, level, replace)

  # The per-box shear works in pixel units and needs the image size.
  height = tf.shape(sheared_image)[0]
  width = tf.shape(sheared_image)[1]

  def _shear_one(bbox):
    return _shear_bbox(bbox, height, width, level, shear_horizontal)

  return sheared_image, tf.map_fn(_shear_one, bboxes)
def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
  """Shifts the bbox coordinates by `pixels`.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and
      1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    pixels: An int. How many pixels to shift the bbox.
    shift_horizontal: Boolean. If true then shift in X dimension else shift
      in Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  shift = tf.cast(pixels, tf.int32)
  height_f = tf.cast(image_height, tf.float32)
  width_f = tf.cast(image_width, tf.float32)

  # Normalized coordinates -> integer pixel positions.
  top = tf.cast(height_f * bbox[0], tf.int32)
  left = tf.cast(width_f * bbox[1], tf.int32)
  bottom = tf.cast(height_f * bbox[2], tf.int32)
  right = tf.cast(width_f * bbox[3], tf.int32)

  # The shift is subtracted; the min edge is floored at 0 and the max edge
  # is capped at the image size.
  if shift_horizontal:
    left = tf.maximum(0, left - shift)
    right = tf.minimum(image_width, right - shift)
  else:
    top = tf.maximum(0, top - shift)
    bottom = tf.minimum(image_height, bottom - shift)

  # Pixel positions -> normalized floats.
  min_y = tf.cast(top, tf.float32) / height_f
  min_x = tf.cast(left, tf.float32) / width_f
  max_y = tf.cast(bottom, tf.float32) / height_f
  max_x = tf.cast(right, tf.float32) / width_f

  # Clip into [0, 1] and repair boxes that collapsed.
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])
def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
  """Translates the image in X or Y (PIL Translate) and shifts its bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    pixels: An int. How many pixels to shift the image and bboxes.
    replace: A one or three value 1D tensor to fill empty pixels.
    shift_horizontal: Boolean. If true then shift in X dimension else shift
      in Y dimension.

  Returns:
    A tuple whose first element is the image translated by `pixels` and
    whose second element is `bboxes` with every box passed through
    `_shift_bbox`.

  Raises:
    ValueError if applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  translate_image = translate_x if shift_horizontal else translate_y
  translated_image = translate_image(image, pixels, replace)

  # The per-box shift works in pixel units and needs the image size.
  height = tf.shape(translated_image)[0]
  width = tf.shape(translated_image)[1]

  def _shift_one(bbox):
    return _shift_bbox(bbox, height, width, pixels, shift_horizontal)

  return translated_image, tf.map_fn(_shift_one, bboxes)
def translate_y_only_bboxes(
    image: tf.Tensor, bboxes: tf.Tensor, prob: float, pixels: int, replace):
  """Applies `translate_y` to each bbox region with (scaled) probability.

  `prob` is first reduced by `_scale_bbox_only_op_probability`; the bboxes
  themselves are not changed by this operation.

  Args:
    image: The image tensor.
    bboxes: The bounding boxes of the image.
    prob: Probability of the operation before scaling.
    pixels: Translation amount forwarded to `translate_y`.
    replace: Fill value(s) forwarded to `translate_y`.

  Returns:
    The `(image, bboxes)` pair produced by
    `_apply_multi_bbox_augmentation_wrapper`.

  Raises:
    ValueError: If `bboxes` has rank 4.
  """
  if bboxes.shape.rank == 4:
    raise ValueError('translate_y_only_bboxes does not support rank 4 boxes')

  scaled_prob = _scale_bbox_only_op_probability(prob)
  changes_bbox = False  # translate_y returns only the image.
  return _apply_multi_bbox_augmentation_wrapper(
      image, bboxes, scaled_prob, translate_y, changes_bbox, pixels, replace)
def _randomly_negate_tensor(tensor):
  """Returns `tensor` or `-tensor`, each with 50% probability."""
  coin = tf.random.uniform([])
  # floor(u + 0.5) is 1 for the upper half of the uniform range.
  keep_sign = tf.cast(tf.floor(coin + 0.5), tf.bool)
  return tf.cond(keep_sign, lambda: tensor, lambda: -tensor)
def _rotate_level_to_arg(level: float):
  """Maps `level` to a randomly signed rotation of up to 30 (at max level).

  Returns:
    A 1-tuple holding `(level / _MAX_LEVEL) * 30`, negated with 50% chance.
  """
  degrees = (level / _MAX_LEVEL) * 30.
  return (_randomly_negate_tensor(degrees),)
def _shrink_level_to_arg(level: float):
  """Converts `level` to the ratio by which the image content is shrunk.

  Returns:
    A 1-tuple with the ratio: 1.0 for level 0 (no shrinking), otherwise
    `2 / (_MAX_LEVEL / level) + 0.9`, which reaches 2.9 at the max level.
  """
  if level == 0:
    # Level zero means the image is left at its original size.
    return (1.0,)
  shrink_ratio = 2. / (_MAX_LEVEL / level) + 0.9
  return (shrink_ratio,)
def _enhance_level_to_arg(level: float):
  """Maps `level` linearly onto an enhancement factor.

  Returns:
    A 1-tuple holding `(level / _MAX_LEVEL) * 1.8 + 0.1`.
  """
  enhancement = (level / _MAX_LEVEL) * 1.8 + 0.1
  return (enhancement,)
def _shear_level_to_arg(level: float):
  """Maps `level` to a randomly signed shear of up to 0.3 (at max level).

  Returns:
    A 1-tuple holding `(level / _MAX_LEVEL) * 0.3`, negated with 50% chance.
  """
  shear = (level / _MAX_LEVEL) * 0.3
  # The sign is flipped with a 50% chance.
  return (_randomly_negate_tensor(shear),)
def _translate_level_to_arg(level: float, translate_const: float):
  """Maps `level` to a randomly signed translation amount.

  Args:
    level: The magnitude level.
    translate_const: The translation reached at the max level.

  Returns:
    A 1-tuple holding `(level / _MAX_LEVEL) * float(translate_const)`,
    negated with 50% chance.
  """
  translation = (level / _MAX_LEVEL) * float(translate_const)
  # The sign is flipped with a 50% chance.
  return (_randomly_negate_tensor(translation),)
def _gaussian_noise_level_to_arg(level: float, translate_const: float):
  """Maps `level` to a (low_std, high_std) pair for gaussian noise.

  Args:
    level: The magnitude level.
    translate_const: Factor by which `high_std` exceeds `low_std`.

  Returns:
    A tuple `(low_std, high_std)` with `low_std = level / _MAX_LEVEL` and
    `high_std = translate_const * low_std`.
  """
  low_std = level / _MAX_LEVEL
  return low_std, translate_const * low_std
def _mult_to_arg(level: float, multiplier: float = 1.):
  """Scales the normalized level by `multiplier` and truncates it to an int.

  Returns:
    A 1-tuple holding `int((level / _MAX_LEVEL) * multiplier)`.
  """
  scaled = (level / _MAX_LEVEL) * multiplier
  return (int(scaled),)
def _apply_func_with_prob(func: Any, image: tf.Tensor,
                          bboxes: Optional[tf.Tensor], args: Any, prob: float):
  """Applies `func` to `image` and `bboxes` with probability `prob`.

  Args:
    func: Callable whose second positional parameter is named `bboxes`; it
      is called as `func(image, bboxes, *args)`.
    image: The image tensor.
    bboxes: The bounding boxes, or None.
    args: Tuple of extra positional arguments for `func`.
    prob: Probability of calling `func`.

  Returns:
    The `(image, bboxes)` pair returned by `func` when it is applied,
    otherwise the inputs unchanged.
  """
  assert isinstance(args, tuple)
  assert inspect.getfullargspec(func)[0][1] == 'bboxes'

  # floor(u + prob) with u drawn uniformly is 1 exactly when u >= 1 - prob.
  coin = tf.random.uniform([], dtype=tf.float32)
  should_apply_op = tf.cast(tf.floor(coin + prob), tf.bool)

  def _apply():
    return func(image, bboxes, *args)

  def _skip():
    return image, bboxes

  augmented_image, augmented_bboxes = tf.cond(should_apply_op, _apply, _skip)
  return augmented_image, augmented_bboxes
def select_and_apply_random_policy(
    policies: Any, image: tf.Tensor, bboxes: Optional[tf.Tensor] = None
) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
  """Picks one policy from `policies` at random and applies it to `image`.

  Args:
    policies: Sequence of callables taking `(image, bboxes)` and returning
      an `(image, bboxes)` pair.
    image: The image tensor.
    bboxes: Optional bounding boxes passed along to the policy.

  Returns:
    The `(image, bboxes)` pair after the selected policy has been applied.
  """
  chosen_index = tf.random.uniform(
      [], maxval=len(policies), dtype=tf.int32)

  # A chain of tf.cond ops is used on purpose: tf.case would result in
  # significantly larger graphs and would even break export for some larger
  # policies.
  for index, policy in enumerate(policies):

    def _run_policy(selected_policy=policy):
      # `policy` is bound as a default to avoid late binding in the loop.
      return selected_policy(image, bboxes)

    image, bboxes = tf.cond(
        tf.equal(index, chosen_index),
        _run_policy,
        lambda: (image, bboxes))
  return image, bboxes
# Maps every augmentation operation name to the callable implementing it.
# The parameter names of the lambdas are significant: callers inspect the
# argument names of these functions (e.g. `bboxes`).
NAME_TO_FUNC = {
    # Operations that work on the image only.
    'AutoContrast': autocontrast,
    'Equalize': equalize,
    'Invert': invert,
    'Rotate': wrapped_rotate,
    'Posterize': posterize,
    'Solarize': solarize,
    'SolarizeAdd': solarize_add,
    'Color': color,
    'Contrast': contrast,
    'Brightness': brightness,
    'Sharpness': sharpness,
    'ShearX': shear_x,
    'ShearY': shear_y,
    'TranslateX': translate_x,
    'TranslateY': translate_y,
    'Cutout': cutout,
    'Rotate_BBox': rotate_with_bboxes,
    'Grayscale': grayscale,
    'Gaussian_Noise': gaussian_noise,
    # Operations that also take bboxes; the shared implementations are bound
    # to one axis each.
    # pylint:disable=g-long-lambda
    'ShearX_BBox': (
        lambda image, bboxes, level, replace: shear_with_bboxes(
            image, bboxes, level, replace, shear_horizontal=True)),
    'ShearY_BBox': (
        lambda image, bboxes, level, replace: shear_with_bboxes(
            image, bboxes, level, replace, shear_horizontal=False)),
    'TranslateX_BBox': (
        lambda image, bboxes, pixels, replace: translate_bbox(
            image, bboxes, pixels, replace, shift_horizontal=True)),
    'TranslateY_BBox': (
        lambda image, bboxes, pixels, replace: translate_bbox(
            image, bboxes, pixels, replace, shift_horizontal=False)),
    # pylint:enable=g-long-lambda
    'TranslateY_Only_BBoxes': translate_y_only_bboxes,
}
# Names of the operations whose function takes a `bboxes` parameter.
REQUIRE_BOXES_FUNCS = frozenset((
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
))
# Names of the operations whose function takes a `prob` parameter.
PROB_FUNCS = frozenset(('TranslateY_Only_BBoxes',))
# Names of the operations whose function takes a `replace` parameter.
REPLACE_FUNCS = frozenset((
    'Rotate',
    'TranslateX',
    'ShearX',
    'ShearY',
    'TranslateY',
    'Cutout',
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
))
def level_to_arg(cutout_const: float, translate_const: float):
  """Builds the table that converts a policy level into op arguments.

  Args:
    cutout_const: multiplier handed to `_mult_to_arg` for the `Cutout` op.
    translate_const: multiplier handed to `_translate_level_to_arg` for the
      translate ops, and to `_gaussian_noise_level_to_arg` for
      `Gaussian_Noise`.

  Returns:
    A dict keyed by operation name. Each value is a callable that takes a
    `level` and returns the level-dependent positional arguments of that
    operation (`()` for operations that take none).
  """

  def no_arg(level):
    del level  # These ops have no level-dependent arguments.
    return ()

  def posterize_arg(level):
    return _mult_to_arg(level, 4)

  def solarize_arg(level):
    return _mult_to_arg(level, 256)

  def solarize_add_arg(level):
    return _mult_to_arg(level, 110)

  def cutout_arg(level):
    return _mult_to_arg(level, cutout_const)

  def translate_arg(level):
    return _translate_level_to_arg(level, translate_const)

  def translate_only_bboxes_arg(level):
    # `TranslateY_Only_BBoxes` uses a fixed constant, not `translate_const`.
    return _translate_level_to_arg(level, 120)

  def gaussian_noise_arg(level):
    return _gaussian_noise_level_to_arg(level, translate_const)

  return {
      'AutoContrast': no_arg,
      'Equalize': no_arg,
      'Invert': no_arg,
      'Rotate': _rotate_level_to_arg,
      'Posterize': posterize_arg,
      'Solarize': solarize_arg,
      'SolarizeAdd': solarize_add_arg,
      'Color': _enhance_level_to_arg,
      'Contrast': _enhance_level_to_arg,
      'Brightness': _enhance_level_to_arg,
      'Sharpness': _enhance_level_to_arg,
      'ShearX': _shear_level_to_arg,
      'ShearY': _shear_level_to_arg,
      'Cutout': cutout_arg,
      'TranslateX': translate_arg,
      'TranslateY': translate_arg,
      'Rotate_BBox': _rotate_level_to_arg,
      'ShearX_BBox': _shear_level_to_arg,
      'ShearY_BBox': _shear_level_to_arg,
      'Grayscale': no_arg,
      'Gaussian_Noise': gaussian_noise_arg,
      'TranslateX_BBox': translate_arg,
      'TranslateY_BBox': translate_arg,
      'TranslateY_Only_BBoxes': translate_only_bboxes_arg,
  }
def bbox_wrapper(func):
  """Adapts an image-only op to the `(images, bboxes, ...)` calling convention.

  Args:
    func: callable invoked as `func(images, *args, **kwargs)`.

  Returns:
    A callable taking `(images, bboxes, *args, **kwargs)` that returns the
    tuple `(func(images, *args, **kwargs), bboxes)`; `bboxes` is passed
    through untouched.
  """

  def wrapper(images, bboxes, *args, **kwargs):
    augmented = func(images, *args, **kwargs)
    return augmented, bboxes

  return wrapper
def _parse_policy_info(name: str,
                       prob: float,
                       level: float,
                       replace_value: List[int],
                       cutout_const: float,
                       translate_const: float,
                       level_std: float = 0.) -> Tuple[Any, float, Any]:
  """Resolves a policy entry into a callable and its positional arguments.

  Args:
    name: operation name; must be a key of `NAME_TO_FUNC`.
    prob: probability associated with the operation. It is returned unchanged
      and is also prepended to the arguments of ops listed in `PROB_FUNCS`.
    level: magnitude of the operation.
    replace_value: appended to the arguments of ops listed in `REPLACE_FUNCS`.
    cutout_const: forwarded to `level_to_arg`.
    translate_const: forwarded to `level_to_arg`.
    level_std: standard deviation of the Gaussian noise added to `level`.
      When it is not positive, `level` is used as is.

  Returns:
    A tuple `(func, prob, args)`. `func` always accepts
    `(image, bboxes, *args)`: ops without a `bboxes` parameter are wrapped
    with `bbox_wrapper`.

  Raises:
    KeyError: if `name` is not a known operation.
  """
  func = NAME_TO_FUNC[name]

  if level_std > 0:
    # Scale the jitter by `level_std`. Previously the noise was always drawn
    # with unit standard deviation, so `level_std` only acted as an on/off
    # switch.
    level += tf.random.normal([], stddev=level_std, dtype=tf.float32)
    level = tf.clip_by_value(level, 0., _MAX_LEVEL)

  args = tuple(level_to_arg(cutout_const, translate_const)[name](level))

  if name in PROB_FUNCS:
    # These ops take `prob` as their first level-dependent argument.
    args = (prob,) + args

  if name in REPLACE_FUNCS:
    # These ops take `replace` as their last argument.
    args = args + (replace_value,)

  # Give every op a uniform `(image, bboxes, ...)` signature.
  if 'bboxes' not in inspect.getfullargspec(func)[0]:
    func = bbox_wrapper(func)

  return func, prob, args
class ImageAugment:
  """Base interface for image augmentation policies.

  Subclasses override `distort` and, when they support bounding boxes,
  `distort_with_boxes`. Both methods raise `NotImplementedError` here.
  """

  def distort(
      self,
      image: tf.Tensor
  ) -> tf.Tensor:
    """Returns a distorted copy of `image` with the same shape.

    Image values are expected to lie in the range [0, 255].

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image
        sequence.

    Returns:
      The augmented version of `image`.

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    raise NotImplementedError()

  def distort_with_boxes(
      self,
      image: tf.Tensor,
      bboxes: tf.Tensor
  ) -> Tuple[tf.Tensor, tf.Tensor]:
    """Distorts the image together with its bounding boxes.

    Image values are expected to lie in the range [0, 255].

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image
        sequence.
      bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4]
        representing bounding boxes for an image or image sequence.

    Returns:
      The augmented version of `image` and `bboxes`.

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    raise NotImplementedError()
class AutoAugment(ImageAugment):
  """Applies the AutoAugment policy to images.

  AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
  """

  def __init__(self,
               augmentation_name: str = 'v0',
               policies: Optional[Iterable[Iterable[Tuple[str, float,
                                                          float]]]] = None,
               cutout_const: float = 100,
               translate_const: float = 250):
    """Applies the AutoAugment policy to images.

    Args:
      augmentation_name: The name of the built-in AutoAugment policy to use.
        The available options are `detection_v0`, `v0`, `test`, `simple`,
        `reduced_cifar10`, `svhn`, `reduced_imagenet`,
        `panoptic_deeplab_policy`, `vit` and `deit3_three_augment`. `v0` is
        the policy used for all of the results in the paper and was found to
        achieve the best results on the COCO dataset. Make sure to set
        `policies` to `None` (the default) if you want to select a policy
        with `augmentation_name`.
      policies: list of lists of tuples in the form `(func, prob, level)`,
        `func` is a string name of the augmentation function, `prob` is the
        probability of applying the `func` operation, `level` (or magnitude)
        is the input argument for `func`. For example:
        ```
        [[('Equalize', 0.9, 3), ('Color', 0.7, 8)],
         [('Invert', 0.6, 5), ('Rotate', 0.2, 9)], ...]
        ```
        The value is validated by converting it to a NumPy array that must
        have shape `(:, :, 3)`, so every sub-policy has to contain the same
        number of operations. If you provide `policies` as input, any option
        set with `augmentation_name` will get overridden as they are
        mutually exclusive.
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.

    Raises:
      ValueError: if `policies` is empty/`None` and `augmentation_name` is
        unsupported, or if `policies` has an unexpected shape.
    """
    super().__init__()

    self.augmentation_name = augmentation_name
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.available_policies = {
        'detection_v0': self.detection_policy_v0(),
        'v0': self.policy_v0(),
        'test': self.policy_test(),
        'simple': self.policy_simple(),
        'reduced_cifar10': self.policy_reduced_cifar10(),
        'svhn': self.policy_svhn(),
        'reduced_imagenet': self.policy_reduced_imagenet(),
        'panoptic_deeplab_policy': self.panoptic_deeplab_policy(),
        'vit': self.vit(),
        'deit3_three_augment': self.deit3_three_augment(),
    }

    if not policies:
      if augmentation_name not in self.available_policies:
        raise ValueError(
            'Invalid augmentation_name: {}'.format(augmentation_name))
      self.policies = self.available_policies[augmentation_name]
    else:
      self._check_policy_shape(policies)
      self.policies = policies

  def _check_policy_shape(self, policies):
    """Checks dimension and shape of the custom policy.

    Args:
      policies: List of list of tuples in the form `(func, prob, level)`. Must
        have shape of `(:, :, 3)`.

    Raises:
      ValueError if the shape of `policies` is unexpected.
    """
    in_shape = np.array(policies).shape
    if len(in_shape) != 3 or in_shape[-1:] != (3,):
      raise ValueError('Wrong shape detected for custom policy. Expected '
                       '(:, :, 3) but got {}.'.format(in_shape))

  def _make_tf_policies(self):
    """Prepares the TF functions for augmentations based on the policies.

    Returns:
      A list with one callable per sub-policy. Each callable takes
      `(image, bboxes)`, applies the operations of its sub-policy in order
      via `_apply_func_with_prob` and returns `(image, bboxes)`.
    """
    replace_value = [128] * 3

    # func is the string name of the augmentation function, prob is the
    # probability of applying the operation and level is the parameter
    # associated with the tf op.

    # tf_policies are functions that take in an image and return an augmented
    # image.
    tf_policies = []
    for policy in self.policies:
      tf_policy = []
      assert_ranges = []
      # Link string name to the correct python function and make sure the
      # correct argument is passed into that function.
      for policy_info in policy:
        _, prob, level = policy_info
        assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob]))
        assert_ranges.append(
            tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level]))

        policy_info = list(policy_info) + [
            replace_value, self.cutout_const, self.translate_const
        ]
        tf_policy.append(_parse_policy_info(*policy_info))

      # Now build the tf policy that will apply the augmentation procedure
      # on image.
      def make_final_policy(tf_policy_):

        def final_policy(image_, bboxes_):
          for func, prob, args in tf_policy_:
            image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_, args,
                                                    prob)
          return image_, bboxes_

        return final_policy

      # NOTE(review): only a Python closure is created inside this context, no
      # TF op, so the assertions may not end up attached to the augmentation
      # ops in graph mode -- confirm.
      with tf.control_dependencies(assert_ranges):
        tf_policies.append(make_final_policy(tf_policy))

    return tf_policies

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      # The ops work on uint8; clip first so the cast cannot wrap around.
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None)
    image = tf.cast(image, dtype=input_image_type)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      # The ops work on uint8; clip first so the cast cannot wrap around.
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes)
    image = tf.cast(image, dtype=input_image_type)
    assert bboxes is not None
    return image, bboxes

  # The policy factories below take no `self`; they must be static methods so
  # that `self.policy_xxx()` in `__init__` does not pass the instance in.

  @staticmethod
  def detection_policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper for Detection.

    https://arxiv.org/pdf/1906.11172

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
    ]
    return policy

  @staticmethod
  def policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
        [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_cifar10():
    """Autoaugment policy for reduced CIFAR-10 dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Invert', 0.1, 7), ('Contrast', 0.2, 6)],
        [('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)],
        [('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)],
        [('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)],
        [('ShearY', 0.2, 7), ('Posterize', 0.3, 7)],
        [('Color', 0.4, 3), ('Brightness', 0.6, 7)],
        [('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)],
        [('Equalize', 0.6, 5), ('Equalize', 0.5, 1)],
        [('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)],
        [('Color', 0.7, 7), ('TranslateX', 0.5, 8)],
        [('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)],
        [('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)],
        [('Brightness', 0.9, 6), ('Color', 0.2, 8)],
        [('Solarize', 0.5, 2), ('Invert', 0.0, 3)],
        [('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)],
        [('Equalize', 0.2, 8), ('Equalize', 0.6, 4)],
        [('Color', 0.9, 9), ('Equalize', 0.6, 6)],
        [('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)],
        [('Brightness', 0.1, 3), ('Color', 0.7, 0)],
        [('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)],
        [('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)],
        [('Equalize', 0.8, 8), ('Invert', 0.1, 3)],
        [('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)],
    ]
    return policy

  @staticmethod
  def policy_svhn():
    """Autoaugment policy for SVHN dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('ShearX', 0.9, 4), ('Invert', 0.2, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.7, 5)],
        [('Equalize', 0.6, 5), ('Solarize', 0.6, 6)],
        [('Invert', 0.9, 3), ('Equalize', 0.6, 3)],
        [('Equalize', 0.6, 1), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.4, 5)],
        [('ShearY', 0.9, 5), ('Solarize', 0.2, 6)],
        [('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)],
        [('Equalize', 0.6, 3), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('Solarize', 0.3, 3)],
        [('ShearY', 0.8, 8), ('Invert', 0.7, 4)],
        [('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)],
        [('Invert', 0.9, 4), ('Equalize', 0.6, 7)],
        [('Contrast', 0.3, 3), ('Rotate', 0.8, 4)],
        [('Invert', 0.8, 5), ('TranslateY', 0.0, 2)],
        [('ShearY', 0.7, 6), ('Solarize', 0.4, 8)],
        [('Invert', 0.6, 4), ('Rotate', 0.8, 4)],
        [('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)],
        [('ShearX', 0.1, 6), ('Invert', 0.6, 5)],
        [('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)],
        [('ShearY', 0.8, 4), ('Invert', 0.8, 8)],
        [('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)],
        [('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)],
        [('ShearX', 0.7, 2), ('Invert', 0.1, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_imagenet():
    """Autoaugment policy for reduced ImageNet dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
        [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
        [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)],
        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
        [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)],
        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)]
    ]
    return policy

  @staticmethod
  def policy_simple():
    """Same as `policy_v0`, except with custom ops removed."""
    policy = [
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
    ]
    return policy

  @staticmethod
  def panoptic_deeplab_policy():
    """Returns the policy registered as `panoptic_deeplab_policy`.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude).
    """
    policy = [
        [('Sharpness', 0.4, 1.4), ('Brightness', 0.2, 2.0)],
        [('Equalize', 0.0, 1.8), ('Contrast', 0.2, 2.0)],
        [('Sharpness', 0.2, 1.8), ('Color', 0.2, 1.8)],
        [('Solarize', 0.2, 1.4), ('Equalize', 0.6, 1.8)],
        [('Sharpness', 0.2, 0.2), ('Equalize', 0.2, 1.4)]]
    return policy

  @staticmethod
  def vit():
    """Autoaugment policy for a generic ViT."""
    policy = [
        [('Sharpness', 0.4, 1.4), ('Brightness', 0.2, 2.0), ('Cutout', 0.8, 8)],
        [('Equalize', 0.0, 1.8), ('Contrast', 0.2, 2.0), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.2, 1.8), ('Color', 0.2, 1.8), ('Cutout', 0.8, 8)],
        [('Solarize', 0.2, 1.4), ('Equalize', 0.6, 1.8), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.2, 0.2), ('Equalize', 0.2, 1.4), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8), ('Cutout', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8), ('Cutout', 0.8, 8)],
        [('Posterize', 0.6, 7), ('Posterize', 0.6, 6), ('Cutout', 0.8, 8)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5), ('Cutout', 0.8, 8)],
    ]
    return policy

  @staticmethod
  def deit3_three_augment():
    """Autoaugment policy for three augmentations.

    Proposed in paper: https://arxiv.org/abs/2204.07118.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied on the image. Randomly chooses one of the
    three augmentation to apply on image.

    Returns:
      the policy.
    """
    policy = [
        [('Grayscale', 1.0, 0)],
        [('Solarize', 1.0, 5)],  # to have threshold as 128
        [('Gaussian_Noise', 1.0, 1)],  # to have low_std as 0.1
    ]
    return policy

  @staticmethod
  def policy_test():
    """Autoaugment test policy for debugging."""
    policy = [
        [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
    ]
    return policy
def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]:
  """Returns `tf.identity(x)`, or `None` when `x` is `None`."""
  if x is None:
    return None
  return tf.identity(x)
class RandAugment(ImageAugment):
  """Applies the RandAugment policy to images.

  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
  """

  def __init__(self,
               num_layers: int = 2,
               magnitude: float = 10.,
               cutout_const: float = 40.,
               translate_const: float = 100.,
               magnitude_std: float = 0.0,
               prob_to_apply: Optional[float] = None,
               exclude_ops: Optional[List[str]] = None):
    """Applies the RandAugment policy to images.

    Args:
      num_layers: Integer, the number of augmentation transformations to apply
        sequentially to an image. Represented as (N) in the paper. Usually best
        values will be in the range [1, 3].
      magnitude: Shared magnitude across all augmentation operations (stored
        as a float). Represented as (M) in the paper. Usually best values are
        in the range [5, 10].
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.
      magnitude_std: randomness of the severity as proposed by the authors of
        the timm library.
      prob_to_apply: The probability to apply the selected augmentation at each
        layer. `None` means the selected augmentation is always kept.
      exclude_ops: exclude selected operations.
    """
    super().__init__()

    self.num_layers = num_layers
    self.magnitude = float(magnitude)
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.prob_to_apply = (
        float(prob_to_apply) if prob_to_apply is not None else None)
    self.available_ops = [
        'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
        'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY',
        'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
    ]
    self.magnitude_std = magnitude_std
    if exclude_ops:
      self.available_ops = [
          op for op in self.available_ops if op not in exclude_ops
      ]

  # Alternate constructor: it receives the class as `cls`, so it has to be a
  # classmethod to be callable as `RandAugment.build_for_detection(...)`.
  @classmethod
  def build_for_detection(cls,
                          num_layers: int = 2,
                          magnitude: float = 10.,
                          cutout_const: float = 40.,
                          translate_const: float = 100.,
                          magnitude_std: float = 0.0,
                          prob_to_apply: Optional[float] = None,
                          exclude_ops: Optional[List[str]] = None):
    """Builds a RandAugment that modifies bboxes for geometric transforms.

    Takes the same arguments as the constructor. After construction, the
    image-only geometric ops in `available_ops` are replaced by their
    box-aware `*_BBox` counterparts; `exclude_ops` is therefore matched
    against the image-only names.

    Returns:
      The configured augmenter instance.
    """
    augmenter = cls(
        num_layers=num_layers,
        magnitude=magnitude,
        cutout_const=cutout_const,
        translate_const=translate_const,
        magnitude_std=magnitude_std,
        prob_to_apply=prob_to_apply,
        exclude_ops=exclude_ops)
    box_aware_ops_by_base_name = {
        'Rotate': 'Rotate_BBox',
        'ShearX': 'ShearX_BBox',
        'ShearY': 'ShearY_BBox',
        'TranslateX': 'TranslateX_BBox',
        'TranslateY': 'TranslateY_BBox',
    }
    augmenter.available_ops = [
        box_aware_ops_by_base_name.get(op_name, op_name)
        for op_name in augmenter.available_ops
    ]
    return augmenter

  def _distort_common(
      self,
      image: tf.Tensor,
      bboxes: Optional[tf.Tensor] = None
  ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
    """Distorts the image and optionally bounding boxes.

    Args:
      image: the image tensor; non-uint8 inputs are clipped to [0, 255] and
        cast to uint8 for processing, then cast back on return.
      bboxes: optional bounding boxes passed to every op alongside the image.

    Returns:
      The tuple `(image, bboxes)` after `num_layers` rounds of augmentation.
      `bboxes` is `None` when no boxes were given.
    """
    input_image_type = image.dtype

    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    replace_value = [128] * 3
    min_prob, max_prob = 0.2, 0.8

    aug_image = image
    aug_bboxes = bboxes

    for _ in range(self.num_layers):
      # `maxval` is exclusive, so the sampled index ranges over
      # [0, len(available_ops)]. The extra index matches no branch and falls
      # through to the identity `default` below.
      op_to_select = tf.random.uniform([],
                                       maxval=len(self.available_ops) + 1,
                                       dtype=tf.int32)

      branch_fns = []
      for (i, op_name) in enumerate(self.available_ops):
        prob = tf.random.uniform([],
                                 minval=min_prob,
                                 maxval=max_prob,
                                 dtype=tf.float32)
        func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
                                           replace_value, self.cutout_const,
                                           self.translate_const,
                                           self.magnitude_std)
        branch_fns.append((
            i,
            # Bind `func`/`args` as defaults so each branch keeps its own op
            # instead of the last one of the loop.
            # pylint:disable=g-long-lambda
            lambda selected_func=func, selected_args=args: selected_func(
                image, bboxes, *selected_args)))
        # pylint:enable=g-long-lambda

      aug_image, aug_bboxes = tf.switch_case(
          branch_index=op_to_select,
          branch_fns=branch_fns,
          default=lambda: (tf.identity(image), _maybe_identity(bboxes)))  # pylint: disable=cell-var-from-loop

      if self.prob_to_apply is not None:
        # Keep the augmented result with probability `prob_to_apply`,
        # otherwise fall back to this layer's input.
        aug_image, aug_bboxes = tf.cond(
            tf.random.uniform(shape=[], dtype=tf.float32) < self.prob_to_apply,
            lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)),
            lambda: (tf.identity(image), _maybe_identity(bboxes)))
      image = aug_image
      bboxes = aug_bboxes

    image = tf.cast(image, dtype=input_image_type)
    return image, bboxes

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    image, _ = self._distort_common(image)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    image, bboxes = self._distort_common(image, bboxes)
    assert bboxes is not None
    return image, bboxes
class RandomErasing(ImageAugment):
  """Applies RandomErasing to a single image.

  Reference: https://arxiv.org/abs/1708.04896

  Implementation is inspired by
  https://github.com/rwightman/pytorch-image-models.
  """

  def __init__(self,
               probability: float = 0.25,
               min_area: float = 0.02,
               max_area: float = 1 / 3,
               min_aspect: float = 0.3,
               max_aspect: Optional[float] = None,
               min_count=1,
               max_count=1,
               trials=10):
    """Applies RandomErasing to a single image.

    Args:
      probability: Probability of augmenting the image. Defaults to `0.25`.
      min_area: Minimum area of the random erasing rectangle, as a fraction of
        the image area. Defaults to `0.02`.
      max_area: Maximum area of the random erasing rectangle, as a fraction of
        the image area. Defaults to `1/3`.
      min_aspect: Minimum aspect rate of the random erasing rectangle. Defaults
        to `0.3`.
      max_aspect: Maximum aspect rate of the random erasing rectangle. Defaults
        to `None`, in which case `1 / min_aspect` is used.
      min_count: Minimum number of erased rectangles. Defaults to `1`.
      max_count: Maximum number of erased rectangles. Defaults to `1`.
      trials: Maximum number of trials to randomly sample a rectangle that
        fulfills constraint. Defaults to `10`.
    """
    self._probability = probability
    self._min_area = float(min_area)
    self._max_area = float(max_area)
    # The aspect ratio is sampled uniformly in log space.
    self._min_log_aspect = math.log(min_aspect)
    self._max_log_aspect = math.log(max_aspect or 1 / min_aspect)
    self._min_count = min_count
    self._max_count = max_count
    self._trials = trials

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """Applies RandomErasing to single `image`.

    Args:
      image (tf.Tensor): Of shape [height, width, 3] representing an image.

    Returns:
      tf.Tensor: The augmented version of `image`.
    """
    uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0)
    erase_cond = tf.less(uniform_random, self._probability)
    image = tf.cond(erase_cond, lambda: self._erase(image), lambda: image)
    return image

  def _erase(self, image: tf.Tensor) -> tf.Tensor:
    """Erases between `min_count` and `max_count` rectangles from `image`.

    For each rectangle, up to `trials` candidate sizes are sampled and the
    first one that is strictly smaller than the image in both dimensions is
    filled via `_fill_rectangle`.

    Args:
      image: the image tensor; dimensions 0 and 1 are read as height and
        width.

    Returns:
      The image with the sampled rectangles filled in.
    """
    if self._min_count == self._max_count:
      count = self._min_count
    else:
      # `maxval` is exclusive for integer sampling, so `max_count + 1` makes
      # the range [min_count, max_count] inclusive. The previous bound
      # (`max_count - min_count + 1`) was only correct for `min_count == 0`.
      count = tf.random.uniform(
          shape=[],
          minval=int(self._min_count),
          maxval=int(self._max_count) + 1,
          dtype=tf.int32)

    image_height = tf.shape(image)[0]
    image_width = tf.shape(image)[1]
    area = tf.cast(image_width * image_height, tf.float32)

    for _ in range(count):
      # Work around since break is not supported in tf.function
      is_trial_successful = False
      for _ in range(self._trials):
        if not is_trial_successful:
          erase_area = tf.random.uniform(
              shape=[],
              minval=area * self._min_area,
              maxval=area * self._max_area)
          aspect_ratio = tf.math.exp(
              tf.random.uniform(
                  shape=[],
                  minval=self._min_log_aspect,
                  maxval=self._max_log_aspect))

          half_height = tf.cast(
              tf.math.round(tf.math.sqrt(erase_area * aspect_ratio) / 2),
              dtype=tf.int32)
          half_width = tf.cast(
              tf.math.round(tf.math.sqrt(erase_area / aspect_ratio) / 2),
              dtype=tf.int32)

          if 2 * half_height < image_height and 2 * half_width < image_width:
            center_height = tf.random.uniform(
                shape=[],
                minval=0,
                maxval=int(image_height - 2 * half_height),
                dtype=tf.int32)
            center_width = tf.random.uniform(
                shape=[],
                minval=0,
                maxval=int(image_width - 2 * half_width),
                dtype=tf.int32)

            # NOTE(review): what `replace=None` fills the rectangle with is
            # decided by `_fill_rectangle` -- see its definition.
            image = _fill_rectangle(
                image,
                center_width,
                center_height,
                half_width,
                half_height,
                replace=None)

            is_trial_successful = True

    return image
class MixupAndCutmix: | |
"""Applies Mixup and/or Cutmix to a batch of images. | |
- Mixup: https://arxiv.org/abs/1710.09412 | |
- Cutmix: https://arxiv.org/abs/1905.04899 | |
Implementation is inspired by https://github.com/rwightman/pytorch-image-models
""" | |
def __init__(self,
             num_classes: int,
             mixup_alpha: float = .8,
             cutmix_alpha: float = 1.,
             prob: float = 1.0,
             switch_prob: float = 0.5,
             label_smoothing: float = 0.1):
  """Applies Mixup and/or Cutmix to a batch of images.

  Args:
    num_classes (int): Number of classes.
    mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a
      beta distribution (for each image). If zero Mixup is deactivated.
      Defaults to .8.
    cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from a
      beta distribution (for each image). If zero Cutmix is deactivated.
      Defaults to 1..
    prob (float, optional): Of augmenting the batch. Defaults to 1.0.
    switch_prob (float, optional): Probability of applying Cutmix for the
      batch. Defaults to 0.5. It is overridden when exactly one of the two
      alphas is zero.
    label_smoothing (float, optional): Constant for label smoothing. Defaults
      to 0.1.
  """
  self.num_classes = num_classes
  self.label_smoothing = label_smoothing
  self.mix_prob = prob
  self.mixup_alpha = mixup_alpha
  self.cutmix_alpha = cutmix_alpha
  self.switch_prob = switch_prob
  self.mode = 'batch'
  self.mixup_enabled = True

  # With only one technique active, pin the switch so the uniform draw in
  # `distort` always selects it: -1 is never exceeded (always Mixup), 1 is
  # always exceeded (always Cutmix).
  mixup_only = self.mixup_alpha and not self.cutmix_alpha
  cutmix_only = not self.mixup_alpha and self.cutmix_alpha
  if mixup_only:
    self.switch_prob = -1
  elif cutmix_only:
    self.switch_prob = 1
def __call__(self, images: tf.Tensor,
             labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
  """Makes the instance callable; forwards to `distort`."""
  augmented = self.distort(images, labels)
  return augmented
def distort(self, images: tf.Tensor,
            labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
  """Applies Mixup and/or Cutmix to batch of images and transforms labels.

  With probability `mix_prob` the batch is mixed: Cutmix is chosen with
  probability `switch_prob`, Mixup otherwise, and the result is passed to
  `_update_labels`. Otherwise the images are returned unchanged together with
  `_smooth_labels(labels)`.

  Args:
    images (tf.Tensor): Of shape [batch_size, height, width, 3] representing a
      batch of image, or [batch_size, time, height, width, 3] representing a
      batch of video.
    labels (tf.Tensor): Of shape [batch_size, ] representing the class id for
      each image of the batch.

  Returns:
    Tuple[tf.Tensor, tf.Tensor]: The augmented version of `image` and
      `labels`.
  """
  labels = tf.reshape(labels, [-1])
  should_mix = tf.less(
      tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob)

  def _mix_batch():
    use_cutmix = tf.less(
        tf.random.uniform(shape=[], minval=0., maxval=1.0), self.switch_prob)
    mixed = tf.cond(
        use_cutmix,
        lambda: self._cutmix(images, labels),
        lambda: self._mixup(images, labels))
    return self._update_labels(*mixed)

  def _keep_batch():
    return images, self._smooth_labels(labels)

  return tf.cond(should_mix, _mix_batch, _keep_batch)
@staticmethod
def _sample_from_beta(alpha, beta, shape):
  """Draws samples from a Beta(alpha, beta) distribution.

  Uses the identity X / (X + Y) ~ Beta(alpha, beta) for independent
  X ~ Gamma(alpha, 1) and Y ~ Gamma(beta, 1).

  Args:
    alpha: first concentration parameter of the beta distribution.
    beta: second concentration parameter of the beta distribution.
    shape: shape of the tensor of samples to draw.

  Returns:
    A tensor of the requested shape with values in [0, 1].
  """
  # `alpha`/`beta` must be the *shape* parameter of the gamma draws. Passing
  # them as the rate with a fixed shape of 1 (as done before) yields two
  # exponentials whose ratio is always Uniform(0, 1), i.e. Beta(1, 1),
  # regardless of the requested parameters.
  sample_alpha = tf.random.gamma(shape, alpha)
  sample_beta = tf.random.gamma(shape, beta)
  return sample_alpha / (sample_alpha + sample_beta)
def _cutmix(self, images: tf.Tensor,
            labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Applies cutmix.

  Each example gets a rectangle filled from the example at the mirrored batch
  position (`tf.reverse(images, [0])`). The rectangle size is derived from a
  Beta(`cutmix_alpha`, `cutmix_alpha`) sample and its center is drawn
  uniformly over the image.

  Args:
    images: Rank-4 [batch, height, width, channels] images or rank-5
      [batch, time, height, width, channels] videos.
    labels: Class ids, one per example; returned unchanged.

  Returns:
    A tuple `(images, labels, lam)` where `lam` is a float32 tensor with one
    value per example: one minus the ratio of the nominal cut area to the
    image area.

  Raises:
    ValueError: If `images` is neither rank 4 nor rank 5.
  """
  lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha, self.cutmix_alpha,
                                         tf.shape(labels))
  # Side-length ratio of the cut box, so that its area ratio is (1 - lam).
  ratio = tf.math.sqrt(1 - lam)

  batch_size = tf.shape(images)[0]

  if images.shape.rank == 4:
    image_height, image_width = tf.shape(images)[1], tf.shape(images)[2]
    fill_fn = _fill_rectangle
  elif images.shape.rank == 5:
    # Videos carry an extra time axis before height and width.
    image_height, image_width = tf.shape(images)[2], tf.shape(images)[3]
    fill_fn = _fill_rectangle_video
  else:
    raise ValueError('Bad image rank: {}'.format(images.shape.rank))

  cut_height = tf.cast(
      ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32)
  # The width of the box must scale with the image width (not its height),
  # otherwise non-square inputs get a mis-sized box and a wrong `lam`.
  cut_width = tf.cast(
      ratio * tf.cast(image_width, dtype=tf.float32), dtype=tf.int32)

  random_center_height = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32)
  random_center_width = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32)

  # Recompute lam from the integer-truncated box so that the label weights
  # match the box size actually requested from `fill_fn`.
  bbox_area = cut_height * cut_width
  lam = 1. - bbox_area / (image_height * image_width)
  lam = tf.cast(lam, dtype=tf.float32)

  # NOTE(review): `fill_fn` is defined elsewhere in this file; it is assumed
  # to paste the last argument into the rectangle described by the center and
  # half-extent arguments -- confirm against its definition.
  # Only `fn_output_signature` is passed: the deprecated `dtype` argument is
  # superseded by it and must describe the output, not the inputs.
  images = tf.map_fn(
      lambda x: fill_fn(*x),
      (images, random_center_width, random_center_height, cut_width // 2,
       cut_height // 2, tf.reverse(images, [0])),
      fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=images.dtype))

  return images, labels, lam
def _mixup(self, images: tf.Tensor,
           labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Applies mixup.

  Each example is linearly blended with the example at the mirrored batch
  position (`tf.reverse(images, [0])`) using a per-example weight sampled
  from Beta(`mixup_alpha`, `mixup_alpha`).

  Args:
    images: Rank-4 [batch, height, width, channels] images or rank-5
      [batch, time, height, width, channels] videos.
    labels: Class ids, one per example; returned unchanged.

  Returns:
    A tuple `(images, labels, lam)` where `lam` is a rank-1 tensor holding
    the blending weight of each example.

  Raises:
    ValueError: If `images` is neither rank 4 nor rank 5.
  """
  lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha,
                                         tf.shape(labels))
  rank = images.shape.rank
  if rank not in (4, 5):
    raise ValueError('Bad image rank: {}'.format(rank))

  # One trailing singleton axis per non-batch axis so the weights broadcast
  # over every pixel (and frame) of their example.
  lam_broadcast = tf.reshape(lam, [-1] + [1] * (rank - 1))
  lam_cast = tf.cast(lam_broadcast, dtype=images.dtype)
  images = lam_cast * images + (1. - lam_cast) * tf.reverse(images, [0])

  # Flatten rather than squeeze: squeezing would also drop the batch axis
  # when the batch holds a single example, returning a scalar instead of the
  # per-example vector that `_cutmix` returns.
  return images, labels, tf.reshape(lam, [-1])
def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor:
  """One-hot encodes class ids with label smoothing applied.

  The smoothing mass `self.label_smoothing` is spread evenly over all
  `self.num_classes` classes; the true class keeps the remainder on top of
  its even share.

  Args:
    labels: Tensor of class ids.

  Returns:
    The smoothed one-hot encoding of `labels`, with a trailing axis of size
    `self.num_classes`.
  """
  uniform_share = self.label_smoothing / self.num_classes
  true_class_value = 1. - self.label_smoothing + uniform_share
  return tf.one_hot(
      labels,
      depth=self.num_classes,
      on_value=true_class_value,
      off_value=uniform_share)
def _update_labels(self, images: tf.Tensor, labels: tf.Tensor,
                   lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
  """Blends the smoothed labels of each example with those of its partner.

  The partner of an example is the one at the mirrored batch position, i.e.
  the same pairing obtained with `tf.reverse(..., [0])`.

  Args:
    images: Already mixed images; returned unchanged.
    labels: Class ids, one per example.
    lam: Weight given to each example's own label; reshaped to a column so
      that it broadcasts over the class axis.

  Returns:
    A tuple `(images, labels)` where `labels` are the blended soft targets.
  """
  own_targets = self._smooth_labels(labels)
  partner_targets = tf.reverse(own_targets, [0])
  weight = tf.reshape(lam, [-1, 1])
  blended_targets = weight * own_targets + (1. - weight) * partner_targets
  return images, blended_targets