# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import math import numpy as np import cv2 def clip_boxes_to_image(boxes, height, width): """ Clip the boxes with the height and width of the image size. Args: boxes (ndarray): bounding boxes to peform crop. The dimension is `num boxes` x 4. height (int): the height of the image. width (int): the width of the image. Returns: boxes (ndarray): cropped bounding boxes. """ boxes[:, [0, 2]] = np.minimum( width - 1.0, np.maximum(0.0, boxes[:, [0, 2]]) ) boxes[:, [1, 3]] = np.minimum( height - 1.0, np.maximum(0.0, boxes[:, [1, 3]]) ) return boxes def random_short_side_scale_jitter_list(images, min_size, max_size, boxes=None): """ Perform a spatial short scale jittering on the given images and corresponding boxes. Args: images (list): list of images to perform scale jitter. Dimension is `height` x `width` x `channel`. min_size (int): the minimal size to scale the frames. max_size (int): the maximal size to scale the frames. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: (list): the list of scaled images with dimension of `new height` x `new width` x `channel`. (ndarray or None): the scaled boxes with dimension of `num boxes` x 4. """ size = int(round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size))) height = images[0].shape[0] width = images[0].shape[1] if (width <= height and width == size) or ( height <= width and height == size ): return images, boxes new_width = size new_height = size if width < height: new_height = int(math.floor((float(height) / width) * size)) if boxes is not None: boxes = [ proposal * float(new_height) / height for proposal in boxes ] else: new_width = int(math.floor((float(width) / height) * size)) if boxes is not None: boxes = [proposal * float(new_width) / width for proposal in boxes] return ( [ cv2.resize( image, (new_width, new_height), interpolation=cv2.INTER_LINEAR ).astype(np.float32) for image in images ], boxes, ) def scale(size, image): """ Scale the short side of the image to size. Args: size (int): size to scale the image. image (array): image to perform short side scale. Dimension is `height` x `width` x `channel`. Returns: (ndarray): the scaled image with dimension of `height` x `width` x `channel`. """ height = image.shape[0] width = image.shape[1] if (width <= height and width == size) or ( height <= width and height == size ): return image new_width = size new_height = size if width < height: new_height = int(math.floor((float(height) / width) * size)) else: new_width = int(math.floor((float(width) / height) * size)) img = cv2.resize( image, (new_width, new_height), interpolation=cv2.INTER_LINEAR ) return img.astype(np.float32) def scale_boxes(size, boxes, height, width): """ Scale the short side of the box to size. Args: size (int): size to scale the image. boxes (ndarray): bounding boxes to peform scale. The dimension is `num boxes` x 4. height (int): the height of the image. width (int): the width of the image. Returns: boxes (ndarray): scaled bounding boxes. """ if (width <= height and width == size) or ( height <= width and height == size ): return boxes new_width = size new_height = size if width < height: new_height = int(math.floor((float(height) / width) * size)) boxes *= float(new_height) / height else: new_width = int(math.floor((float(width) / height) * size)) boxes *= float(new_width) / width return boxes def horizontal_flip_list(prob, images, order="CHW", boxes=None): """ Horizontally flip the list of image and optional boxes. Args: prob (float): probability to flip. image (list): ilist of images to perform short side scale. Dimension is `height` x `width` x `channel` or `channel` x `height` x `width`. order (str): order of the `height`, `channel` and `width`. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: (ndarray): the scaled image with dimension of `height` x `width` x `channel`. (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. """ _, width, _ = images[0].shape if np.random.uniform() < prob: if boxes is not None: boxes = [flip_boxes(proposal, width) for proposal in boxes] if order == "CHW": out_images = [] for image in images: image = np.asarray(image).swapaxes(2, 0) image = image[::-1] out_images.append(image.swapaxes(0, 2)) return out_images, boxes elif order == "HWC": return [cv2.flip(image, 1) for image in images], boxes return images, boxes def spatial_shift_crop_list(size, images, spatial_shift_pos, boxes=None): """ Perform left, center, or right crop of the given list of images. Args: size (int): size to crop. image (list): ilist of images to perform short side scale. Dimension is `height` x `width` x `channel` or `channel` x `height` x `width`. spatial_shift_pos (int): option includes 0 (left), 1 (middle), and 2 (right) crop. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: cropped (ndarray): the cropped list of images with dimension of `height` x `width` x `channel`. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. """ assert spatial_shift_pos in [0, 1, 2] height = images[0].shape[0] width = images[0].shape[1] y_offset = int(math.ceil((height - size) / 2)) x_offset = int(math.ceil((width - size) / 2)) if height > width: if spatial_shift_pos == 0: y_offset = 0 elif spatial_shift_pos == 2: y_offset = height - size else: if spatial_shift_pos == 0: x_offset = 0 elif spatial_shift_pos == 2: x_offset = width - size cropped = [ image[y_offset : y_offset + size, x_offset : x_offset + size, :] for image in images ] assert cropped[0].shape[0] == size, "Image height not cropped properly" assert cropped[0].shape[1] == size, "Image width not cropped properly" if boxes is not None: for i in range(len(boxes)): boxes[i][:, [0, 2]] -= x_offset boxes[i][:, [1, 3]] -= y_offset return cropped, boxes def CHW2HWC(image): """ Transpose the dimension from `channel` x `height` x `width` to `height` x `width` x `channel`. Args: image (array): image to transpose. Returns (array): transposed image. """ return image.transpose([1, 2, 0]) def HWC2CHW(image): """ Transpose the dimension from `height` x `width` x `channel` to `channel` x `height` x `width`. Args: image (array): image to transpose. Returns (array): transposed image. """ return image.transpose([2, 0, 1]) def color_jitter_list( images, img_brightness=0, img_contrast=0, img_saturation=0 ): """ Perform color jitter on the list of images. Args: images (list): list of images to perform color jitter. img_brightness (float): jitter ratio for brightness. img_contrast (float): jitter ratio for contrast. img_saturation (float): jitter ratio for saturation. Returns: images (list): the jittered list of images. """ jitter = [] if img_brightness != 0: jitter.append("brightness") if img_contrast != 0: jitter.append("contrast") if img_saturation != 0: jitter.append("saturation") if len(jitter) > 0: order = np.random.permutation(np.arange(len(jitter))) for idx in range(0, len(jitter)): if jitter[order[idx]] == "brightness": images = brightness_list(img_brightness, images) elif jitter[order[idx]] == "contrast": images = contrast_list(img_contrast, images) elif jitter[order[idx]] == "saturation": images = saturation_list(img_saturation, images) return images def lighting_list(imgs, alphastd, eigval, eigvec, alpha=None): """ Perform AlexNet-style PCA jitter on the given list of images. Args: images (list): list of images to perform lighting jitter. alphastd (float): jitter ratio for PCA jitter. eigval (list): eigenvalues for PCA jitter. eigvec (list[list]): eigenvectors for PCA jitter. Returns: out_images (list): the list of jittered images. """ if alphastd == 0: return imgs # generate alpha1, alpha2, alpha3 alpha = np.random.normal(0, alphastd, size=(1, 3)) eig_vec = np.array(eigvec) eig_val = np.reshape(eigval, (1, 3)) rgb = np.sum( eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0), axis=1, ) out_images = [] for img in imgs: for idx in range(img.shape[0]): img[idx] = img[idx] + rgb[2 - idx] out_images.append(img) return out_images def color_normalization(image, mean, stddev): """ Perform color normalization on the image with the given mean and stddev. Args: image (array): image to perform color normalization. mean (float): mean value to subtract. stddev (float): stddev to devide. """ # Input image should in format of CHW assert len(mean) == image.shape[0], "channel mean not computed properly" assert len(stddev) == image.shape[0], "channel stddev not computed properly" for idx in range(image.shape[0]): image[idx] = image[idx] - mean[idx] image[idx] = image[idx] / stddev[idx] return image def pad_image(image, pad_size, order="CHW"): """ Pad the given image with the size of pad_size. Args: image (array): image to pad. pad_size (int): size to pad. order (str): order of the `height`, `channel` and `width`. Returns: img (array): padded image. """ if order == "CHW": img = np.pad( image, ((0, 0), (pad_size, pad_size), (pad_size, pad_size)), mode=str("constant"), ) elif order == "HWC": img = np.pad( image, ((pad_size, pad_size), (pad_size, pad_size), (0, 0)), mode=str("constant"), ) return img def horizontal_flip(prob, image, order="CHW"): """ Horizontally flip the image. Args: prob (float): probability to flip. image (array): image to pad. order (str): order of the `height`, `channel` and `width`. Returns: img (array): flipped image. """ assert order in ["CHW", "HWC"], "order {} is not supported".format(order) if np.random.uniform() < prob: if order == "CHW": image = image[:, :, ::-1] elif order == "HWC": image = image[:, ::-1, :] else: raise NotImplementedError("Unknown order {}".format(order)) return image def flip_boxes(boxes, im_width): """ Horizontally flip the boxes. Args: boxes (array): box to flip. im_width (int): width of the image. Returns: boxes_flipped (array): flipped box. """ boxes_flipped = boxes.copy() boxes_flipped[:, 0::4] = im_width - boxes[:, 2::4] - 1 boxes_flipped[:, 2::4] = im_width - boxes[:, 0::4] - 1 return boxes_flipped def crop_boxes(boxes, x_offset, y_offset): """ Crop the boxes given the offsets. Args: boxes (array): boxes to crop. x_offset (int): offset on x. y_offset (int): offset on y. """ boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset return boxes def random_crop_list(images, size, pad_size=0, order="CHW", boxes=None): """ Perform random crop on a list of images. Args: images (list): list of images to perform random crop. size (int): size to crop. pad_size (int): padding size. order (str): order of the `height`, `channel` and `width`. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: cropped (ndarray): the cropped list of images with dimension of `height` x `width` x `channel`. boxes (list): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. """ # explicitly dealing processing per image order to avoid flipping images. if pad_size > 0: images = [ pad_image(pad_size=pad_size, image=image, order=order) for image in images ] # image format should be CHW. if order == "CHW": if images[0].shape[1] == size and images[0].shape[2] == size: return images, boxes height = images[0].shape[1] width = images[0].shape[2] y_offset = 0 if height > size: y_offset = int(np.random.randint(0, height - size)) x_offset = 0 if width > size: x_offset = int(np.random.randint(0, width - size)) cropped = [ image[:, y_offset : y_offset + size, x_offset : x_offset + size] for image in images ] assert cropped[0].shape[1] == size, "Image not cropped properly" assert cropped[0].shape[2] == size, "Image not cropped properly" elif order == "HWC": if images[0].shape[0] == size and images[0].shape[1] == size: return images, boxes height = images[0].shape[0] width = images[0].shape[1] y_offset = 0 if height > size: y_offset = int(np.random.randint(0, height - size)) x_offset = 0 if width > size: x_offset = int(np.random.randint(0, width - size)) cropped = [ image[y_offset : y_offset + size, x_offset : x_offset + size, :] for image in images ] assert cropped[0].shape[0] == size, "Image not cropped properly" assert cropped[0].shape[1] == size, "Image not cropped properly" if boxes is not None: boxes = [crop_boxes(proposal, x_offset, y_offset) for proposal in boxes] return cropped, boxes def center_crop(size, image): """ Perform center crop on input images. Args: size (int): size of the cropped height and width. image (array): the image to perform center crop. """ height = image.shape[0] width = image.shape[1] y_offset = int(math.ceil((height - size) / 2)) x_offset = int(math.ceil((width - size) / 2)) cropped = image[y_offset : y_offset + size, x_offset : x_offset + size, :] assert cropped.shape[0] == size, "Image height not cropped properly" assert cropped.shape[1] == size, "Image width not cropped properly" return cropped # ResNet style scale jittering: randomly select the scale from # [1/max_size, 1/min_size] def random_scale_jitter(image, min_size, max_size): """ Perform ResNet style random scale jittering: randomly select the scale from [1/max_size, 1/min_size]. Args: image (array): image to perform random scale. min_size (int): min size to scale. max_size (int) max size to scale. Returns: image (array): scaled image. """ img_scale = int( round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)) ) image = scale(img_scale, image) return image def random_scale_jitter_list(images, min_size, max_size): """ Perform ResNet style random scale jittering on a list of image: randomly select the scale from [1/max_size, 1/min_size]. Note that all the image will share the same scale. Args: images (list): list of images to perform random scale. min_size (int): min size to scale. max_size (int) max size to scale. Returns: images (list): list of scaled image. """ img_scale = int( round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)) ) return [scale(img_scale, image) for image in images] def random_sized_crop(image, size, area_frac=0.08): """ Perform random sized cropping on the given image. Random crop with size 8% - 100% image area and aspect ratio in [3/4, 4/3]. Args: image (array): image to crop. size (int): size to crop. area_frac (float): area of fraction. Returns: (array): cropped image. """ for _ in range(0, 10): height = image.shape[0] width = image.shape[1] area = height * width target_area = np.random.uniform(area_frac, 1.0) * area aspect_ratio = np.random.uniform(3.0 / 4.0, 4.0 / 3.0) w = int(round(math.sqrt(float(target_area) * aspect_ratio))) h = int(round(math.sqrt(float(target_area) / aspect_ratio))) if np.random.uniform() < 0.5: w, h = h, w if h <= height and w <= width: if height == h: y_offset = 0 else: y_offset = np.random.randint(0, height - h) if width == w: x_offset = 0 else: x_offset = np.random.randint(0, width - w) y_offset = int(y_offset) x_offset = int(x_offset) cropped = image[y_offset : y_offset + h, x_offset : x_offset + w, :] assert ( cropped.shape[0] == h and cropped.shape[1] == w ), "Wrong crop size" cropped = cv2.resize( cropped, (size, size), interpolation=cv2.INTER_LINEAR ) return cropped.astype(np.float32) return center_crop(size, scale(size, image)) def lighting(img, alphastd, eigval, eigvec): """ Perform AlexNet-style PCA jitter on the given image. Args: image (array): list of images to perform lighting jitter. alphastd (float): jitter ratio for PCA jitter. eigval (array): eigenvalues for PCA jitter. eigvec (list): eigenvectors for PCA jitter. Returns: img (tensor): the jittered image. """ if alphastd == 0: return img # generate alpha1, alpha2, alpha3. alpha = np.random.normal(0, alphastd, size=(1, 3)) eig_vec = np.array(eigvec) eig_val = np.reshape(eigval, (1, 3)) rgb = np.sum( eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0), axis=1, ) for idx in range(img.shape[0]): img[idx] = img[idx] + rgb[2 - idx] return img def random_sized_crop_list(images, size, crop_area_fraction=0.08): """ Perform random sized cropping on the given list of images. Random crop with size 8% - 100% image area and aspect ratio in [3/4, 4/3]. Args: images (list): image to crop. size (int): size to crop. area_frac (float): area of fraction. Returns: (list): list of cropped image. """ for _ in range(0, 10): height = images[0].shape[0] width = images[0].shape[1] area = height * width target_area = np.random.uniform(crop_area_fraction, 1.0) * area aspect_ratio = np.random.uniform(3.0 / 4.0, 4.0 / 3.0) w = int(round(math.sqrt(float(target_area) * aspect_ratio))) h = int(round(math.sqrt(float(target_area) / aspect_ratio))) if np.random.uniform() < 0.5: w, h = h, w if h <= height and w <= width: if height == h: y_offset = 0 else: y_offset = np.random.randint(0, height - h) if width == w: x_offset = 0 else: x_offset = np.random.randint(0, width - w) y_offset = int(y_offset) x_offset = int(x_offset) croppsed_images = [] for image in images: cropped = image[ y_offset : y_offset + h, x_offset : x_offset + w, : ] assert ( cropped.shape[0] == h and cropped.shape[1] == w ), "Wrong crop size" cropped = cv2.resize( cropped, (size, size), interpolation=cv2.INTER_LINEAR ) croppsed_images.append(cropped.astype(np.float32)) return croppsed_images return [center_crop(size, scale(size, image)) for image in images] def blend(image1, image2, alpha): return image1 * alpha + image2 * (1 - alpha) def grayscale(image): """ Convert the image to gray scale. Args: image (tensor): image to convert to gray scale. Dimension is `channel` x `height` x `width`. Returns: img_gray (tensor): image in gray scale. """ # R -> 0.299, G -> 0.587, B -> 0.114. img_gray = np.copy(image) gray_channel = 0.299 * image[2] + 0.587 * image[1] + 0.114 * image[0] img_gray[0] = gray_channel img_gray[1] = gray_channel img_gray[2] = gray_channel return img_gray def saturation(var, image): """ Perform color saturation on the given image. Args: var (float): variance. image (array): image to perform color saturation. Returns: (array): image that performed color saturation. """ img_gray = grayscale(image) alpha = 1.0 + np.random.uniform(-var, var) return blend(image, img_gray, alpha) def brightness(var, image): """ Perform color brightness on the given image. Args: var (float): variance. image (array): image to perform color brightness. Returns: (array): image that performed color brightness. """ img_bright = np.zeros(image.shape).astype(image.dtype) alpha = 1.0 + np.random.uniform(-var, var) return blend(image, img_bright, alpha) def contrast(var, image): """ Perform color contrast on the given image. Args: var (float): variance. image (array): image to perform color contrast. Returns: (array): image that performed color contrast. """ img_gray = grayscale(image) img_gray.fill(np.mean(img_gray[0])) alpha = 1.0 + np.random.uniform(-var, var) return blend(image, img_gray, alpha) def saturation_list(var, images): """ Perform color saturation on the list of given images. Args: var (float): variance. images (list): list of images to perform color saturation. Returns: (list): list of images that performed color saturation. """ alpha = 1.0 + np.random.uniform(-var, var) out_images = [] for image in images: img_gray = grayscale(image) out_images.append(blend(image, img_gray, alpha)) return out_images def brightness_list(var, images): """ Perform color brightness on the given list of images. Args: var (float): variance. images (list): list of images to perform color brightness. Returns: (array): list of images that performed color brightness. """ alpha = 1.0 + np.random.uniform(-var, var) out_images = [] for image in images: img_bright = np.zeros(image.shape).astype(image.dtype) out_images.append(blend(image, img_bright, alpha)) return out_images def contrast_list(var, images): """ Perform color contrast on the given list of images. Args: var (float): variance. images (list): list of images to perform color contrast. Returns: (array): image that performed color contrast. """ alpha = 1.0 + np.random.uniform(-var, var) out_images = [] for image in images: img_gray = grayscale(image) img_gray.fill(np.mean(img_gray[0])) out_images.append(blend(image, img_gray, alpha)) return out_images def color_jitter(image, img_brightness=0, img_contrast=0, img_saturation=0): """ Perform color jitter on the given image. Args: image (array): image to perform color jitter. img_brightness (float): jitter ratio for brightness. img_contrast (float): jitter ratio for contrast. img_saturation (float): jitter ratio for saturation. Returns: image (array): the jittered image. """ jitter = [] if img_brightness != 0: jitter.append("brightness") if img_contrast != 0: jitter.append("contrast") if img_saturation != 0: jitter.append("saturation") if len(jitter) > 0: order = np.random.permutation(np.arange(len(jitter))) for idx in range(0, len(jitter)): if jitter[order[idx]] == "brightness": image = brightness(img_brightness, image) elif jitter[order[idx]] == "contrast": image = contrast(img_contrast, image) elif jitter[order[idx]] == "saturation": image = saturation(img_saturation, image) return image def revert_scaled_boxes(size, boxes, img_height, img_width): """ Revert scaled input boxes to match the original image size. Args: size (int): size of the cropped image. boxes (array): shape (num_boxes, 4). img_height (int): height of original image. img_width (int): width of original image. Returns: reverted_boxes (array): boxes scaled back to the original image size. """ scaled_aspect = np.min([img_height, img_width]) scale_ratio = scaled_aspect / size reverted_boxes = boxes * scale_ratio return reverted_boxes