# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser and processing for segmentation datasets."""

import tensorflow as tf, tf_keras

from official.vision.configs import semantic_segmentation as config_lib
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import preprocess_ops


class Decoder(decoder.Decoder):
  """A tf.Example decoder for segmentation task."""

  def __init__(self,
               image_feature=config_lib.DenseFeatureConfig(),
               additional_dense_features=None):
    self._keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/height':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        image_feature.feature_name:
            tf.io.FixedLenFeature((), tf.string, default_value='')
    }
    if additional_dense_features:
      for feature in additional_dense_features:
        self._keys_to_features[feature.feature_name] = tf.io.FixedLenFeature(
            (), tf.string, default_value='')

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)
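

# Illustrative sketch (not part of the original module): the Decoder is
# typically mapped over a dataset of serialized tf.Examples read from
# TFRecord files. The file path below is a hypothetical placeholder.
def _example_decode_pipeline():
  segmentation_decoder = Decoder()
  dataset = tf.data.TFRecordDataset('/path/to/segmentation.tfrecord')
  # Each element becomes a dict of scalar tensors keyed by the feature names
  # registered in Decoder.__init__ (e.g. 'image/encoded',
  # 'image/segmentation/class/encoded').
  return dataset.map(segmentation_decoder.decode)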


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               gt_is_matting_map=False,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32',
               image_feature=config_lib.DenseFeatureConfig(),
               additional_dense_features=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of the output image.
        The output_size should be divisible by the largest feature stride
        2^max_level.
      crop_size: `Tensor` or `list` for [height, width] of the crop. If
        specified, a training crop of size crop_size is returned. This is
        useful for cropping original images during training while evaluating
        on original image sizes.
      resize_eval_groundtruth: `bool`, if True, eval ground-truth masks are
        resized to output_size.
      gt_is_matting_map: `bool`, if True, the expected mask is in the range
        between 0 and 255. The parser will normalize the value of the mask
        into the range between 0 and 1.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the ground-truth masks are
        padded to this size.
      ignore_label: `int`, pixels with this label are not used for training
        and evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved;
        otherwise, the image is resized to output_size.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
      image_feature: the config for the image input (usually RGB). Defaults
        to the config for a 3-channel image with key = `image/encoded` and
        ImageNet dataset mean/stddev.
      additional_dense_features: `list` of DenseFeatureConfig for additional
        dense features.
    """
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
                       'specified when resize_eval_groundtruth is False.')
    self._gt_is_matting_map = gt_is_matting_map
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype

    self._image_feature = image_feature
    self._additional_dense_features = additional_dense_features

  def _prepare_image_and_label(self, data):
    """Prepares the normalized image and label."""
    height = data['image/height']
    width = data['image/width']

    label = tf.io.decode_image(
        data['image/segmentation/class/encoded'], channels=1)
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)

    image = tf.io.decode_image(
        data[self._image_feature.feature_name],
        channels=self._image_feature.num_channels,
        dtype=tf.uint8)
    image = tf.reshape(image,
                       (height, width, self._image_feature.num_channels))
    # Normalizes the image feature with mean and std values, which are divided
    # by 255 because uint8 images are rescaled to [0, 1] automatically. Images
    # of dtypes other than uint8 would be wrongly normalized.
    image = preprocess_ops.normalize_image(
        image, [mean / 255.0 for mean in self._image_feature.mean],
        [stddev / 255.0 for stddev in self._image_feature.stddev])

    if self._additional_dense_features:
      input_list = [image]
      for feature_cfg in self._additional_dense_features:
        feature = tf.io.decode_image(
            data[feature_cfg.feature_name],
            channels=feature_cfg.num_channels,
            dtype=tf.uint8)
        feature = tf.reshape(feature,
                             (height, width, feature_cfg.num_channels))
        feature = preprocess_ops.normalize_image(
            feature, [mean / 255.0 for mean in feature_cfg.mean],
            [stddev / 255.0 for stddev in feature_cfg.stddev])
        input_list.append(feature)
      concat_input = tf.concat(input_list, axis=2)
    else:
      concat_input = image

    if not self._preserve_aspect_ratio:
      label = tf.reshape(label,
                         [data['image/height'], data['image/width'], 1])
      concat_input = tf.image.resize(
          concat_input, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return concat_input, label
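
  # Illustrative sketch (not part of the original class): shows why the mean
  # and stddev passed to `preprocess_ops.normalize_image` above are divided
  # by 255. `normalize_image` converts a uint8 image to float32 in [0, 1], so
  # statistics given in pixel units ([0, 255]) must be rescaled to match. The
  # mean/stddev values below are commonly used ImageNet statistics and are
  # hypothetical here.
  def _example_normalization_rescale(self):
    image_u8 = tf.cast(tf.fill([4, 4, 3], 128), tf.uint8)
    mean = [123.68, 116.78, 103.94]  # Per-channel mean, in [0, 255].
    stddev = [58.40, 57.12, 57.38]  # Per-channel stddev, in [0, 255].
    # Rescales uint8 [0, 255] to float32 [0, 1], mirroring normalize_image.
    image_f = tf.image.convert_image_dtype(image_u8, tf.float32)
    return (image_f - [m / 255.0 for m in mean]) / (
        [s / 255.0 for s in stddev])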

  def _parse_train_data(self, data):
    """Parses data for training."""
    image, label = self._prepare_image_and_label(data)

    # Normalizes the label into the range of [0, 1] for matting ground-truth.
    # Note that the input ground-truth labels must be in the range of
    # [0, 255] and must not contain ignore_label. For the gt_is_matting_map
    # case, ignore_label is only used for padding the labels.
    if self._gt_is_matting_map:
      scale = tf.constant(255.0, dtype=tf.float32)
      scale = tf.expand_dims(scale, axis=0)
      scale = tf.expand_dims(scale, axis=0)
      label = tf.cast(label, tf.float32) / scale

    if self._crop_size:
      label = tf.reshape(label,
                         [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resizes image and label to the desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(
          image_mask, self._crop_size + [tf.shape(image_mask)[-1]])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Extracts the resize scale and offset applied to the image so that the
    # same transform can be applied to the label.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pads the label and makes sure the padded region is assigned the ignore
    # label. The label is first offset by +1 and then padded with 0, so after
    # subtracting 1 every padded pixel becomes -1 and is mapped to
    # ignore_label.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                 train_image_size, offset)
    label -= 1
    label = tf.where(
        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    image, label = self._prepare_image_and_label(data)

    # Binarizes the mask if the ground-truth is a matting map.
    if self._gt_is_matting_map:
      label = tf.divide(tf.cast(label, dtype=tf.float32), 255.0)
      label = utils.binarize_matting_map(label)

    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match the input image size. In that case, mean
      # IoU is computed on output_size, not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      label = tf.image.pad_to_bounding_box(label, 0, 0,
                                           self._groundtruth_padded_size[0],
                                           self._groundtruth_padded_size[1])

    label -= 1
    label = tf.where(
        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info
    }

    # Casts image to self._dtype.
    image = tf.cast(image, dtype=self._dtype)

    return image, labels
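

# Illustrative sketch (not part of the original module): an end-to-end input
# pipeline combining the Decoder and Parser above. The output_size, scale
# range, and TFRecord path are hypothetical, and `parse_fn(is_training=...)`
# is assumed to be the train/eval dispatcher provided by the base
# parser.Parser class.
def _example_train_input_pipeline():
  segmentation_decoder = Decoder()
  segmentation_parser = Parser(
      output_size=[512, 512],
      aug_rand_hflip=True,
      aug_scale_min=0.5,
      aug_scale_max=2.0)
  dataset = tf.data.TFRecordDataset('/path/to/train.tfrecord')
  # Decodes serialized tf.Examples into feature dicts, then parses them into
  # (image, labels) pairs with training-time augmentation.
  dataset = dataset.map(segmentation_decoder.decode)
  dataset = dataset.map(segmentation_parser.parse_fn(is_training=True))
  return dataset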