Spaces:

deanna-emery
/

ASL-MoViNet-T5-translator

Runtime error

File size: 11,838 Bytes
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser and processing for segmentation datasets."""

import tensorflow as tf, tf_keras
from official.vision.configs import semantic_segmentation as config_lib
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.dataloaders import utils
from official.vision.ops import preprocess_ops


class Decoder(decoder.Decoder):
  """A tf.Example decoder for segmentation task."""

  def __init__(self,
               image_feature=config_lib.DenseFeatureConfig(),
               additional_dense_features=None):
    self._keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        'image/height':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.io.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        image_feature.feature_name:
            tf.io.FixedLenFeature((), tf.string, default_value='')
    }
    if additional_dense_features:
      for feature in additional_dense_features:
        self._keys_to_features[feature.feature_name] = tf.io.FixedLenFeature(
            (), tf.string, default_value='')

  def decode(self, serialized_example):
    return tf.io.parse_single_example(serialized_example,
                                      self._keys_to_features)


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               crop_size=None,
               resize_eval_groundtruth=True,
               gt_is_matting_map=False,
               groundtruth_padded_size=None,
               ignore_label=255,
               aug_rand_hflip=False,
               preserve_aspect_ratio=True,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               dtype='float32',
               image_feature=config_lib.DenseFeatureConfig(),
               additional_dense_features=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      crop_size: `Tensor` or `list` for [height, width] of the crop. If
        specified a training crop of size crop_size is returned. This is useful
        for cropping original images during training while evaluating on
        original image sizes.
      resize_eval_groundtruth: `bool`, if True, eval ground-truth masks are
        resized to output_size.
      gt_is_matting_map: `bool`, if True, the expected mask is in the range
        between 0 and 255. The parser will normalize the value of the mask into
        the range between 0 and 1.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the ground-truth masks are
        padded to this size.
      ignore_label: `int` the pixel with ignore label will not used for training
        and evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
        otherwise, the image is resized to output_size.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
      image_feature: the config for the image input (usually RGB). Defaults to
        the config for a 3-channel image with key = `image/encoded` and ImageNet
        dataset mean/stddev.
      additional_dense_features: `list` of DenseFeatureConfig for additional
        dense features.
    """
    self._output_size = output_size
    self._crop_size = crop_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError('groundtruth_padded_size ([height, width]) needs to be'
                       'specified when resize_eval_groundtruth is False.')
    self._gt_is_matting_map = gt_is_matting_map
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label
    self._preserve_aspect_ratio = preserve_aspect_ratio

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # dtype.
    self._dtype = dtype

    self._image_feature = image_feature
    self._additional_dense_features = additional_dense_features

  def _prepare_image_and_label(self, data):
    """Prepare normalized image and label."""
    height = data['image/height']
    width = data['image/width']

    label = tf.io.decode_image(
        data['image/segmentation/class/encoded'], channels=1)
    label = tf.reshape(label, (1, height, width))
    label = tf.cast(label, tf.float32)

    image = tf.io.decode_image(
        data[self._image_feature.feature_name],
        channels=self._image_feature.num_channels,
        dtype=tf.uint8)
    image = tf.reshape(image, (height, width, self._image_feature.num_channels))
    # Normalizes the image feature with mean and std values, which are divided
    # by 255 because an uint8 image are re-scaled automatically. Images other
    # than uint8 type will be wrongly normalized.
    image = preprocess_ops.normalize_image(
        image, [mean / 255.0 for mean in self._image_feature.mean],
        [stddev / 255.0 for stddev in self._image_feature.stddev])

    if self._additional_dense_features:
      input_list = [image]
      for feature_cfg in self._additional_dense_features:
        feature = tf.io.decode_image(
            data[feature_cfg.feature_name],
            channels=feature_cfg.num_channels,
            dtype=tf.uint8)
        feature = tf.reshape(feature, (height, width, feature_cfg.num_channels))
        feature = preprocess_ops.normalize_image(
            feature, [mean / 255.0 for mean in feature_cfg.mean],
            [stddev / 255.0 for stddev in feature_cfg.stddev])
        input_list.append(feature)
      concat_input = tf.concat(input_list, axis=2)
    else:
      concat_input = image

    if not self._preserve_aspect_ratio:
      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      concat_input = tf.image.resize(
          concat_input, self._output_size, method='bilinear')
      label = tf.image.resize(label, self._output_size, method='nearest')
      label = tf.reshape(label[:, :, -1], [1] + self._output_size)

    return concat_input, label

  def _parse_train_data(self, data):
    """Parses data for training and evaluation."""
    image, label = self._prepare_image_and_label(data)

    # Normalize the label into the range of 0 and 1 for matting ground-truth.
    # Note that the input ground-truth labels must be 0 to 255, and do not
    # contain ignore_label. For gt_is_matting_map case, ignore_label is only
    # used for padding the labels.
    if self._gt_is_matting_map:
      scale = tf.constant(255.0, dtype=tf.float32)
      scale = tf.expand_dims(scale, axis=0)
      scale = tf.expand_dims(scale, axis=0)
      label = tf.cast(label, tf.float32) / scale

    if self._crop_size:

      label = tf.reshape(label, [data['image/height'], data['image/width'], 1])
      # If output_size is specified, resize image, and label to desired
      # output_size.
      if self._output_size:
        image = tf.image.resize(image, self._output_size, method='bilinear')
        label = tf.image.resize(label, self._output_size, method='nearest')

      image_mask = tf.concat([image, label], axis=2)
      image_mask_crop = tf.image.random_crop(
          image_mask, self._crop_size + [tf.shape(image_mask)[-1]])
      image = image_mask_crop[:, :, :-1]
      label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, _, label = preprocess_ops.random_horizontal_flip(
          image, masks=label)

    train_image_size = self._crop_size if self._crop_size else self._output_size
    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        train_image_size,
        train_image_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops boxes.
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Pad label and make sure the padded region assigned to the ignore label.
    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)
    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                 train_image_size, offset)
    label -= 1
    label = tf.where(
        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)
    valid_mask = tf.not_equal(label, self._ignore_label)

    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info,
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels

  def _parse_eval_data(self, data):
    """Parses data for training and evaluation."""
    image, label = self._prepare_image_and_label(data)

    # Binarize mask if ground-truth is a matting map
    if self._gt_is_matting_map:
      label = tf.divide(tf.cast(label, dtype=tf.float32), 255.0)
      label = utils.binarize_matting_map(label)

    # The label is first offset by +1 and then padded with 0.
    label += 1
    label = tf.expand_dims(label, axis=3)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image, self._output_size, self._output_size)

    if self._resize_eval_groundtruth:
      # Resizes eval masks to match input image sizes. In that case, mean IoU
      # is computed on output_size not the original size of the images.
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
                                                   self._output_size, offset)
    else:
      label = tf.image.pad_to_bounding_box(label, 0, 0,
                                           self._groundtruth_padded_size[0],
                                           self._groundtruth_padded_size[1])

    label -= 1
    label = tf.where(
        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
    label = tf.squeeze(label, axis=0)

    valid_mask = tf.not_equal(label, self._ignore_label)
    labels = {
        'masks': label,
        'valid_masks': valid_mask,
        'image_info': image_info
    }

    # Cast image as self._dtype
    image = tf.cast(image, dtype=self._dtype)

    return image, labels