# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Util functions related to pycocotools and COCO eval.""" import copy import json # Import libraries from absl import logging import numpy as np from PIL import Image from pycocotools import coco from pycocotools import mask as mask_api import six import tensorflow as tf, tf_keras from official.common import dataset_fn from official.vision.dataloaders import tf_example_decoder from official.vision.ops import box_ops from official.vision.ops import mask_ops class COCOWrapper(coco.COCO): """COCO wrapper class. This class wraps COCO API object, which provides the following additional functionalities: 1. Support string type image id. 2. Support loading the ground-truth dataset using the external annotation dictionary. 3. Support loading the prediction results using the external annotation dictionary. """ def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None): """Instantiates a COCO-style API object. Args: eval_type: either 'box' or 'mask'. annotation_file: a JSON file that stores annotations of the eval dataset. This is required if `gt_dataset` is not provided. gt_dataset: the ground-truth eval datatset in COCO API format. """ if ((annotation_file and gt_dataset) or ((not annotation_file) and (not gt_dataset))): raise ValueError('One and only one of `annotation_file` and `gt_dataset` ' 'needs to be specified.') if eval_type not in ['box', 'mask']: raise ValueError('The `eval_type` can only be either `box` or `mask`.') coco.COCO.__init__(self, annotation_file=annotation_file) self._eval_type = eval_type if gt_dataset: self.dataset = gt_dataset self.createIndex() def loadRes(self, predictions): """Loads result file and return a result api object. Args: predictions: a list of dictionary each representing an annotation in COCO format. The required fields are `image_id`, `category_id`, `score`, `bbox`, `segmentation`. Returns: res: result COCO api object. Raises: ValueError: if the set of image id from predctions is not the subset of the set of image id of the ground-truth dataset. """ res = coco.COCO() res.dataset['images'] = copy.deepcopy(self.dataset['images']) res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) image_ids = [ann['image_id'] for ann in predictions] if set(image_ids) != (set(image_ids) & set(self.getImgIds())): raise ValueError('Results do not correspond to the current dataset!') for ann in predictions: x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2], ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]] if self._eval_type == 'box': ann['area'] = ann['bbox'][2] * ann['bbox'][3] ann['segmentation'] = [ [x1, y1, x1, y2, x2, y2, x2, y1]] elif self._eval_type == 'mask': ann['area'] = mask_api.area(ann['segmentation']) res.dataset['annotations'] = copy.deepcopy(predictions) res.createIndex() return res def convert_predictions_to_coco_annotations(predictions): """Converts a batch of predictions to annotations in COCO format. Args: predictions: a dictionary of lists of numpy arrays including the following fields. 'K' below denotes the maximum number of instances per image. Required fields: - source_id: a list of numpy arrays of int or string of shape [batch_size]. - detection_boxes: a list of numpy arrays of float of shape [batch_size, K, 4], where coordinates are in the original image space (not the scaled image space). - detection_classes: a list of numpy arrays of int of shape [batch_size, K]. - detection_scores: a list of numpy arrays of float of shape [batch_size, K]. Optional fields: - detection_masks: a list of numpy arrays of float of shape [batch_size, K, mask_height, mask_width]. - detection_keypoints: a list of numpy arrays of float of shape [batch_size, K, num_keypoints, 2] Returns: coco_predictions: prediction in COCO annotation format. """ coco_predictions = [] num_batches = len(predictions['source_id']) max_num_detections = predictions['detection_classes'][0].shape[1] use_outer_box = 'detection_outer_boxes' in predictions for i in range(num_batches): predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh( predictions['detection_boxes'][i]) if use_outer_box: predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh( predictions['detection_outer_boxes'][i]) mask_boxes = predictions['detection_outer_boxes'] else: mask_boxes = predictions['detection_boxes'] batch_size = predictions['source_id'][i].shape[0] if 'detection_keypoints' in predictions: # Adds extra ones to indicate the visibility for each keypoint as is # recommended by MSCOCO. Also, convert keypoint from [y, x] to [x, y] # as mandated by COCO. num_keypoints = predictions['detection_keypoints'][i].shape[2] coco_keypoints = np.concatenate( [ predictions['detection_keypoints'][i][..., 1:], predictions['detection_keypoints'][i][..., :1], np.ones([batch_size, max_num_detections, num_keypoints, 1]), ], axis=-1, ).astype(int) for j in range(batch_size): if 'detection_masks' in predictions: image_masks = mask_ops.paste_instance_masks( predictions['detection_masks'][i][j], mask_boxes[i][j], int(predictions['image_info'][i][j, 0, 0]), int(predictions['image_info'][i][j, 0, 1]), ) binary_masks = (image_masks > 0.0).astype(np.uint8) encoded_masks = [ mask_api.encode(np.asfortranarray(binary_mask)) for binary_mask in list(binary_masks) ] for k in range(max_num_detections): ann = {} ann['image_id'] = predictions['source_id'][i][j] ann['category_id'] = predictions['detection_classes'][i][j, k] ann['bbox'] = predictions['detection_boxes'][i][j, k] ann['score'] = predictions['detection_scores'][i][j, k] if 'detection_masks' in predictions: ann['segmentation'] = encoded_masks[k] if 'detection_keypoints' in predictions: ann['keypoints'] = coco_keypoints[j, k].flatten().tolist() coco_predictions.append(ann) for i, ann in enumerate(coco_predictions): ann['id'] = i + 1 return coco_predictions def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None): """Converts ground-truths to the dataset in COCO format. Args: groundtruths: a dictionary of numpy arrays including the fields below. Note that each element in the list represent the number for a single example without batch dimension. 'K' below denotes the actual number of instances for each image. Required fields: - source_id: a list of numpy arrays of int or string of shape [batch_size]. - height: a list of numpy arrays of int of shape [batch_size]. - width: a list of numpy arrays of int of shape [batch_size]. - num_detections: a list of numpy arrays of int of shape [batch_size]. - boxes: a list of numpy arrays of float of shape [batch_size, K, 4], where coordinates are in the original image space (not the normalized coordinates). - classes: a list of numpy arrays of int of shape [batch_size, K]. Optional fields: - is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If th field is absent, it is assumed that this instance is not crowd. - areas: a list of numy arrays of float of shape [batch_size, K]. If the field is absent, the area is calculated using either boxes or masks depending on which one is available. - masks: a list of numpy arrays of string of shape [batch_size, K], label_map: (optional) a dictionary that defines items from the category id to the category name. If `None`, collect the category mapping from the `groundtruths`. Returns: coco_groundtruths: the ground-truth dataset in COCO format. """ source_ids = np.concatenate(groundtruths['source_id'], axis=0) heights = np.concatenate(groundtruths['height'], axis=0) widths = np.concatenate(groundtruths['width'], axis=0) gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w in zip(source_ids, heights, widths)] gt_annotations = [] num_batches = len(groundtruths['source_id']) for i in range(num_batches): logging.log_every_n( logging.INFO, 'convert_groundtruths_to_coco_dataset: Processing annotation %d', 100, i) max_num_instances = groundtruths['classes'][i].shape[1] batch_size = groundtruths['source_id'][i].shape[0] for j in range(batch_size): num_instances = groundtruths['num_detections'][i][j] if num_instances > max_num_instances: logging.warning( 'num_groundtruths is larger than max_num_instances, %d v.s. %d', num_instances, max_num_instances) num_instances = max_num_instances for k in range(int(num_instances)): ann = {} ann['image_id'] = int(groundtruths['source_id'][i][j]) if 'is_crowds' in groundtruths: ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k]) else: ann['iscrowd'] = 0 ann['category_id'] = int(groundtruths['classes'][i][j, k]) boxes = groundtruths['boxes'][i] ann['bbox'] = [ float(boxes[j, k, 1]), float(boxes[j, k, 0]), float(boxes[j, k, 3] - boxes[j, k, 1]), float(boxes[j, k, 2] - boxes[j, k, 0])] if 'areas' in groundtruths: ann['area'] = float(groundtruths['areas'][i][j, k]) else: ann['area'] = float( (boxes[j, k, 3] - boxes[j, k, 1]) * (boxes[j, k, 2] - boxes[j, k, 0])) if 'masks' in groundtruths: if isinstance(groundtruths['masks'][i][j, k], tf.Tensor): mask = Image.open( six.BytesIO(groundtruths['masks'][i][j, k].numpy())) else: mask = Image.open( six.BytesIO(groundtruths['masks'][i][j, k])) np_mask = np.array(mask, dtype=np.uint8) np_mask[np_mask > 0] = 255 encoded_mask = mask_api.encode(np.asfortranarray(np_mask)) ann['segmentation'] = encoded_mask # Ensure the content of `counts` is JSON serializable string. if 'counts' in ann['segmentation']: ann['segmentation']['counts'] = six.ensure_str( ann['segmentation']['counts']) if 'areas' not in groundtruths: ann['area'] = mask_api.area(encoded_mask) if 'keypoints' in groundtruths: keypoints = groundtruths['keypoints'][i] coco_keypoints = [] num_valid_keypoints = 0 for z in range(len(keypoints[j, k, :, 1])): # Convert from [y, x] to [x, y] as mandated by COCO. x = float(keypoints[j, k, z, 1]) y = float(keypoints[j, k, z, 0]) coco_keypoints.append(x) coco_keypoints.append(y) if tf.math.is_nan(x) or tf.math.is_nan(y) or ( x == 0 and y == 0): visibility = 0 else: visibility = 2 num_valid_keypoints = num_valid_keypoints + 1 coco_keypoints.append(visibility) ann['keypoints'] = coco_keypoints ann['num_keypoints'] = num_valid_keypoints gt_annotations.append(ann) for i, ann in enumerate(gt_annotations): ann['id'] = i + 1 if label_map: gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map] else: category_ids = [gt['category_id'] for gt in gt_annotations] gt_categories = [{'id': i} for i in set(category_ids)] gt_dataset = { 'images': gt_images, 'categories': gt_categories, 'annotations': copy.deepcopy(gt_annotations), } return gt_dataset class COCOGroundtruthGenerator: """Generates the ground-truth annotations from a single example.""" def __init__(self, file_pattern, file_type, num_examples, include_mask, regenerate_source_id=False): self._file_pattern = file_pattern self._num_examples = num_examples self._include_mask = include_mask self._dataset_fn = dataset_fn.pick_dataset_fn(file_type) self._regenerate_source_id = regenerate_source_id def _parse_single_example(self, example): """Parses a single serialized tf.Example proto. Args: example: a serialized tf.Example proto string. Returns: A dictionary of ground-truth with the following fields: source_id: a scalar tensor of int64 representing the image source_id. height: a scalar tensor of int64 representing the image height. width: a scalar tensor of int64 representing the image width. boxes: a float tensor of shape [K, 4], representing the ground-truth boxes in absolute coordinates with respect to the original image size. classes: a int64 tensor of shape [K], representing the class labels of each instances. is_crowds: a bool tensor of shape [K], indicating whether the instance is crowd. areas: a float tensor of shape [K], indicating the area of each instance. masks: a string tensor of shape [K], containing the bytes of the png mask of each instance. """ decoder = tf_example_decoder.TfExampleDecoder( include_mask=self._include_mask, regenerate_source_id=self._regenerate_source_id) decoded_tensors = decoder.decode(example) image = decoded_tensors['image'] image_size = tf.shape(image)[0:2] boxes = box_ops.denormalize_boxes( decoded_tensors['groundtruth_boxes'], image_size) source_id = decoded_tensors['source_id'] if source_id.dtype is tf.string: source_id = tf.strings.to_number(source_id, out_type=tf.int64) groundtruths = { 'source_id': source_id, 'height': decoded_tensors['height'], 'width': decoded_tensors['width'], 'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0], 'boxes': boxes, 'classes': decoded_tensors['groundtruth_classes'], 'is_crowds': decoded_tensors['groundtruth_is_crowd'], 'areas': decoded_tensors['groundtruth_area'], } if self._include_mask: groundtruths.update({ 'masks': decoded_tensors['groundtruth_instance_masks_png'], }) return groundtruths def _build_pipeline(self): """Builds data pipeline to generate ground-truth annotations.""" dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False) dataset = dataset.interleave( map_func=lambda filename: self._dataset_fn(filename).prefetch(1), cycle_length=None, num_parallel_calls=tf.data.experimental.AUTOTUNE) dataset = dataset.take(self._num_examples) dataset = dataset.map(self._parse_single_example, num_parallel_calls=tf.data.experimental.AUTOTUNE) dataset = dataset.batch(1, drop_remainder=False) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) return dataset def __call__(self): return self._build_pipeline() def scan_and_generator_annotation_file(file_pattern: str, file_type: str, num_samples: int, include_mask: bool, annotation_file: str, regenerate_source_id: bool = False): """Scans and generate the COCO-style annotation JSON file given a dataset.""" groundtruth_generator = COCOGroundtruthGenerator( file_pattern, file_type, num_samples, include_mask, regenerate_source_id) generate_annotation_file(groundtruth_generator, annotation_file) def generate_annotation_file(groundtruth_generator, annotation_file): """Generates COCO-style annotation JSON file given a ground-truth generator.""" groundtruths = {} logging.info('Loading groundtruth annotations from dataset to memory...') for i, groundtruth in enumerate(groundtruth_generator()): logging.log_every_n(logging.INFO, 'generate_annotation_file: Processing annotation %d', 100, i) for k, v in six.iteritems(groundtruth): if k not in groundtruths: groundtruths[k] = [v] else: groundtruths[k].append(v) gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths) logging.info('Saving groundtruth annotations to the JSON file...') with tf.io.gfile.GFile(annotation_file, 'w') as f: f.write(json.dumps(gt_dataset)) logging.info('Done saving the JSON file...')