# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Implementation of the Panoptic Quality metric. Panoptic Quality is an instance-based metric for evaluating the task of image parsing, aka panoptic segmentation. Please see the paper for details: "Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018. Note that this metric class is branched from https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality.py """ import collections from typing import Any, Dict, Optional, Tuple, Union import numpy as np import tensorflow as tf, tf_keras from official.vision.ops import box_ops _EPSILON = 1e-10 def realdiv_maybe_zero(x, y): """Element-wise x / y where y may contain zeros, for those returns 0 too.""" return np.where( np.less(np.abs(y), _EPSILON), np.zeros_like(x), np.divide(x, y)) def _ids_to_counts(id_array): """Given a numpy array, a mapping from each unique entry to its count.""" ids, counts = np.unique(id_array, return_counts=True) return dict(zip(ids, counts)) class PanopticQuality: """Metric class for Panoptic Quality. "Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr Dollar. https://arxiv.org/abs/1801.00868 """ def __init__(self, num_categories, ignored_label, max_instances_per_category, offset): """Initialization for PanopticQualityMetric. Args: num_categories: The number of segmentation categories (or "classes" in the dataset). ignored_label: A category id that is ignored in evaluation, e.g. the void label as defined in COCO panoptic segmentation dataset. max_instances_per_category: The maximum number of instances for each category. Used in ensuring unique instance labels. offset: The maximum number of unique labels. This is used, by multiplying the ground-truth labels, to generate unique ids for individual regions of overlap between ground-truth and predicted segments. """ self.num_categories = num_categories self.ignored_label = ignored_label self.max_instances_per_category = max_instances_per_category self.offset = offset self.reset() def _naively_combine_labels(self, category_mask, instance_mask): """Naively creates a combined label array from categories and instances.""" return (category_mask.astype(np.uint32) * self.max_instances_per_category + instance_mask.astype(np.uint32)) def compare_and_accumulate(self, groundtruths, predictions): """Compares predictions with ground-truths, and accumulates the metrics. It is not assumed that instance ids are unique across different categories. See for example combine_semantic_and_instance_predictions.py in official PanopticAPI evaluation code for issues to consider when fusing category and instance labels. Instances ids of the ignored category have the meaning that id 0 is "void" and remaining ones are crowd instances. Args: groundtruths: A dictionary contains ground-truth labels. It should contain the following fields. - category_mask: A 2D numpy uint16 array of ground-truth per-pixel category labels. - instance_mask: A 2D numpy uint16 array of ground-truth per-pixel instance labels. predictions: A dictionary contains the model outputs. It should contain the following fields. - category_array: A 2D numpy uint16 array of predicted per-pixel category labels. - instance_array: A 2D numpy uint16 array of predicted instance labels. """ groundtruth_category_mask = groundtruths['category_mask'] groundtruth_instance_mask = groundtruths['instance_mask'] predicted_category_mask = predictions['category_mask'] predicted_instance_mask = predictions['instance_mask'] # First, combine the category and instance labels so that every unique # value for (category, instance) is assigned a unique integer label. pred_segment_id = self._naively_combine_labels(predicted_category_mask, predicted_instance_mask) gt_segment_id = self._naively_combine_labels(groundtruth_category_mask, groundtruth_instance_mask) # Pre-calculate areas for all ground-truth and predicted segments. gt_segment_areas = _ids_to_counts(gt_segment_id) pred_segment_areas = _ids_to_counts(pred_segment_id) # We assume there is only one void segment and it has instance id = 0. void_segment_id = self.ignored_label * self.max_instances_per_category # There may be other ignored ground-truth segments with instance id > 0, # find those ids using the unique segment ids extracted with the area # computation above. ignored_segment_ids = { gt_segment_id for gt_segment_id in gt_segment_areas if (gt_segment_id // self.max_instances_per_category) == self.ignored_label } # Next, combine the ground-truth and predicted labels. Divide up the pixels # based on which ground-truth segment and predicted segment they belong to, # this will assign a different 32-bit integer label to each choice of # (ground-truth segment, predicted segment), encoded as # gt_segment_id * offset + pred_segment_id. intersection_id_array = ( gt_segment_id.astype(np.uint64) * self.offset + pred_segment_id.astype(np.uint64)) # For every combination of (ground-truth segment, predicted segment) with a # non-empty intersection, this counts the number of pixels in that # intersection. intersection_areas = _ids_to_counts(intersection_id_array) # Helper function that computes the area of the overlap between a predicted # segment and the ground-truth void/ignored segment. def prediction_void_overlap(pred_segment_id): void_intersection_id = void_segment_id * self.offset + pred_segment_id return intersection_areas.get(void_intersection_id, 0) # Compute overall ignored overlap. def prediction_ignored_overlap(pred_segment_id): total_ignored_overlap = 0 for ignored_segment_id in ignored_segment_ids: intersection_id = ignored_segment_id * self.offset + pred_segment_id total_ignored_overlap += intersection_areas.get(intersection_id, 0) return total_ignored_overlap # Sets that are populated with segments which ground-truth/predicted # segments have been matched with overlapping predicted/ground-truth # segments respectively. gt_matched = set() pred_matched = set() # Calculate IoU per pair of intersecting segments of the same category. for intersection_id, intersection_area in intersection_areas.items(): gt_segment_id = int(intersection_id // self.offset) pred_segment_id = int(intersection_id % self.offset) gt_category = int(gt_segment_id // self.max_instances_per_category) pred_category = int(pred_segment_id // self.max_instances_per_category) if gt_category != pred_category: continue # Union between the ground-truth and predicted segments being compared # does not include the portion of the predicted segment that consists of # ground-truth "void" pixels. union = ( gt_segment_areas[gt_segment_id] + pred_segment_areas[pred_segment_id] - intersection_area - prediction_void_overlap(pred_segment_id)) iou = intersection_area / union if iou > 0.5: self.tp_per_class[gt_category] += 1 self.iou_per_class[gt_category] += iou gt_matched.add(gt_segment_id) pred_matched.add(pred_segment_id) # Count false negatives for each category. for gt_segment_id in gt_segment_areas: if gt_segment_id in gt_matched: continue category = gt_segment_id // self.max_instances_per_category # Failing to detect a void segment is not a false negative. if category == self.ignored_label: continue self.fn_per_class[category] += 1 # Count false positives for each category. for pred_segment_id in pred_segment_areas: if pred_segment_id in pred_matched: continue # A false positive is not penalized if is mostly ignored in the # ground-truth. if (prediction_ignored_overlap(pred_segment_id) / pred_segment_areas[pred_segment_id]) > 0.5: continue category = pred_segment_id // self.max_instances_per_category self.fp_per_class[category] += 1 def _valid_categories(self): """Categories with a "valid" value for the metric, have > 0 instances. We will ignore the `ignore_label` class and other classes which have `tp + fn + fp = 0`. Returns: Boolean array of shape `[num_categories]`. """ valid_categories = np.not_equal( self.tp_per_class + self.fn_per_class + self.fp_per_class, 0) if self.ignored_label >= 0 and self.ignored_label < self.num_categories: valid_categories[self.ignored_label] = False return valid_categories def result_per_category(self): """For supported metrics, return individual per-category metric values. Returns: A dictionary contains all per-class metrics, each metrics is a numpy array of shape `[self.num_categories]`, where index `i` is the metrics value over only that category. """ sq_per_class = realdiv_maybe_zero(self.iou_per_class, self.tp_per_class) rq_per_class = realdiv_maybe_zero( self.tp_per_class, self.tp_per_class + 0.5 * self.fn_per_class + 0.5 * self.fp_per_class) return { 'sq_per_class': sq_per_class, 'rq_per_class': rq_per_class, 'pq_per_class': np.multiply(sq_per_class, rq_per_class) } def result(self, is_thing=None): """Computes and returns the detailed metric results over all comparisons. Args: is_thing: A boolean array of length `num_categories`. The entry `is_thing[category_id]` is True iff that category is a "thing" category instead of "stuff." Returns: A dictionary with a breakdown of metrics and/or metric factors by things, stuff, and all categories. """ results = self.result_per_category() valid_categories = self._valid_categories() # If known, break down which categories are valid _and_ things/stuff. category_sets = collections.OrderedDict() category_sets['All'] = valid_categories if is_thing is not None: category_sets['Things'] = np.logical_and(valid_categories, is_thing) category_sets['Stuff'] = np.logical_and(valid_categories, np.logical_not(is_thing)) for category_set_name, in_category_set in category_sets.items(): if np.any(in_category_set): results.update({ f'{category_set_name}_pq': np.mean(results['pq_per_class'][in_category_set]), f'{category_set_name}_sq': np.mean(results['sq_per_class'][in_category_set]), f'{category_set_name}_rq': np.mean(results['rq_per_class'][in_category_set]), # The number of categories in this subset. f'{category_set_name}_num_categories': np.sum(in_category_set.astype(np.int32)), }) else: results.update({ f'{category_set_name}_pq': 0., f'{category_set_name}_sq': 0., f'{category_set_name}_rq': 0., f'{category_set_name}_num_categories': 0 }) return results def reset(self): """Resets the accumulation to the metric class's state at initialization.""" self.iou_per_class = np.zeros(self.num_categories, dtype=np.float64) self.tp_per_class = np.zeros(self.num_categories, dtype=np.float64) self.fn_per_class = np.zeros(self.num_categories, dtype=np.float64) self.fp_per_class = np.zeros(self.num_categories, dtype=np.float64) def _get_instance_class_ids( category_mask: tf.Tensor, instance_mask: tf.Tensor, max_num_instances: int, ignored_label: int, ) -> tf.Tensor: """Get the class id of each instance (index starts from 1).""" # (batch_size, height, width) instance_mask = tf.where( (instance_mask == 0) | (category_mask == ignored_label), -1, instance_mask ) # (batch_size, height, width, max_num_instances + 1) instance_binary_mask = tf.one_hot( instance_mask, max_num_instances + 1, dtype=tf.int32 ) # (batch_size, max_num_instances + 1) result = tf.reduce_max( instance_binary_mask * category_mask[..., tf.newaxis], axis=[1, 2] ) # If not an instance, sets the class id to -1. return tf.where(result == 0, -1, result) class PanopticQualityV2(tf_keras.metrics.Metric): """Panoptic quality metrics with vectorized implementation. This implementation is supported on TPU. "Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr Dollar. https://arxiv.org/abs/1801.00868 """ def __init__( self, num_categories: int, is_thing: Optional[Tuple[bool, ...]] = None, max_num_instances: int = 255, ignored_label: int = 255, rescale_predictions: bool = False, name: Optional[str] = None, dtype: Optional[Union[str, tf.dtypes.DType]] = tf.float32, ): """Initialization for PanopticQualityV2. Args: num_categories: the number of categories. is_thing: a boolean array of length `num_categories`. The entry `is_thing[category_id]` is True iff that category is a "thing" category instead of "stuff". Default to `None`, and it means categories are not classified into these two categories. max_num_instances: the maximum number of instances in an image. ignored_label: a category id that is ignored in evaluation, e.g. the void label as defined in COCO panoptic segmentation dataset. rescale_predictions: whether to scale back prediction to original image sizes. If True, the image_info of the groundtruth is used to rescale predictions. name: string name of the metric instance. dtype: data type of the metric result. """ super().__init__(name=name, dtype=dtype) self._num_categories = num_categories if is_thing is not None: self._is_thing = is_thing else: self._is_thing = [True] * self._num_categories self._max_num_instances = max_num_instances self._ignored_label = ignored_label self._rescale_predictions = rescale_predictions # Variables self.tp_count = self.add_weight( 'tp_count', shape=[self._num_categories], initializer='zeros', dtype=tf.float32, ) self.fp_count = self.add_weight( 'fp_count', shape=[self._num_categories], initializer='zeros', dtype=tf.float32, ) self.fn_count = self.add_weight( 'fn_count', shape=[self._num_categories], initializer='zeros', dtype=tf.float32, ) self.tp_iou_sum = self.add_weight( 'tp_iou_sum', shape=[self._num_categories], initializer='zeros', dtype=tf.float32, ) def get_config(self) -> Dict[str, Any]: """Returns the serializable config of the metric.""" return { 'num_categories': self._num_categories, 'is_thing': self._is_thing, 'max_num_instances': self._max_num_instances, 'ignored_label': self._ignored_label, 'rescale_predictions': self._rescale_predictions, 'name': self.name, 'dtype': self.dtype, } def reset_state(self): """Resets all of the metric state variables.""" self.tp_count.assign(tf.zeros_like(self.tp_count)) self.fp_count.assign(tf.zeros_like(self.fp_count)) self.fn_count.assign(tf.zeros_like(self.fn_count)) self.tp_iou_sum.assign(tf.zeros_like(self.tp_iou_sum)) def update_state( self, y_true: Dict[str, tf.Tensor], y_pred: Dict[str, tf.Tensor] ): category_mask = tf.convert_to_tensor(y_pred['category_mask'], tf.int32) instance_mask = tf.convert_to_tensor(y_pred['instance_mask'], tf.int32) gt_category_mask = tf.convert_to_tensor(y_true['category_mask'], tf.int32) gt_instance_mask = tf.convert_to_tensor(y_true['instance_mask'], tf.int32) if self._rescale_predictions: _, height, width = gt_category_mask.get_shape().as_list() # Instead of cropping the masks to the original image shape (dynamic), # here we keep the mask shape (fixed) and ignore the pixels outside the # original image shape. image_shape = tf.cast(y_true['image_info'][:, 0, :], tf.int32) # (batch_size, 2) y0_x0 = tf.broadcast_to( tf.constant([[0, 0]], dtype=tf.int32), tf.shape(image_shape) ) # (batch_size, 4) image_shape_bbox = tf.concat([y0_x0, image_shape], axis=1) # (batch_size, height, width) image_shape_masks = box_ops.bbox2mask( bbox=image_shape_bbox, image_height=height, image_width=width, dtype=tf.bool, ) # (batch_size, height, width) category_mask = tf.where( image_shape_masks, category_mask, self._ignored_label ) instance_mask = tf.where(image_shape_masks, instance_mask, 0) gt_category_mask = tf.where( image_shape_masks, gt_category_mask, self._ignored_label ) gt_instance_mask = tf.where(image_shape_masks, gt_instance_mask, 0) self._update_thing_classes( category_mask, instance_mask, gt_category_mask, gt_instance_mask ) self._update_stuff_classes(category_mask, gt_category_mask) def _update_thing_classes( self, category_mask: tf.Tensor, instance_mask: tf.Tensor, gt_category_mask: tf.Tensor, gt_instance_mask: tf.Tensor, ): _, height, width = category_mask.get_shape().as_list() # (batch_size, num_detections + 1) instance_class_ids = _get_instance_class_ids( category_mask, instance_mask, self._max_num_instances, self._ignored_label, ) # (batch_size, num_gts + 1) gt_instance_class_ids = _get_instance_class_ids( gt_category_mask, gt_instance_mask, self._max_num_instances, self._ignored_label, ) # (batch_size, height, width) valid_mask = gt_category_mask != self._ignored_label # (batch_size, height, width, num_detections + 1) instance_binary_masks = tf.one_hot( tf.where(instance_mask > 0, instance_mask, -1), self._max_num_instances + 1, on_value=True, off_value=False, ) # (batch_size, height, width, num_gts + 1) gt_instance_binary_masks = tf.one_hot( tf.where(gt_instance_mask > 0, gt_instance_mask, -1), self._max_num_instances + 1, on_value=True, off_value=False, ) # (batch_size, height * width, num_detections + 1) flattened_binary_masks = tf.reshape( instance_binary_masks & valid_mask[..., tf.newaxis], [-1, height * width, self._max_num_instances + 1], ) # (batch_size, height * width, num_gts + 1) flattened_gt_binary_masks = tf.reshape( gt_instance_binary_masks & valid_mask[..., tf.newaxis], [-1, height * width, self._max_num_instances + 1], ) # (batch_size, num_detections + 1, height * width) flattened_binary_masks = tf.transpose(flattened_binary_masks, [0, 2, 1]) # (batch_size, num_detections + 1, num_gts + 1) intersection = tf.matmul( tf.cast(flattened_binary_masks, tf.float32), tf.cast(flattened_gt_binary_masks, tf.float32), ) union = ( tf.math.count_nonzero( flattened_binary_masks, axis=-1, keepdims=True, dtype=tf.float32 ) + tf.math.count_nonzero( flattened_gt_binary_masks, axis=-2, keepdims=True, dtype=tf.float32 ) - intersection ) # (batch_size, num_detections + 1, num_gts + 1) detection_to_gt_ious = tf.math.divide_no_nan(intersection, union) detection_matches_gt = ( (detection_to_gt_ious > 0.5) & ( instance_class_ids[:, :, tf.newaxis] == gt_instance_class_ids[:, tf.newaxis, :] ) & (gt_instance_class_ids[:, tf.newaxis, :] > 0) ) # (batch_size, num_gts + 1) is_tp = tf.reduce_any(detection_matches_gt, axis=1) # (batch_size, num_gts + 1) tp_iou = tf.reduce_max( tf.where(detection_matches_gt, detection_to_gt_ious, 0), axis=1 ) # (batch_size, num_detections + 1) is_fp = tf.reduce_any(instance_binary_masks, axis=[1, 2]) & ~tf.reduce_any( detection_matches_gt, axis=2 ) # (batch_size, height, width, num_detections + 1) fp_binary_mask = is_fp[:, tf.newaxis, tf.newaxis, :] & instance_binary_masks # (batch_size, num_detections + 1) fp_area = tf.math.count_nonzero( fp_binary_mask, axis=[1, 2], dtype=tf.float32 ) # (batch_size, num_detections + 1) fp_crowd_or_ignored_area = tf.math.count_nonzero( fp_binary_mask & ( ( # An instance detection matches a crowd ground truth instance if # the instance class of the detection matches the class of the # ground truth and the instance id of the ground truth is 0 (the # instance is crowd). (instance_mask > 0) & (category_mask > 0) & (gt_category_mask == category_mask) & (gt_instance_mask == 0) ) | (gt_category_mask == self._ignored_label) )[..., tf.newaxis], axis=[1, 2], dtype=tf.float32, ) # Don't count the detection as false positive if over 50% pixels of the # instance detection are crowd of the matching class or ignored pixels in # ground truth. # (batch_size, num_detections + 1) is_fp &= tf.math.divide_no_nan(fp_crowd_or_ignored_area, fp_area) <= 0.5 # (batch_size, num_detections + 1, num_categories) detection_by_class = tf.one_hot( instance_class_ids, self._num_categories, on_value=True, off_value=False ) # (batch_size, num_gts + 1, num_categories) gt_by_class = tf.one_hot( gt_instance_class_ids, self._num_categories, on_value=True, off_value=False, ) # (num_categories,) gt_count = tf.math.count_nonzero(gt_by_class, axis=[0, 1], dtype=tf.float32) tp_count = tf.math.count_nonzero( is_tp[..., tf.newaxis] & gt_by_class, axis=[0, 1], dtype=tf.float32 ) fn_count = gt_count - tp_count fp_count = tf.math.count_nonzero( is_fp[..., tf.newaxis] & detection_by_class, axis=[0, 1], dtype=tf.float32, ) tp_iou_sum = tf.reduce_sum( tf.cast(gt_by_class, tf.float32) * tp_iou[..., tf.newaxis], axis=[0, 1] ) self.tp_count.assign_add(tp_count) self.fn_count.assign_add(fn_count) self.fp_count.assign_add(fp_count) self.tp_iou_sum.assign_add(tp_iou_sum) def _update_stuff_classes( self, category_mask: tf.Tensor, gt_category_mask: tf.Tensor ): # (batch_size, height, width, num_categories) category_binary_mask = tf.one_hot( category_mask, self._num_categories, on_value=True, off_value=False ) gt_category_binary_mask = tf.one_hot( gt_category_mask, self._num_categories, on_value=True, off_value=False ) # (batch_size, height, width) valid_mask = gt_category_mask != self._ignored_label # (batch_size, num_categories) intersection = tf.math.count_nonzero( category_binary_mask & gt_category_binary_mask & valid_mask[..., tf.newaxis], axis=[1, 2], dtype=tf.float32, ) union = tf.math.count_nonzero( (category_binary_mask | gt_category_binary_mask) & valid_mask[..., tf.newaxis], axis=[1, 2], dtype=tf.float32, ) iou = tf.math.divide_no_nan(intersection, union) is_thing = tf.constant(self._is_thing, dtype=tf.bool) # (batch_size, num_categories) is_tp = (iou > 0.5) & ~is_thing is_fn = ( tf.reduce_any(gt_category_binary_mask, axis=[1, 2]) & ~is_thing & ~is_tp ) is_fp = ( tf.reduce_any(category_binary_mask, axis=[1, 2]) & ~is_thing & ~is_tp ) # (batch_size, height, width, num_categories) fp_binary_mask = is_fp[:, tf.newaxis, tf.newaxis, :] & category_binary_mask # (batch_size, num_categories) fp_area = tf.math.count_nonzero( fp_binary_mask, axis=[1, 2], dtype=tf.float32 ) fp_ignored_area = tf.math.count_nonzero( fp_binary_mask & (gt_category_mask == self._ignored_label)[..., tf.newaxis], axis=[1, 2], dtype=tf.float32, ) # Don't count the detection as false positive if over 50% pixels of the # stuff detection are ignored pixels in ground truth. is_fp &= tf.math.divide_no_nan(fp_ignored_area, fp_area) <= 0.5 # (num_categories,) tp_count = tf.math.count_nonzero(is_tp, axis=0, dtype=tf.float32) fn_count = tf.math.count_nonzero(is_fn, axis=0, dtype=tf.float32) fp_count = tf.math.count_nonzero(is_fp, axis=0, dtype=tf.float32) tp_iou_sum = tf.reduce_sum(tf.cast(is_tp, tf.float32) * iou, axis=0) self.tp_count.assign_add(tp_count) self.fn_count.assign_add(fn_count) self.fp_count.assign_add(fp_count) self.tp_iou_sum.assign_add(tp_iou_sum) def result(self) -> Dict[str, tf.Tensor]: """Returns the metrics values as a dict.""" # (num_categories,) tp_fn_fp_count = self.tp_count + self.fn_count + self.fp_count is_ignore_label = tf.one_hot( self._ignored_label, self._num_categories, on_value=True, off_value=False, ) sq_per_class = tf.math.divide_no_nan( self.tp_iou_sum, self.tp_count ) * tf.cast(~is_ignore_label, tf.float32) rq_per_class = tf.math.divide_no_nan( self.tp_count, self.tp_count + 0.5 * self.fp_count + 0.5 * self.fn_count ) * tf.cast(~is_ignore_label, tf.float32) pq_per_class = sq_per_class * rq_per_class is_thing = tf.constant(self._is_thing, dtype=tf.bool) result = { # (num_categories,) 'valid_thing_classes': ( (tp_fn_fp_count > 0) & is_thing & ~is_ignore_label ), # (num_categories,) 'valid_stuff_classes': ( (tp_fn_fp_count > 0) & ~is_thing & ~is_ignore_label ), # (num_categories,) 'sq_per_class': sq_per_class, # (num_categories,) 'rq_per_class': rq_per_class, # (num_categories,) 'pq_per_class': pq_per_class, } return result