# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Contains definitions of ROI generator.""" from typing import Optional, Mapping # Import libraries import tensorflow as tf, tf_keras from official.vision.ops import box_ops from official.vision.ops import nms def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor], raw_scores: Mapping[str, tf.Tensor], anchor_boxes: Mapping[str, tf.Tensor], image_shape: tf.Tensor, pre_nms_top_k: int = 2000, pre_nms_score_threshold: float = 0.0, pre_nms_min_size_threshold: float = 0.0, nms_iou_threshold: float = 0.7, num_proposals: int = 1000, use_batched_nms: bool = False, decode_boxes: bool = True, clip_boxes: bool = True, apply_sigmoid_to_score: bool = True): """Proposes RoIs given a group of candidates from different FPN levels. The following describes the steps: 1. For each individual level: a. Apply sigmoid transform if specified. b. Decode boxes if specified. c. Clip boxes if specified. d. Filter small boxes and those fall outside image if specified. e. Apply pre-NMS filtering including pre-NMS top k and score thresholding. f. Apply NMS. 2. Aggregate post-NMS boxes from each level. 3. Apply an overall top k to generate the final selected RoIs. Args: raw_boxes: A `dict` with keys representing FPN levels and values representing box tenors of shape [batch_size, feature_h, feature_w, num_anchors * 4]. raw_scores: A `dict` with keys representing FPN levels and values representing logit tensors of shape [batch_size, feature_h, feature_w, num_anchors]. anchor_boxes: A `dict` with keys representing FPN levels and values representing anchor box tensors of shape [batch_size, feature_h * feature_w * num_anchors, 4]. image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension are [height, width] of the scaled image. pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep before applying NMS. Default: 2000. pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal box score to keep before applying NMS. This is often used as a pre-filtering step for better performance. Default: 0, no filtering is applied. pre_nms_min_size_threshold: A `float` representing the minimal box size in each side (w.r.t. the scaled image) to keep before applying NMS. This is often used as a pre-filtering step for better performance. Default: 0, no filtering is applied. nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold used for NMS. If 0.0, no NMS is applied. Default: 0.7. num_proposals: An `int` of top scoring RPN proposals *in total* to keep after applying NMS. Default: 1000. use_batched_nms: A `bool` indicating whether NMS is applied in batch using `tf.image.combined_non_max_suppression`. Currently only available in CPU/GPU. Default is False. decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded using `anchor_boxes`. If False, use `raw_boxes` directly and ignore `anchor_boxes`. Default is True. clip_boxes: A `bool` indicating whether boxes are first clipped to the scaled image size before appliying NMS. If False, no clipping is applied and `image_shape` is ignored. Default is True. apply_sigmoid_to_score: A `bool` indicating whether apply sigmoid to `raw_scores` before applying NMS. Default is True. Returns: selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4], representing the box coordinates of the selected proposals w.r.t. the scaled image. selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1], representing the scores of the selected proposals. """ with tf.name_scope('multilevel_propose_rois'): rois = [] roi_scores = [] image_shape = tf.expand_dims(image_shape, axis=1) for level in sorted(raw_scores.keys()): with tf.name_scope('level_%s' % level): _, feature_h, feature_w, num_anchors_per_location = ( raw_scores[level].get_shape().as_list()) num_boxes = feature_h * feature_w * num_anchors_per_location this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes]) this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4]) this_level_anchors = tf.cast( tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]), dtype=this_level_scores.dtype) if apply_sigmoid_to_score: this_level_scores = tf.sigmoid(this_level_scores) if decode_boxes: this_level_boxes = box_ops.decode_boxes( this_level_boxes, this_level_anchors) if clip_boxes: this_level_boxes = box_ops.clip_boxes( this_level_boxes, image_shape) if pre_nms_min_size_threshold > 0.0: this_level_boxes, this_level_scores = box_ops.filter_boxes( this_level_boxes, this_level_scores, image_shape, pre_nms_min_size_threshold) this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k) this_level_post_nms_top_k = min(num_boxes, num_proposals) if nms_iou_threshold > 0.0: if use_batched_nms: this_level_rois, this_level_roi_scores, _, _ = ( tf.image.combined_non_max_suppression( tf.expand_dims(this_level_boxes, axis=2), tf.expand_dims(this_level_scores, axis=-1), max_output_size_per_class=this_level_pre_nms_top_k, max_total_size=this_level_post_nms_top_k, iou_threshold=nms_iou_threshold, score_threshold=pre_nms_score_threshold, pad_per_class=False, clip_boxes=False)) else: if pre_nms_score_threshold > 0.0: this_level_boxes, this_level_scores = ( box_ops.filter_boxes_by_scores( this_level_boxes, this_level_scores, pre_nms_score_threshold)) this_level_boxes, this_level_scores = box_ops.top_k_boxes( this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k) this_level_roi_scores, this_level_rois = ( nms.sorted_non_max_suppression_padded( this_level_scores, this_level_boxes, max_output_size=this_level_post_nms_top_k, iou_threshold=nms_iou_threshold)) else: this_level_rois, this_level_roi_scores = box_ops.top_k_boxes( this_level_boxes, this_level_scores, k=this_level_post_nms_top_k) rois.append(this_level_rois) roi_scores.append(this_level_roi_scores) all_rois = tf.concat(rois, axis=1) all_roi_scores = tf.concat(roi_scores, axis=1) with tf.name_scope('top_k_rois'): _, num_valid_rois = all_roi_scores.get_shape().as_list() overall_top_k = min(num_valid_rois, num_proposals) selected_rois, selected_roi_scores = box_ops.top_k_boxes( all_rois, all_roi_scores, k=overall_top_k) return selected_rois, selected_roi_scores @tf_keras.utils.register_keras_serializable(package='Vision') class MultilevelROIGenerator(tf_keras.layers.Layer): """Proposes RoIs for the second stage processing.""" def __init__(self, pre_nms_top_k: int = 2000, pre_nms_score_threshold: float = 0.0, pre_nms_min_size_threshold: float = 0.0, nms_iou_threshold: float = 0.7, num_proposals: int = 1000, test_pre_nms_top_k: int = 1000, test_pre_nms_score_threshold: float = 0.0, test_pre_nms_min_size_threshold: float = 0.0, test_nms_iou_threshold: float = 0.7, test_num_proposals: int = 1000, use_batched_nms: bool = False, **kwargs): """Initializes a ROI generator. The ROI generator transforms the raw predictions from RPN to ROIs. Args: pre_nms_top_k: An `int` of the number of top scores proposals to be kept before applying NMS. pre_nms_score_threshold: A `float` of the score threshold to apply before applying NMS. Proposals whose scores are below this threshold are thrown away. pre_nms_min_size_threshold: A `float` of the threshold of each side of the box (w.r.t. the scaled image). Proposals whose sides are below this threshold are thrown away. nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold. num_proposals: An `int` of the final number of proposals to generate. test_pre_nms_top_k: An `int` of the number of top scores proposals to be kept before applying NMS in testing. test_pre_nms_score_threshold: A `float` of the score threshold to apply before applying NMS in testing. Proposals whose scores are below this threshold are thrown away. test_pre_nms_min_size_threshold: A `float` of the threshold of each side of the box (w.r.t. the scaled image) in testing. Proposals whose sides are below this threshold are thrown away. test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in testing. test_num_proposals: An `int` of the final number of proposals to generate in testing. use_batched_nms: A `bool` of whether or not use `tf.image.combined_non_max_suppression`. **kwargs: Additional keyword arguments passed to Layer. """ self._config_dict = { 'pre_nms_top_k': pre_nms_top_k, 'pre_nms_score_threshold': pre_nms_score_threshold, 'pre_nms_min_size_threshold': pre_nms_min_size_threshold, 'nms_iou_threshold': nms_iou_threshold, 'num_proposals': num_proposals, 'test_pre_nms_top_k': test_pre_nms_top_k, 'test_pre_nms_score_threshold': test_pre_nms_score_threshold, 'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold, 'test_nms_iou_threshold': test_nms_iou_threshold, 'test_num_proposals': test_num_proposals, 'use_batched_nms': use_batched_nms, } super(MultilevelROIGenerator, self).__init__(**kwargs) def call(self, raw_boxes: Mapping[str, tf.Tensor], raw_scores: Mapping[str, tf.Tensor], anchor_boxes: Mapping[str, tf.Tensor], image_shape: tf.Tensor, training: Optional[bool] = None): """Proposes RoIs given a group of candidates from different FPN levels. The following describes the steps: 1. For each individual level: a. Apply sigmoid transform if specified. b. Decode boxes if specified. c. Clip boxes if specified. d. Filter small boxes and those fall outside image if specified. e. Apply pre-NMS filtering including pre-NMS top k and score thresholding. f. Apply NMS. 2. Aggregate post-NMS boxes from each level. 3. Apply an overall top k to generate the final selected RoIs. Args: raw_boxes: A `dict` with keys representing FPN levels and values representing box tenors of shape [batch, feature_h, feature_w, num_anchors * 4]. raw_scores: A `dict` with keys representing FPN levels and values representing logit tensors of shape [batch, feature_h, feature_w, num_anchors]. anchor_boxes: A `dict` with keys representing FPN levels and values representing anchor box tensors of shape [batch, feature_h * feature_w * num_anchors, 4]. image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension are [height, width] of the scaled image. training: A `bool` that indicates whether it is in training mode. Returns: roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed ROIs in the scaled image coordinate. roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the proposed ROIs. """ roi_boxes, roi_scores = _multilevel_propose_rois( raw_boxes, raw_scores, anchor_boxes, image_shape, pre_nms_top_k=( self._config_dict['pre_nms_top_k'] if training else self._config_dict['test_pre_nms_top_k']), pre_nms_score_threshold=( self._config_dict['pre_nms_score_threshold'] if training else self._config_dict['test_pre_nms_score_threshold']), pre_nms_min_size_threshold=( self._config_dict['pre_nms_min_size_threshold'] if training else self._config_dict['test_pre_nms_min_size_threshold']), nms_iou_threshold=( self._config_dict['nms_iou_threshold'] if training else self._config_dict['test_nms_iou_threshold']), num_proposals=( self._config_dict['num_proposals'] if training else self._config_dict['test_num_proposals']), use_batched_nms=self._config_dict['use_batched_nms'], decode_boxes=True, clip_boxes=True, apply_sigmoid_to_score=True) return roi_boxes, roi_scores def get_config(self): return self._config_dict @classmethod def from_config(cls, config, custom_objects=None): return cls(**config)