# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ROI-related ops."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf, tf_keras

from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils


def _select_level_rois(boxes, scores, pre_nms_top_k, post_nms_top_k,
                       nms_threshold, score_threshold, use_batched_nms):
  """Selects the RoIs of one FPN level, with NMS or with a plain top-k.

  Args:
    boxes: the level's boxes, reshaped by the caller to
      [batch_size, num_boxes, 4].
    scores: the level's scores, reshaped by the caller to
      [batch_size, num_boxes].
    pre_nms_top_k: an integer number of top scoring boxes kept before NMS.
    post_nms_top_k: an integer number of boxes kept for this level.
    nms_threshold: a float IoU threshold. If not greater than 0, NMS is skipped
      and only the top `post_nms_top_k` boxes are taken.
    score_threshold: a float minimal score. Only used when NMS is applied.
    use_batched_nms: a boolean selecting
      `tf.image.combined_non_max_suppression` over
      `nms.sorted_non_max_suppression_padded`.

  Returns:
    A `(rois, roi_scores)` tuple for this level.
  """
  if nms_threshold <= 0.0:
    # NMS disabled: rank the boxes of *this* level. The previous code passed
    # the not-yet-assigned output variable here, which raised on the first
    # level and would have reused the previous level's RoIs afterwards.
    return box_utils.top_k_boxes(boxes, scores, k=post_nms_top_k)

  if use_batched_nms:
    # The batched op expects a class axis on both inputs, hence the
    # `expand_dims` calls; the last two outputs are not needed here.
    rois, roi_scores, _, _ = tf.image.combined_non_max_suppression(
        tf.expand_dims(boxes, axis=2),
        tf.expand_dims(scores, axis=-1),
        max_output_size_per_class=pre_nms_top_k,
        max_total_size=post_nms_top_k,
        iou_threshold=nms_threshold,
        score_threshold=score_threshold,
        pad_per_class=False,
        clip_boxes=False)
    return rois, roi_scores

  if score_threshold > 0.0:
    boxes, scores = box_utils.filter_boxes_by_scores(
        boxes, scores, score_threshold)
  boxes, scores = box_utils.top_k_boxes(boxes, scores, k=pre_nms_top_k)
  # Note the (scores, boxes) argument and return order of this helper.
  roi_scores, rois = nms.sorted_non_max_suppression_padded(
      scores,
      boxes,
      max_output_size=post_nms_top_k,
      iou_threshold=nms_threshold)
  return rois, roi_scores


def _multilevel_propose_rois(rpn_boxes,
                             rpn_scores,
                             anchor_boxes,
                             image_shape,
                             rpn_pre_nms_top_k,
                             rpn_post_nms_top_k,
                             rpn_nms_threshold,
                             rpn_score_threshold,
                             rpn_min_size_threshold,
                             decode_boxes,
                             clip_boxes,
                             use_batched_nms,
                             apply_sigmoid_to_score,
                             is_box_lrtb=False,
                             rpn_object_scores=None):
  """Shared implementation behind both public RoI proposal entry points.

  The arguments have the meaning documented on `multilevel_propose_rois`, plus
  the two OLN specific ones documented on
  `OlnROIGenerator.oln_multilevel_propose_rois`.

  Returns:
    A `(selected_rois, selected_roi_scores)` tuple.
  """
  with tf.name_scope('multilevel_propose_rois'):
    rois = []
    roi_scores = []
    # [batch_size, 2] -> [batch_size, 1, 2].
    image_shape = tf.expand_dims(image_shape, axis=1)
    for level in sorted(rpn_scores.keys()):
      with tf.name_scope('level_%d' % level):
        # Requires a fully defined static feature shape; `None` dimensions
        # would fail in the multiplication below.
        _, feature_h, feature_w, num_anchors_per_location = (
            rpn_scores[level].get_shape().as_list())

        num_boxes = feature_h * feature_w * num_anchors_per_location
        this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
        this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
        # Anchors are only touched when decoding, so that `anchor_boxes` is
        # really ignored otherwise, as documented.
        this_level_anchors = None
        if decode_boxes:
          this_level_anchors = tf.cast(
              tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
              dtype=this_level_scores.dtype)

        if rpn_object_scores:
          # The object scores replace (not combine with) the RPN scores.
          this_level_object_scores = tf.reshape(rpn_object_scores[level],
                                                [-1, num_boxes])
          this_level_scores = tf.cast(this_level_object_scores,
                                      this_level_scores.dtype)

        if apply_sigmoid_to_score:
          this_level_scores = tf.sigmoid(this_level_scores)

        if decode_boxes:
          if is_box_lrtb:
            # Box in left-right-top-bottom format.
            this_level_boxes = box_utils.decode_boxes_lrtb(
                this_level_boxes, this_level_anchors)
          else:
            # Box in standard x-y-h-w format.
            this_level_boxes = box_utils.decode_boxes(
                this_level_boxes, this_level_anchors)
        if clip_boxes:
          this_level_boxes = box_utils.clip_boxes(this_level_boxes,
                                                  image_shape)

        if rpn_min_size_threshold > 0.0:
          this_level_boxes, this_level_scores = box_utils.filter_boxes(
              this_level_boxes, this_level_scores, image_shape,
              rpn_min_size_threshold)

        # A level cannot contribute more boxes than it has.
        this_level_rois, this_level_roi_scores = _select_level_rois(
            this_level_boxes,
            this_level_scores,
            pre_nms_top_k=min(num_boxes, rpn_pre_nms_top_k),
            post_nms_top_k=min(num_boxes, rpn_post_nms_top_k),
            nms_threshold=rpn_nms_threshold,
            score_threshold=rpn_score_threshold,
            use_batched_nms=use_batched_nms)
        rois.append(this_level_rois)
        roi_scores.append(this_level_roi_scores)

    all_rois = tf.concat(rois, axis=1)
    all_roi_scores = tf.concat(roi_scores, axis=1)

    with tf.name_scope('top_k_rois'):
      _, num_valid_rois = all_roi_scores.get_shape().as_list()
      # Fewer than `rpn_post_nms_top_k` RoIs are returned when the levels
      # together do not provide that many.
      overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)

      selected_rois, selected_roi_scores = box_utils.top_k_boxes(
          all_rois, all_roi_scores, k=overall_top_k)

    return selected_rois, selected_roi_scores


def multilevel_propose_rois(rpn_boxes,
                            rpn_scores,
                            anchor_boxes,
                            image_shape,
                            rpn_pre_nms_top_k=2000,
                            rpn_post_nms_top_k=1000,
                            rpn_nms_threshold=0.7,
                            rpn_score_threshold=0.0,
                            rpn_min_size_threshold=0.0,
                            decode_boxes=True,
                            clip_boxes=True,
                            use_batched_nms=False,
                            apply_sigmoid_to_score=True):
  """Proposes RoIs given a group of candidates from different FPN levels.

  The following describes the steps:
    1. For each individual level:
      a. Apply sigmoid transform if specified.
      b. Decode boxes if specified.
      c. Clip boxes if specified.
      d. Filter small boxes and those that fall outside the image if
         specified.
      e. Apply pre-NMS filtering including pre-NMS top k and score
         thresholding.
      f. Apply NMS.
    2. Aggregate post-NMS boxes from each level.
    3. Apply an overall top k to generate the final selected RoIs.

  Args:
    rpn_boxes: a dict with keys representing FPN levels and values representing
      box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
    rpn_scores: a dict with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors]. The feature dimensions
      must be statically known.
    anchor_boxes: a dict with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    image_shape: a tensor of shape [batch_size, 2] where the last dimension are
      [height, width] of the scaled image.
    rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
      keep before applying NMS. Default: 2000.
    rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
      keep after applying NMS. It also caps the number of proposals kept per
      level. Default: 1000.
    rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
      used for NMS. If 0.0, no NMS is applied and the top scoring boxes of each
      level are taken directly. Default: 0.7.
    rpn_score_threshold: a float between 0 and 1 representing the minimal box
      score to keep before applying NMS. This is often used as a pre-filtering
      step for better performance. If 0, no filtering is applied. Default: 0.
    rpn_min_size_threshold: a float representing the minimal box size in each
      side (w.r.t. the scaled image) to keep before applying NMS. This is often
      used as a pre-filtering step for better performance. If 0, no filtering
      is applied. Default: 0.
    decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
      `anchor_boxes`. Default: True.
    clip_boxes: a boolean indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied.
      Default: True.
    use_batched_nms: a boolean indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Currently only available in
      CPU/GPU. Default: False.
    apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
      `rpn_scores` before applying NMS. Default: True.

  Returns:
    selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k, 1],
      representing the scores of the selected proposals.
      NOTE(review): scores are handled as rank-2 [batch_size, num_rois] tensors
      up to the final `box_utils.top_k_boxes` call, so the trailing dimension
      of 1 is presumably stale - confirm against that helper.
  """
  return _multilevel_propose_rois(
      rpn_boxes,
      rpn_scores,
      anchor_boxes,
      image_shape,
      rpn_pre_nms_top_k=rpn_pre_nms_top_k,
      rpn_post_nms_top_k=rpn_post_nms_top_k,
      rpn_nms_threshold=rpn_nms_threshold,
      rpn_score_threshold=rpn_score_threshold,
      rpn_min_size_threshold=rpn_min_size_threshold,
      decode_boxes=decode_boxes,
      clip_boxes=clip_boxes,
      use_batched_nms=use_batched_nms,
      apply_sigmoid_to_score=apply_sigmoid_to_score)


class ROIGenerator(tf_keras.layers.Layer):
  """Proposes RoIs for the second stage processing."""

  def __init__(self, params):
    # `params` must expose the train-time `rpn_*` settings, their `test_rpn_*`
    # counterparts and `use_batched_nms` as attributes.
    self._rpn_pre_nms_top_k = params.rpn_pre_nms_top_k
    self._rpn_post_nms_top_k = params.rpn_post_nms_top_k
    self._rpn_nms_threshold = params.rpn_nms_threshold
    self._rpn_score_threshold = params.rpn_score_threshold
    self._rpn_min_size_threshold = params.rpn_min_size_threshold
    self._test_rpn_pre_nms_top_k = params.test_rpn_pre_nms_top_k
    self._test_rpn_post_nms_top_k = params.test_rpn_post_nms_top_k
    self._test_rpn_nms_threshold = params.test_rpn_nms_threshold
    self._test_rpn_score_threshold = params.test_rpn_score_threshold
    self._test_rpn_min_size_threshold = params.test_rpn_min_size_threshold
    self._use_batched_nms = params.use_batched_nms
    super(ROIGenerator, self).__init__(autocast=False)

  def _proposal_params(self, is_training):
    """Returns the proposal hyper-parameters for the given mode as kwargs.

    Args:
      is_training: a bool; the train-time settings are returned when truthy,
        the `test_*` ones otherwise.

    Returns:
      A dict keyed by the matching `multilevel_propose_rois` argument names.
    """
    if is_training:
      return {
          'rpn_pre_nms_top_k': self._rpn_pre_nms_top_k,
          'rpn_post_nms_top_k': self._rpn_post_nms_top_k,
          'rpn_nms_threshold': self._rpn_nms_threshold,
          'rpn_score_threshold': self._rpn_score_threshold,
          'rpn_min_size_threshold': self._rpn_min_size_threshold,
      }
    return {
        'rpn_pre_nms_top_k': self._test_rpn_pre_nms_top_k,
        'rpn_post_nms_top_k': self._test_rpn_post_nms_top_k,
        'rpn_nms_threshold': self._test_rpn_nms_threshold,
        'rpn_score_threshold': self._test_rpn_score_threshold,
        'rpn_min_size_threshold': self._test_rpn_min_size_threshold,
    }

  def call(self, boxes, scores, anchor_boxes, image_shape, is_training):
    """Generates RoI proposals.

    Args:
      boxes: a dict with keys representing FPN levels and values representing
        box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      scores: a dict with keys representing FPN levels and values representing
        logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
      anchor_boxes: a dict with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      image_shape: a tensor of shape [batch_size, 2] where the last dimension
        are [height, width] of the scaled image.
      is_training: a bool indicating whether it is in training or inference
        mode.

    Returns:
      proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
        representing the box coordinates of the proposed RoIs w.r.t. the
        scaled image.
      proposed_roi_scores: a tensor of shape
        [batch_size, rpn_post_nms_top_k, 1], representing the scores of the
        proposed RoIs.
    """
    proposed_rois, proposed_roi_scores = multilevel_propose_rois(
        boxes,
        scores,
        anchor_boxes,
        image_shape,
        decode_boxes=True,
        clip_boxes=True,
        use_batched_nms=self._use_batched_nms,
        apply_sigmoid_to_score=True,
        **self._proposal_params(is_training))
    return proposed_rois, proposed_roi_scores


class OlnROIGenerator(ROIGenerator):
  """Proposes RoIs for the second stage processing."""

  # NOTE(review): this overrides `__call__` rather than `call`, so invoking the
  # layer does not go through the base `Layer.__call__` - confirm this is
  # intended.
  def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
               is_box_lrtb=False, object_scores=None):
    """Generates RoI proposals.

    Args:
      boxes: a dict with keys representing FPN levels and values representing
        box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      scores: a dict with keys representing FPN levels and values representing
        logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
      anchor_boxes: a dict with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      image_shape: a tensor of shape [batch_size, 2] where the last dimension
        are [height, width] of the scaled image.
      is_training: a bool indicating whether it is in training or inference
        mode.
      is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
        bottom) format.
      object_scores: another objectness score (e.g., centerness). In OLN, we use
        object_scores=centerness as a replacement of the scores at each level.
        A dict with keys representing FPN levels and values representing logit
        tensors of shape [batch_size, feature_h, feature_w, num_anchors].

    Returns:
      proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
        representing the box coordinates of the proposed RoIs w.r.t. the
        scaled image.
      proposed_roi_scores: a tensor of shape
        [batch_size, rpn_post_nms_top_k, 1], representing the scores of the
        proposed RoIs.
    """
    proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
        boxes,
        scores,
        anchor_boxes,
        image_shape,
        decode_boxes=True,
        clip_boxes=True,
        use_batched_nms=self._use_batched_nms,
        apply_sigmoid_to_score=True,
        is_box_lrtb=is_box_lrtb,
        rpn_object_scores=object_scores,
        **self._proposal_params(is_training))
    return proposed_rois, proposed_roi_scores

  def oln_multilevel_propose_rois(self,
                                  rpn_boxes,
                                  rpn_scores,
                                  anchor_boxes,
                                  image_shape,
                                  rpn_pre_nms_top_k=2000,
                                  rpn_post_nms_top_k=1000,
                                  rpn_nms_threshold=0.7,
                                  rpn_score_threshold=0.0,
                                  rpn_min_size_threshold=0.0,
                                  decode_boxes=True,
                                  clip_boxes=True,
                                  use_batched_nms=False,
                                  apply_sigmoid_to_score=True,
                                  is_box_lrtb=False,
                                  rpn_object_scores=None,):
    """Proposes RoIs given a group of candidates from different FPN levels.

    The following describes the steps:
      1. For each individual level:
        a. Adjust scores for each level if specified by rpn_object_scores.
        b. Apply sigmoid transform if specified.
        c. Decode boxes (either of xyhw or left-right-top-bottom format) if
          specified.
        d. Clip boxes if specified.
        e. Filter small boxes and those that fall outside the image if
          specified.
        f. Apply pre-NMS filtering including pre-NMS top k and score
          thresholding.
        g. Apply NMS.
      2. Aggregate post-NMS boxes from each level.
      3. Apply an overall top k to generate the final selected RoIs.

    Args:
      rpn_boxes: a dict with keys representing FPN levels and values
        representing box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      rpn_scores: a dict with keys representing FPN levels and values
        representing logit tensors of shape
        [batch_size, feature_h, feature_w, num_anchors]. The feature dimensions
        must be statically known.
      anchor_boxes: a dict with keys representing FPN levels and values
        representing anchor box tensors of shape
        [batch_size, feature_h, feature_w, num_anchors * 4].
      image_shape: a tensor of shape [batch_size, 2] where the last dimension
        are [height, width] of the scaled image.
      rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
        keep before applying NMS. Default: 2000.
      rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
        keep after applying NMS. It also caps the number of proposals kept per
        level. Default: 1000.
      rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
        used for NMS. If 0.0, no NMS is applied and the top scoring boxes of
        each level are taken directly. Default: 0.7.
      rpn_score_threshold: a float between 0 and 1 representing the minimal box
        score to keep before applying NMS. This is often used as a
        pre-filtering step for better performance. If 0, no filtering is
        applied. Default: 0.
      rpn_min_size_threshold: a float representing the minimal box size in each
        side (w.r.t. the scaled image) to keep before applying NMS. This is
        often used as a pre-filtering step for better performance. If 0, no
        filtering is applied. Default: 0.
      decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
        using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
        `anchor_boxes`. Default: True.
      clip_boxes: a boolean indicating whether boxes are first clipped to the
        scaled image size before applying NMS. If False, no clipping is
        applied. Default: True.
      use_batched_nms: a boolean indicating whether NMS is applied in batch
        using `tf.image.combined_non_max_suppression`. Currently only available
        in CPU/GPU. Default: False.
      apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
        `rpn_scores` before applying NMS. Default: True.
      is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
        bottom) format.
      rpn_object_scores: a predicted objectness score (e.g., centerness). In
        OLN, we use object_scores=centerness as a replacement of the scores at
        each level. A dict with keys representing FPN levels and values
        representing logit tensors of shape
        [batch_size, feature_h, feature_w, num_anchors].

    Returns:
      selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
        representing the box coordinates of the selected proposals w.r.t. the
        scaled image.
      selected_roi_scores: a tensor of shape
        [batch_size, rpn_post_nms_top_k, 1], representing the scores of the
        selected proposals.
    """
    return _multilevel_propose_rois(
        rpn_boxes,
        rpn_scores,
        anchor_boxes,
        image_shape,
        rpn_pre_nms_top_k=rpn_pre_nms_top_k,
        rpn_post_nms_top_k=rpn_post_nms_top_k,
        rpn_nms_threshold=rpn_nms_threshold,
        rpn_score_threshold=rpn_score_threshold,
        rpn_min_size_threshold=rpn_min_size_threshold,
        decode_boxes=decode_boxes,
        clip_boxes=clip_boxes,
        use_batched_nms=use_batched_nms,
        apply_sigmoid_to_score=apply_sigmoid_to_score,
        is_box_lrtb=is_box_lrtb,
        rpn_object_scores=rpn_object_scores)