# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Post-processing model outputs to generate detections."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import tensorflow as tf
import tf_keras

from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils


def generate_detections_factory(params):
  """Factory to select the function used to generate detections."""
  if params.use_batched_nms:
    func = functools.partial(
        _generate_detections_batched,
        max_total_size=params.max_total_size,
        nms_iou_threshold=params.nms_iou_threshold,
        score_threshold=params.score_threshold)
  else:
    func = functools.partial(
        _generate_detections,
        max_total_size=params.max_total_size,
        nms_iou_threshold=params.nms_iou_threshold,
        score_threshold=params.score_threshold,
        pre_nms_num_boxes=params.pre_nms_num_boxes)
  return func
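

# Illustrative usage (not part of the original module). A minimal sketch of how
# `generate_detections_factory` could be driven: the real `params` is a config
# object, so a `types.SimpleNamespace` with the same attribute names stands in
# for it here, and every value below is an assumption chosen for illustration.
def _example_generate_detections_factory():
  import types  # Local import to keep the illustrative code self-contained.
  params = types.SimpleNamespace(
      use_batched_nms=False,
      max_total_size=100,
      nms_iou_threshold=0.5,
      score_threshold=0.05,
      pre_nms_num_boxes=5000)
  generate_fn = generate_detections_factory(params)
  # `generate_fn(boxes, scores)` now only needs the model outputs; all
  # post-processing hyper-parameters are already bound via functools.partial.
  return generate_fn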


def _select_top_k_scores(scores_in, pre_nms_num_detections):
  """Select top_k scores and indices for each class.

  Args:
    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
      class logit outputs on all feature levels. The N is the number of total
      anchors on all levels. The num_classes is the number of classes predicted
      by the model.
    pre_nms_num_detections: Number of candidates before NMS.

  Returns:
    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
      num_classes].
  """
  batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
  scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
  scores_trans = tf.reshape(scores_trans, [-1, num_anchors])

  top_k_scores, top_k_indices = tf.nn.top_k(
      scores_trans, k=pre_nms_num_detections, sorted=True)

  top_k_scores = tf.reshape(top_k_scores,
                            [batch_size, num_class, pre_nms_num_detections])
  top_k_indices = tf.reshape(top_k_indices,
                             [batch_size, num_class, pre_nms_num_detections])

  return tf.transpose(top_k_scores,
                      [0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
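

# Illustrative check of `_select_top_k_scores` (not part of the original
# module; the shapes below are assumptions). The point is that scores are
# ranked independently per class, so the returned indices address the anchor
# dimension separately for each class.
def _example_select_top_k_scores():
  scores = tf.random.uniform([2, 10, 3])  # [batch_size, num_anchors, classes]
  top_scores, top_indices = _select_top_k_scores(
      scores, pre_nms_num_detections=4)
  # Both outputs have shape [2, 4, 3]: the top-4 anchors per class, per image.
  return top_scores, top_indices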


def _generate_detections(boxes,
                         scores,
                         max_total_size=100,
                         nms_iou_threshold=0.3,
                         score_threshold=0.05,
                         pre_nms_num_boxes=5000):
  """Generate the final detections given the model outputs.

  This uses class-wise unrolling with a while-loop based NMS and can be
  parallelized over the batch dimension.

  Args:
    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
      N, 1, 4], which stacks box predictions on all feature levels. The N is
      the number of total anchors on all levels.
    scores: a tensor with shape [batch_size, N, num_classes], which stacks
      class probabilities on all feature levels. The N is the number of total
      anchors on all levels. The num_classes is the number of classes predicted
      by the model. Note that the scores here are the raw scores.
    max_total_size: a scalar representing the maximum number of boxes retained
      over all classes.
    nms_iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IoU.
    score_threshold: a float representing the threshold for deciding when to
      remove boxes based on score.
    pre_nms_num_boxes: an int number of top candidate detections per class
      before NMS.

  Returns:
    nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
      representing top detected boxes in [y1, x1, y2, x2].
    nms_scores: `float` Tensor of shape [batch_size, max_total_size]
      representing sorted confidence scores for detected boxes. The values are
      between [0, 1].
    nms_classes: `int` Tensor of shape [batch_size, max_total_size]
      representing classes for detected boxes.
    valid_detections: `int` Tensor of shape [batch_size] indicating that only
      the top `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
    nmsed_boxes = []
    nmsed_classes = []
    nmsed_scores = []
    valid_detections = []
    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
    _, total_anchors, num_classes = scores.get_shape().as_list()
    # Selects top pre_nms_num scores and indices before NMS.
    scores, indices = _select_top_k_scores(
        scores, min(total_anchors, pre_nms_num_boxes))
    for i in range(num_classes):
      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
      scores_i = scores[:, :, i]

      # Obtains pre_nms_num_boxes before running NMS.
      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)

      # Filters out boxes whose scores fall below the threshold.
      boxes_i, scores_i = box_utils.filter_boxes_by_scores(
          boxes_i, scores_i, min_score_threshold=score_threshold)

      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
          tf.cast(scores_i, tf.float32),
          tf.cast(boxes_i, tf.float32),
          max_total_size,
          iou_threshold=nms_iou_threshold)
      nmsed_classes_i = tf.fill([batch_size, max_total_size], i)
      nmsed_boxes.append(nmsed_boxes_i)
      nmsed_scores.append(nmsed_scores_i)
      nmsed_classes.append(nmsed_classes_i)

  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
  nmsed_scores = tf.concat(nmsed_scores, axis=1)
  nmsed_classes = tf.concat(nmsed_classes, axis=1)
  nmsed_scores, indices = tf.nn.top_k(
      nmsed_scores, k=max_total_size, sorted=True)
  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
  valid_detections = tf.reduce_sum(
      input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
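

# Illustrative call of `_generate_detections` on dummy inputs (not part of the
# original module). All shapes and hyper-parameters below are assumptions
# chosen only to show the expected input/output contract.
def _example_generate_detections():
  batch_size, num_anchors, num_classes = 2, 1000, 3
  # Class-agnostic boxes ([batch_size, N, 1, 4]) and per-class scores.
  boxes = tf.random.uniform([batch_size, num_anchors, 1, 4])
  scores = tf.random.uniform([batch_size, num_anchors, num_classes])
  nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
      _generate_detections(
          boxes, scores, max_total_size=10, pre_nms_num_boxes=50))
  # nmsed_boxes: [2, 10, 4]; nmsed_scores, nmsed_classes: [2, 10]; valid: [2].
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections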


def _generate_detections_per_image(boxes,
                                   scores,
                                   max_total_size=100,
                                   nms_iou_threshold=0.3,
                                   score_threshold=0.05,
                                   pre_nms_num_boxes=5000):
  """Generate the final detections per image given the model outputs.

  Args:
    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which stacks
      box predictions on all feature levels. The N is the number of total
      anchors on all levels.
    scores: a tensor with shape [N, num_classes], which stacks class
      probabilities on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the scores here are the raw scores.
    max_total_size: a scalar representing the maximum number of boxes retained
      over all classes.
    nms_iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IoU.
    score_threshold: a float representing the threshold for deciding when to
      remove boxes based on score.
    pre_nms_num_boxes: an int number of top candidate detections per class
      before NMS.

  Returns:
    nms_boxes: `float` Tensor of shape [max_total_size, 4] representing top
      detected boxes in [y1, x1, y2, x2].
    nms_scores: `float` Tensor of shape [max_total_size] representing sorted
      confidence scores for detected boxes. The values are between [0, 1].
    nms_classes: `int` Tensor of shape [max_total_size] representing classes
      for detected boxes.
    valid_detections: `int` Tensor of shape [1] indicating that only the top
      `valid_detections` boxes are valid detections.
  """
  nmsed_boxes = []
  nmsed_scores = []
  nmsed_classes = []
  num_classes_for_box = boxes.get_shape().as_list()[1]
  num_classes = scores.get_shape().as_list()[1]
  for i in range(num_classes):
    boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
    scores_i = scores[:, i]

    # Obtains pre_nms_num_boxes before running NMS.
    scores_i, indices = tf.nn.top_k(
        scores_i, k=tf.minimum(tf.shape(input=scores_i)[-1], pre_nms_num_boxes))
    boxes_i = tf.gather(boxes_i, indices)

    (nmsed_indices_i, nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
        tf.cast(boxes_i, tf.float32),
        tf.cast(scores_i, tf.float32),
        max_total_size,
        iou_threshold=nms_iou_threshold,
        score_threshold=score_threshold,
        pad_to_max_output_size=True,
        name='nms_detections_' + str(i))
    nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
    nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
    # Sets scores of invalid boxes to -1.
    nmsed_scores_i = tf.where(
        tf.less(tf.range(max_total_size), [nmsed_num_valid_i]), nmsed_scores_i,
        -tf.ones_like(nmsed_scores_i))
    nmsed_classes_i = tf.fill([max_total_size], i)
    nmsed_boxes.append(nmsed_boxes_i)
    nmsed_scores.append(nmsed_scores_i)
    nmsed_classes.append(nmsed_classes_i)

  # Concats results from all classes and sorts them.
  nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
  nmsed_scores = tf.concat(nmsed_scores, axis=0)
  nmsed_classes = tf.concat(nmsed_classes, axis=0)
  nmsed_scores, indices = tf.nn.top_k(
      nmsed_scores, k=max_total_size, sorted=True)
  nmsed_boxes = tf.gather(nmsed_boxes, indices)
  nmsed_classes = tf.gather(nmsed_classes, indices)
  valid_detections = tf.reduce_sum(
      input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
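

# Per-image variant illustrated on a single image, i.e. without a batch
# dimension (not part of the original module; shapes below are assumptions).
# In a full pipeline this function would typically be mapped over the batch,
# for example with tf.map_fn.
def _example_generate_detections_per_image():
  num_anchors, num_classes = 1000, 3
  boxes = tf.random.uniform([num_anchors, 1, 4])  # Class-agnostic boxes.
  scores = tf.random.uniform([num_anchors, num_classes])
  return _generate_detections_per_image(
      boxes, scores, max_total_size=10, pre_nms_num_boxes=50)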


def _generate_detections_batched(boxes, scores, max_total_size,
                                 nms_iou_threshold, score_threshold):
  """Generates detected boxes with scores and classes for one-stage detector.

  The function takes the output of the multi-level ConvNets and anchor boxes
  and generates detected boxes. Note that this uses batched NMS, which is not
  supported on TPU currently.

  Args:
    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
      N, 1, 4], which stacks box predictions on all feature levels. The N is
      the number of total anchors on all levels.
    scores: a tensor with shape [batch_size, N, num_classes], which stacks
      class probabilities on all feature levels. The N is the number of total
      anchors on all levels. The num_classes is the number of classes predicted
      by the model. Note that the scores here are the raw scores.
    max_total_size: a scalar representing the maximum number of boxes retained
      over all classes.
    nms_iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IoU.
    score_threshold: a float representing the threshold for deciding when to
      remove boxes based on score.

  Returns:
    nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
      representing top detected boxes in [y1, x1, y2, x2].
    nms_scores: `float` Tensor of shape [batch_size, max_total_size]
      representing sorted confidence scores for detected boxes. The values are
      between [0, 1].
    nms_classes: `int` Tensor of shape [batch_size, max_total_size]
      representing classes for detected boxes.
    valid_detections: `int` Tensor of shape [batch_size] indicating that only
      the top `valid_detections` boxes are valid detections.
  """
  with tf.name_scope('generate_detections'):
    # TODO(tsungyi): Removes normalization/denormalization once the
    # tf.image.combined_non_max_suppression is coordinate system agnostic.
    # Normalizes maximum box coordinates to 1.
    normalizer = tf.reduce_max(boxes)
    boxes /= normalizer
    (nmsed_boxes, nmsed_scores, nmsed_classes,
     valid_detections) = tf.image.combined_non_max_suppression(
         boxes,
         scores,
         max_output_size_per_class=max_total_size,
         max_total_size=max_total_size,
         iou_threshold=nms_iou_threshold,
         score_threshold=score_threshold,
         pad_per_class=False,
     )
    # De-normalizes box coordinates.
    nmsed_boxes *= normalizer
    nmsed_classes = tf.cast(nmsed_classes, tf.int32)
    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
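

# Illustrative call of the batched-NMS path (not part of the original module;
# shapes and thresholds below are assumptions). This path simply wraps
# tf.image.combined_non_max_suppression, so it is the most convenient option
# on CPU/GPU but, as noted above, is not supported on TPU.
def _example_generate_detections_batched():
  boxes = tf.random.uniform([2, 1000, 1, 4])   # [batch_size, N, 1, 4]
  scores = tf.random.uniform([2, 1000, 3])     # [batch_size, N, num_classes]
  return _generate_detections_batched(
      boxes, scores, max_total_size=10,
      nms_iou_threshold=0.5, score_threshold=0.05)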


class MultilevelDetectionGenerator(tf_keras.layers.Layer):
  """Generates detected boxes with scores and classes for one-stage detector."""

  def __init__(self, min_level, max_level, params):
    self._min_level = min_level
    self._max_level = max_level
    self._generate_detections = generate_detections_factory(params)
    super(MultilevelDetectionGenerator, self).__init__(autocast=False)

  def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
    # Collects outputs from all levels into a list.
    boxes = []
    scores = []
    for i in range(self._min_level, self._max_level + 1):
      box_outputs_i_shape = tf.shape(box_outputs[i])
      batch_size = box_outputs_i_shape[0]
      num_anchors_per_locations = box_outputs_i_shape[-1] // 4
      num_classes = tf.shape(class_outputs[i])[-1] // num_anchors_per_locations

      # Applies score transformation and removes the implicit background class.
      scores_i = tf.sigmoid(
          tf.reshape(class_outputs[i], [batch_size, -1, num_classes]))
      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

      # Box decoding.
      # The anchor boxes are shared for all data in a batch.
      # One-stage detectors only support class-agnostic box regression.
      anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
      box_outputs_i = tf.reshape(box_outputs[i], [batch_size, -1, 4])
      boxes_i = box_utils.decode_boxes(box_outputs_i, anchor_boxes_i)

      # Box clipping.
      boxes_i = box_utils.clip_boxes(boxes_i, image_shape)

      boxes.append(boxes_i)
      scores.append(scores_i)
    boxes = tf.concat(boxes, axis=1)
    scores = tf.concat(scores, axis=1)

    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        self._generate_detections(tf.expand_dims(boxes, axis=2), scores))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
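

# End-to-end sketch of MultilevelDetectionGenerator (not part of the original
# module). Every level index, shape and hyper-parameter below is an assumption
# chosen only for illustration: the per-level anchor boxes are given an
# explicit batch dimension so the reshapes in `call` apply directly, and the
# image shape is given a [batch, 1, 2] layout so it broadcasts against
# [batch, N, 4] boxes during clipping.
def _example_multilevel_detection_generator():
  import types  # Local import to keep the illustrative code self-contained.
  params = types.SimpleNamespace(
      use_batched_nms=False, max_total_size=10, nms_iou_threshold=0.5,
      score_threshold=0.05, pre_nms_num_boxes=50)
  generator = MultilevelDetectionGenerator(
      min_level=3, max_level=3, params=params)

  batch, height, width, anchors_per_loc = 2, 8, 8, 3
  num_classes = 4  # Includes the implicit background class at index 0.
  box_outputs = {3: tf.random.normal([batch, height, width,
                                      anchors_per_loc * 4])}
  class_outputs = {3: tf.random.normal([batch, height, width,
                                        anchors_per_loc * num_classes])}
  anchor_boxes = {3: tf.random.uniform(
      [batch, height * width * anchors_per_loc, 4], maxval=128.0)}
  image_shape = tf.fill([batch, 1, 2], 128.0)
  return generator(box_outputs, class_outputs, anchor_boxes, image_shape)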


class GenericDetectionGenerator(tf_keras.layers.Layer):
  """Generates the final detected boxes with scores and classes."""

  def __init__(self, params):
    super(GenericDetectionGenerator, self).__init__(autocast=False)
    self._generate_detections = generate_detections_factory(params)

  def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
    """Generate final detections.

    Args:
      box_outputs: a tensor of shape [batch_size, K, num_classes * 4]
        representing the class-specific box coordinates relative to anchors.
      class_outputs: a tensor of shape [batch_size, K, num_classes]
        representing the class logits before applying score activation.
      anchor_boxes: a tensor of shape [batch_size, K, 4] representing the
        corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: a tensor of shape [batch_size, 2] storing the image height
        and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.

    Returns:
      nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
        representing top detected boxes in [y1, x1, y2, x2].
      nms_scores: `float` Tensor of shape [batch_size, max_total_size]
        representing sorted confidence scores for detected boxes. The values
        are between [0, 1].
      nms_classes: `int` Tensor of shape [batch_size, max_total_size]
        representing classes for detected boxes.
      valid_detections: `int` Tensor of shape [batch_size] indicating that only
        the top `valid_detections` boxes are valid detections.
    """
    class_outputs = tf.nn.softmax(class_outputs, axis=-1)

    # Removes the background class.
    class_outputs_shape = tf.shape(class_outputs)
    batch_size = class_outputs_shape[0]
    num_locations = class_outputs_shape[1]
    num_classes = class_outputs_shape[-1]
    num_detections = num_locations * (num_classes - 1)

    class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
    box_outputs = tf.reshape(
        box_outputs,
        tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
    box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
    box_outputs = tf.reshape(
        box_outputs, tf.stack([batch_size, num_detections, 4], axis=-1))
    anchor_boxes = tf.reshape(
        anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))

    # Box decoding.
    decoded_boxes = box_utils.decode_boxes(
        box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])

    # Box clipping.
    decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)

    decoded_boxes = tf.reshape(
        decoded_boxes,
        tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))

    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        self._generate_detections(decoded_boxes, class_outputs))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
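

# Sketch of GenericDetectionGenerator on dummy Faster R-CNN style outputs (not
# part of the original module). The number of proposals K, the class count and
# all hyper-parameters below are assumptions; class_outputs are logits over
# num_classes including the background class at index 0, and the image shape
# uses a broadcast-friendly [batch, 1, 2] layout for box clipping.
def _example_generic_detection_generator():
  import types  # Local import to keep the illustrative code self-contained.
  params = types.SimpleNamespace(
      use_batched_nms=False, max_total_size=10, nms_iou_threshold=0.5,
      score_threshold=0.05, pre_nms_num_boxes=50)
  generator = GenericDetectionGenerator(params)

  batch, num_proposals, num_classes = 2, 300, 4  # Includes background.
  box_outputs = tf.random.normal([batch, num_proposals, num_classes * 4])
  class_outputs = tf.random.normal([batch, num_proposals, num_classes])
  anchor_boxes = tf.random.uniform([batch, num_proposals, 4], maxval=128.0)
  image_shape = tf.fill([batch, 1, 2], 128.0)
  return generator(box_outputs, class_outputs, anchor_boxes, image_shape)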


class OlnDetectionGenerator(GenericDetectionGenerator):
  """Generates the final detected boxes with scores and classes."""

  def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
               is_single_fg_score=False, keep_nms=True):
    """Generate final detections for Object Localization Network (OLN).

    Args:
      box_outputs: a tensor of shape [batch_size, K, num_classes * 4]
        representing the class-specific box coordinates relative to anchors.
      class_outputs: a tensor of shape [batch_size, K, num_classes]
        representing the class logits before applying score activation.
      anchor_boxes: a tensor of shape [batch_size, K, 4] representing the
        corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: a tensor of shape [batch_size, 2] storing the image height
        and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.
      is_single_fg_score: a bool indicating whether `class_outputs` holds only
        a single foreground score. By default (`is_single_fg_score=False`),
        `class_outputs` is a concatenation of both the foreground and the
        background scores.
      keep_nms: a bool indicating whether to perform NMS or not.

    Returns:
      nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
        representing top detected boxes in [y1, x1, y2, x2].
      nms_scores: `float` Tensor of shape [batch_size, max_total_size]
        representing sorted confidence scores for detected boxes. The values
        are between [0, 1].
      nms_classes: `int` Tensor of shape [batch_size, max_total_size]
        representing classes for detected boxes.
      valid_detections: `int` Tensor of shape [batch_size] indicating that only
        the top `valid_detections` boxes are valid detections.
    """
    if is_single_fg_score:
      # Concatenates dummy background scores.
      dummy_bg_scores = tf.zeros_like(class_outputs)
      class_outputs = tf.stack([dummy_bg_scores, class_outputs], -1)
    else:
      class_outputs = tf.nn.softmax(class_outputs, axis=-1)

    # Removes the background class.
    class_outputs_shape = tf.shape(class_outputs)
    batch_size = class_outputs_shape[0]
    num_locations = class_outputs_shape[1]
    num_classes = class_outputs_shape[-1]
    num_detections = num_locations * (num_classes - 1)

    class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
    box_outputs = tf.reshape(
        box_outputs,
        tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
    box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
    box_outputs = tf.reshape(
        box_outputs, tf.stack([batch_size, num_detections, 4], axis=-1))
    anchor_boxes = tf.reshape(
        anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))

    # Box decoding. For RPN outputs, box_outputs are all zeros.
    decoded_boxes = box_utils.decode_boxes(
        box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])

    # Box clipping.
    decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)

    decoded_boxes = tf.reshape(
        decoded_boxes,
        tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))

    if keep_nms:
      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
          self._generate_detections(decoded_boxes, class_outputs))
      # Adds 1 to offset the background class which has index 0.
      nmsed_classes += 1
    else:
      nmsed_boxes = decoded_boxes[:, :, 0, :]
      nmsed_scores = class_outputs[:, :, 0]
      nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
      valid_detections = tf.cast(
          tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)

    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
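

# Sketch of the OLN generator in its proposal-scoring mode (not part of the
# original module). With `is_single_fg_score=True` the class output is assumed
# here to be a single objectness score per anchor of shape [batch_size, K],
# and `keep_nms=False` returns all decoded boxes without suppression; every
# shape and value below is an assumption chosen for illustration.
def _example_oln_detection_generator():
  import types  # Local import to keep the illustrative code self-contained.
  params = types.SimpleNamespace(
      use_batched_nms=False, max_total_size=10, nms_iou_threshold=0.5,
      score_threshold=0.05, pre_nms_num_boxes=50)
  generator = OlnDetectionGenerator(params)

  batch, num_anchors = 2, 300
  # Two "classes" after the dummy background is stacked in: background + fg.
  box_outputs = tf.zeros([batch, num_anchors, 2 * 4])  # RPN-style, all zeros.
  class_outputs = tf.random.uniform([batch, num_anchors])  # Objectness scores.
  anchor_boxes = tf.random.uniform([batch, num_anchors, 4], maxval=128.0)
  image_shape = tf.fill([batch, 1, 2], 128.0)
  return generator(
      box_outputs, class_outputs, anchor_boxes, image_shape,
      is_single_fg_score=True, keep_nms=False)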