|
|
|
|
|
|
|
import torch |
|
from torchvision.ops import boxes as box_ops |
|
from torchvision.ops import nms |
|
|
|
|
|
def batched_nms(
    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
    """
    Category-aware NMS that delegates to torchvision.

    Behaves like ``torchvision.ops.boxes.batched_nms``; the only difference is
    that ``boxes`` is converted with ``.float()`` before being handed over.

    Args:
        boxes (Tensor): boxes whose last dimension must have size 4.
        scores (Tensor): forwarded unchanged to torchvision.
        idxs (Tensor): forwarded unchanged to torchvision.
        iou_threshold (float): forwarded unchanged to torchvision.

    Returns:
        Whatever ``torchvision.ops.boxes.batched_nms`` returns for these inputs.
    """
    assert boxes.shape[-1] == 4
    # NOTE(review): the cast is presumably there so that reduced-precision
    # inputs (e.g. fp16) are processed in float32 -- confirm with the author.
    boxes_as_float = boxes.float()
    return box_ops.batched_nms(boxes_as_float, scores, idxs, iou_threshold)
|
|
|
|
|
|
|
|
|
def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
    """
    Performs non-maximum suppression (NMS) on the rotated boxes according
    to their intersection-over-union (IoU).

    Rotated NMS iteratively removes lower scoring rotated boxes which have an
    IoU greater than iou_threshold with another (higher scoring) rotated box.

    Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
    RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
    can be representing completely different objects in certain tasks, e.g., OCR.

    As for the question of whether rotated-NMS should treat them as faraway boxes
    even though their IoU is 1, it depends on the application and/or ground truth annotation.

    As an extreme example, consider a single character v and the square box around it.

    If the angle is 0 degrees, the object (text) would be read as 'v';

    If the angle is 90 degrees, the object (text) would become '>';

    If the angle is 180 degrees, the object (text) would become '^';

    If the angle is 270/-90 degrees, the object (text) would become '<'

    All of these cases have IoU of 1 to each other, and rotated NMS that only
    uses IoU as criterion would only keep one of them with the highest score -
    which, practically, still makes sense in most cases because typically
    only one of these orientations is the correct one. Also, it does not matter
    as much if the box is only used to classify the object (instead of transcribing
    them with a sequential OCR recognition model) later.

    On the other hand, when we use IoU to filter proposals that are close to the
    ground truth during training, we should definitely take the angle into account if
    we know the ground truth is labeled with the strictly correct orientation (as in,
    upside-down words are annotated with -180 degrees even though they can be covered
    with a 0/90/-90 degree box, etc.)

    The way the original dataset is annotated also matters. For example, if the dataset
    is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
    we can estimate a minimum rotated bounding box to this polygon, but there's no way
    we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
    rotated boxes, with angles differing by 90 degrees from each other, covering exactly
    the same region). In that case we have to just use IoU to determine the box
    proximity (as many detection benchmarks (even for text) do) unless there're other
    assumptions we can make (like width is always larger than height, or the object is not
    rotated by more than 90 degrees CCW/CW, etc.)

    In summary, not considering angles in rotated NMS seems to be a good option for now,
    but we should be aware of its implications.

    Args:
        boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
           (x_center, y_center, width, height, angle_degrees) format.
        scores (Tensor[N]): Scores for each one of the rotated boxes
        iou_threshold (float): Discards all overlapping rotated boxes with IoU > iou_threshold

    Returns:
        keep (Tensor): int64 tensor with the indices of the elements that have been kept
        by Rotated NMS, sorted in decreasing order of scores
    """
    # The suppression itself is done by the custom op registered under
    # the ``detectron2`` namespace of ``torch.ops``.
    return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)
|
|
|
|
|
|
|
|
|
|
|
|
|
@torch.jit.script_if_tracing
def batched_nms_rotated(
    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 5]):
           boxes where NMS will be performed. They
           are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
        scores (Tensor[N]):
           scores for each one of the boxes
        idxs (Tensor[N]):
           indices of the categories for each one of the boxes.
        iou_threshold (float):
           discards all overlapping boxes
           with IoU > iou_threshold

    Returns:
        Tensor:
            int64 tensor with the indices of the elements that have been kept
            by NMS, sorted in decreasing order of scores
    """
    assert boxes.shape[-1] == 5

    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    boxes = boxes.float()

    # Strategy: to run NMS independently per category, shift the center of
    # every box by an offset that depends only on its category index. The
    # offset step exceeds the full coordinate span of all boxes, so boxes
    # from different categories can no longer overlap after the shift.
    # The span uses both a max and a min bound, so negative coordinates are
    # accounted for as well.
    half_long_side = torch.max(boxes[:, 2], boxes[:, 3]) / 2
    max_coordinate = (torch.max(boxes[:, 0], boxes[:, 1]) + half_long_side).max()
    min_coordinate = (torch.min(boxes[:, 0], boxes[:, 1]) - half_long_side).min()
    offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
    # Clone before the in-place shift so the caller's tensor is never modified
    # (``.float()`` does not copy when the input is already float32).
    boxes_for_nms = boxes.clone()
    boxes_for_nms[:, :2] += offsets[:, None]
    keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
    return keep
|
|