""" RetinaNet / EfficientDet Anchor Gen
Adapted for PyTorch from Tensorflow impl at
https://github.com/google/automl/blob/6f6694cec1a48cdb33d5d1551a2d5db8ad227798/efficientdet/anchors.py
Hacked together by Ross Wightman, original copyright below
"""
# Copyright 2020 Google Research. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Anchor definition.
This module is borrowed from TPU RetinaNet implementation:
https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py
"""
from typing import List, Optional, Tuple, Sequence
import numpy as np
import torch
import torch.nn as nn
#import torchvision.ops.boxes as tvb
from torchvision.ops.boxes import batched_nms, remove_small_boxes
from effdet.object_detection import ArgMaxMatcher, FasterRcnnBoxCoder, BoxList, IouSimilarity, TargetAssigner
from .soft_nms import batched_soft_nms
# The minimum score to consider a logit for identifying detections.
MIN_CLASS_SCORE = -5.0
# The score for a dummy detection
_DUMMY_DETECTION_SCORE = -1e5
# The maximum number of (anchor,class) pairs to keep for non-max suppression.
MAX_DETECTION_POINTS = 5000
# The maximum number of detections per image.
MAX_DETECTIONS_PER_IMAGE = 100
def decode_box_outputs(rel_codes, anchors, output_xyxy: bool=False):
"""Transforms relative regression coordinates to absolute positions.
Network predictions are normalized and relative to a given anchor; this
reverses the transformation and outputs absolute coordinates for the input image.
    Args:
        rel_codes: box regression targets.
        anchors: anchors on all feature levels.
        output_xyxy: if True, return boxes as [x_min, y_min, x_max, y_max] instead of
            [y_min, x_min, y_max, x_max].

    Returns:
        outputs: bounding boxes.
    """
ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2
xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2
ha = anchors[:, 2] - anchors[:, 0]
wa = anchors[:, 3] - anchors[:, 1]
ty, tx, th, tw = rel_codes.unbind(dim=1)
w = torch.exp(tw) * wa
h = torch.exp(th) * ha
ycenter = ty * ha + ycenter_a
xcenter = tx * wa + xcenter_a
ymin = ycenter - h / 2.
xmin = xcenter - w / 2.
ymax = ycenter + h / 2.
xmax = xcenter + w / 2.
if output_xyxy:
out = torch.stack([xmin, ymin, xmax, ymax], dim=1)
else:
out = torch.stack([ymin, xmin, ymax, xmax], dim=1)
return out
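
# A comment-only sketch (kept as a comment so nothing runs at import time) of the decoding
# above on a single made-up anchor/regression pair:
#
#   anchors = torch.tensor([[0., 0., 32., 32.]])        # one 32x32 anchor, [y1, x1, y2, x2]
#   rel_codes = torch.tensor([[0.1, -0.1, 0.2, 0.2]])   # [ty, tx, th, tw]
#   boxes = decode_box_outputs(rel_codes, anchors, output_xyxy=True)
#   # boxes: shape [1, 4] in [x_min, y_min, x_max, y_max] order
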
def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor):
boxes = boxes.clamp(min=0)
size = torch.cat([size, size], dim=0)
boxes = boxes.min(size)
return boxes
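
# A minimal usage sketch for clip_boxes_xyxy, kept as a comment so it does not execute on
# import. Note that `size` broadcasts as [size[0], size[1], size[0], size[1]] against xyxy
# boxes, so its order should match the x/y order of the boxes (values below are made up):
#
#   boxes = torch.tensor([[-4., 10., 700., 500.]])   # [x_min, y_min, x_max, y_max]
#   bound = torch.tensor([640., 512.])               # [x bound, y bound]
#   clipped = clip_boxes_xyxy(boxes, bound)          # -> [[0., 10., 640., 500.]]
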
def generate_detections(
cls_outputs, box_outputs, anchor_boxes, indices, classes,
img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor],
max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False):
"""Generates detections with RetinaNet model outputs and anchors.
Args:
        cls_outputs: a torch tensor with shape [N, 1] holding the highest class
            score for each selected anchor. N is the number of top-k anchors
            selected across all feature levels (k being MAX_DETECTION_POINTS).
        box_outputs: a torch tensor with shape [N, 4] stacking the box regression
            outputs for the same top-k selected anchors.
        anchor_boxes: a torch tensor with shape [num_total_anchors, 4] stacking the
            anchors on all feature levels; it is indexed with `indices` to pick the
            anchors matching the top-k selection.
        indices: a torch tensor with shape [N] holding the indices from the top-k selection.
        classes: a torch tensor with shape [N] holding the class prediction for each
            selected anchor from the top-k selection.
        img_scale: a float tensor representing the scale between original image
            and input image for the detector. It is used to rescale detections for
            evaluating with the original groundtruth annotations.
        img_size: a tensor with the image size used, together with img_scale, to clip
            boxes to the valid image region before NMS; if either is None, no clipping is done.
        max_det_per_image: an int constant, added as argument to make torchscript happy
        soft_nms: if True, apply batched soft-NMS instead of hard NMS.
Returns:
        detections: detection results in a tensor with shape [max_det_per_image, 6],
            each row representing [x_min, y_min, x_max, y_max, score, class]
"""
assert box_outputs.shape[-1] == 4
assert anchor_boxes.shape[-1] == 4
assert cls_outputs.shape[-1] == 1
anchor_boxes = anchor_boxes[indices, :]
    # Apply bounding box regression to anchors, boxes are converted to xyxy
# here since PyTorch NMS expects them in that form.
boxes = decode_box_outputs(box_outputs.float(), anchor_boxes, output_xyxy=True)
if img_scale is not None and img_size is not None:
boxes = clip_boxes_xyxy(boxes, img_size / img_scale) # clip before NMS better?
scores = cls_outputs.sigmoid().squeeze(1).float()
if soft_nms:
top_detection_idx, soft_scores = batched_soft_nms(
boxes, scores, classes, method_gaussian=True, iou_threshold=0.3, score_threshold=.001)
scores[top_detection_idx] = soft_scores
else:
top_detection_idx = batched_nms(boxes, scores, classes, iou_threshold=0.5)
# keep only topk scoring predictions
top_detection_idx = top_detection_idx[:max_det_per_image]
boxes = boxes[top_detection_idx]
scores = scores[top_detection_idx, None]
classes = classes[top_detection_idx, None] + 1 # back to class idx with background class = 0
if img_scale is not None:
boxes = boxes * img_scale
# FIXME add option to convert boxes back to yxyx? Otherwise must be handled downstream if
# that is the preferred output format.
# stack em and pad out to MAX_DETECTIONS_PER_IMAGE if necessary
num_det = len(top_detection_idx)
detections = torch.cat([boxes, scores, classes.float()], dim=1)
if num_det < max_det_per_image:
detections = torch.cat([
detections,
torch.zeros((max_det_per_image - num_det, 6), device=detections.device, dtype=detections.dtype)
], dim=0)
return detections
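
# A hedged, comment-only sketch of how generate_detections is typically driven: the first
# five arguments come from a per-image top-k selection over the model's class and box
# outputs (k = MAX_DETECTION_POINTS) plus the full anchor set. `N`, `num_classes` and
# `anchors` below are placeholders, not values defined in this module.
#
#   cls_outputs = torch.randn(N, 1)                      # top class logit per selected anchor
#   box_outputs = torch.randn(N, 4)                      # regression output for those anchors
#   anchor_boxes = anchors.boxes                         # full [num_total_anchors, 4] set, yxyx
#   indices = torch.randint(0, anchor_boxes.shape[0], (N,))
#   classes = torch.randint(0, num_classes, (N,))
#   dets = generate_detections(
#       cls_outputs, box_outputs, anchor_boxes, indices, classes,
#       img_scale=None, img_size=None)
#   # dets: [MAX_DETECTIONS_PER_IMAGE, 6] rows of [x_min, y_min, x_max, y_max, score, class]
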
def get_feat_sizes(image_size: Tuple[int, int], max_level: int):
"""Get feat widths and heights for all levels.
Args:
image_size: a tuple (H, W)
max_level: maximum feature level.
Returns:
feat_sizes: a list of tuples (height, width) for each level.
"""
feat_size = image_size
feat_sizes = [feat_size]
for _ in range(1, max_level + 1):
feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1)
feat_sizes.append(feat_size)
return feat_sizes
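
# For example, get_feat_sizes((512, 512), max_level=7) returns
# [(512, 512), (256, 256), (128, 128), (64, 64), (32, 32), (16, 16), (8, 8), (4, 4)],
# i.e. feat_sizes[level] is the (height, width) of the feature map at that level.
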
class Anchors(nn.Module):
"""RetinaNet Anchors class."""
def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]):
"""Constructs multiscale RetinaNet anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
            num_scales: integer number representing intermediate scales added
                on each level. For instance, num_scales=2 adds two additional
                anchor scales [2^0, 2^0.5] on each level.
            aspect_ratios: list of tuples representing the aspect ratio anchors added
                on each level. For instance, aspect_ratios =
                [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
image_size: Sequence specifying input image size of model (H, W).
                The image_size should be divisible by the largest feature stride 2^max_level.
"""
super(Anchors, self).__init__()
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
if isinstance(anchor_scale, Sequence):
assert len(anchor_scale) == max_level - min_level + 1
self.anchor_scales = anchor_scale
else:
self.anchor_scales = [anchor_scale] * (max_level - min_level + 1)
assert isinstance(image_size, Sequence) and len(image_size) == 2
# FIXME this restriction can likely be relaxed with some additional changes
        assert image_size[0] % 2 ** max_level == 0, 'Image height must be divisible by 2 ** max_level'
        assert image_size[1] % 2 ** max_level == 0, 'Image width must be divisible by 2 ** max_level'
self.image_size = tuple(image_size)
self.feat_sizes = get_feat_sizes(image_size, max_level)
self.config = self._generate_configs()
self.register_buffer('boxes', self._generate_boxes())
@classmethod
def from_config(cls, config):
return cls(
config.min_level, config.max_level,
config.num_scales, config.aspect_ratios,
config.anchor_scale, config.image_size)
def _generate_configs(self):
"""Generate configurations of anchor boxes."""
anchor_configs = {}
feat_sizes = self.feat_sizes
for level in range(self.min_level, self.max_level + 1):
anchor_configs[level] = []
for scale_octave in range(self.num_scales):
for aspect in self.aspect_ratios:
anchor_configs[level].append(
((feat_sizes[0][0] // feat_sizes[level][0],
feat_sizes[0][1] // feat_sizes[level][1]),
scale_octave / float(self.num_scales), aspect,
self.anchor_scales[level - self.min_level]))
return anchor_configs
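
    # Each per-level entry produced above is a tuple of
    # ((stride_y, stride_x), octave_scale, aspect, anchor_scale); for example, for level 3
    # of a (512, 512) input with num_scales=3 and a first aspect of 1.0, the first entry
    # would be ((8, 8), 0.0, 1.0, anchor_scale). _generate_boxes below unpacks these tuples.
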
def _generate_boxes(self):
"""Generates multiscale anchor boxes."""
boxes_all = []
for _, configs in self.config.items():
boxes_level = []
for config in configs:
stride, octave_scale, aspect, anchor_scale = config
base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale
base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale
if isinstance(aspect, Sequence):
aspect_x = aspect[0]
aspect_y = aspect[1]
else:
aspect_x = np.sqrt(aspect)
aspect_y = 1.0 / aspect_x
anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0
anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0
x = np.arange(stride[1] / 2, self.image_size[1], stride[1])
y = np.arange(stride[0] / 2, self.image_size[0], stride[0])
xv, yv = np.meshgrid(x, y)
xv = xv.reshape(-1)
yv = yv.reshape(-1)
boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
yv + anchor_size_y_2, xv + anchor_size_x_2))
boxes = np.swapaxes(boxes, 0, 1)
boxes_level.append(np.expand_dims(boxes, axis=1))
            # concat anchors on the same level into a [N, A, 4] array (reshaped to [N * A, 4] below)
boxes_level = np.concatenate(boxes_level, axis=1)
boxes_all.append(boxes_level.reshape([-1, 4]))
anchor_boxes = np.vstack(boxes_all)
anchor_boxes = torch.from_numpy(anchor_boxes).float()
return anchor_boxes
def get_anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
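
# A comment-only construction sketch with typical EfficientDet-style settings; the exact
# values are illustrative rather than taken from a specific effdet config:
#
#   anchors = Anchors(
#       min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[1.0, 2.0, 0.5],
#       anchor_scale=4.0, image_size=(512, 512))
#   anchors.boxes                        # [num_total_anchors, 4] buffer, rows in [y1, x1, y2, x2]
#   anchors.get_anchors_per_location()   # 3 scales * 3 ratios = 9
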
class AnchorLabeler(object):
"""Labeler for multiscale anchor boxes.
"""
def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
anchors: an instance of class Anchors.
num_classes: integer number representing number of classes in the dataset.
match_threshold: float number between 0 and 1 representing the threshold
to assign positive labels for anchors.
"""
similarity_calc = IouSimilarity()
matcher = ArgMaxMatcher(
match_threshold,
unmatched_threshold=match_threshold,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = FasterRcnnBoxCoder()
self.target_assigner = TargetAssigner(similarity_calc, matcher, box_coder)
self.anchors = anchors
self.match_threshold = match_threshold
self.num_classes = num_classes
self.indices_cache = {}
def label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
            gt_classes: An integer tensor with shape [N, 1] representing groundtruth classes.
filter_valid: Filter out any boxes w/ gt class <= -1 before assigning
Returns:
            cls_targets_out: a list with one entry per level, ordered from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors], where height_l and
                width_l are the dimensions of the class logits at the l-th level.
            box_targets_out: a list with one entry per level, ordered from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors * 4], where height_l
                and width_l are the dimensions of the box regression output at the l-th level.
num_positives: scalar tensor storing number of positives in an image.
"""
cls_targets_out = []
box_targets_out = []
if filter_valid:
valid_idx = gt_classes > -1 # filter gt targets w/ label <= -1
gt_boxes = gt_boxes[valid_idx]
gt_classes = gt_classes[valid_idx]
cls_targets, box_targets, matches = self.target_assigner.assign(
BoxList(self.anchors.boxes), BoxList(gt_boxes), gt_classes)
# class labels start from 1 and the background class = -1
cls_targets = (cls_targets - 1).long()
        # Unpack labels: split the flat cls/box targets back into per-level feature maps.
count = 0
for level in range(self.anchors.min_level, self.anchors.max_level + 1):
feat_size = self.anchors.feat_sizes[level]
steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
count += steps
num_positives = (matches.match_results > -1).float().sum()
return cls_targets_out, box_targets_out, num_positives
def batch_label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
batch_size = len(gt_boxes)
assert batch_size == len(gt_classes)
num_levels = self.anchors.max_level - self.anchors.min_level + 1
cls_targets_out = [[] for _ in range(num_levels)]
box_targets_out = [[] for _ in range(num_levels)]
num_positives_out = []
anchor_box_list = BoxList(self.anchors.boxes)
for i in range(batch_size):
last_sample = i == batch_size - 1
if filter_valid:
valid_idx = gt_classes[i] > -1 # filter gt targets w/ label <= -1
gt_box_list = BoxList(gt_boxes[i][valid_idx])
gt_class_i = gt_classes[i][valid_idx]
else:
gt_box_list = BoxList(gt_boxes[i])
gt_class_i = gt_classes[i]
cls_targets, box_targets, matches = self.target_assigner.assign(anchor_box_list, gt_box_list, gt_class_i)
# class labels start from 1 and the background class = -1
cls_targets = (cls_targets - 1).long()
            # Unpack labels: split the flat cls/box targets back into per-level feature maps.
count = 0
for level in range(self.anchors.min_level, self.anchors.max_level + 1):
level_idx = level - self.anchors.min_level
feat_size = self.anchors.feat_sizes[level]
steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
cls_targets_out[level_idx].append(
cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
box_targets_out[level_idx].append(
box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
count += steps
if last_sample:
cls_targets_out[level_idx] = torch.stack(cls_targets_out[level_idx])
box_targets_out[level_idx] = torch.stack(box_targets_out[level_idx])
num_positives_out.append((matches.match_results > -1).float().sum())
if last_sample:
num_positives_out = torch.stack(num_positives_out)
return cls_targets_out, box_targets_out, num_positives_out
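
# A comment-only usage sketch for AnchorLabeler, assuming `anchors` was built as in the
# sketch above. Boxes are [y1, x1, y2, x2] in input-image pixels and class ids start at 1;
# the exact tensor shapes expected for gt_classes depend on the surrounding data pipeline,
# so treat the values below as placeholders:
#
#   labeler = AnchorLabeler(anchors, num_classes=90, match_threshold=0.5)
#   gt_boxes = torch.tensor([[50., 60., 200., 220.]])
#   gt_classes = torch.tensor([3.])
#   cls_t, box_t, num_pos = labeler.label_anchors(gt_boxes, gt_classes)
#   # cls_t / box_t: one tensor per level, shaped [H_l, W_l, A] and [H_l, W_l, A * 4]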