diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..91a04233555bd3ecd0a0fb8b32b41fa573d01ec2 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +*.psd filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..453dc7c1852259d952755381431e7e2be5d55cb7 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,20 @@ +name: Sync to Hugging Face hub +on: + push: + branches: [main] + + # to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + sync-to-hub: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + lfs: true + - name: Push to hub + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: git push --force https://santit96:$HF_TOKEN@huggingface.co/spaces/rootstrap-org/waste-classifier main diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4526ed90d42e61b98847adedbe767e9488824b2a --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +.DS_Store +*.jpg +*.png +*.jpeg diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..74a1f7712f8d55a366c21796378d8c603eef6e74 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +title: Waste Classifier +emoji: ♻️ +colorFrom: green +colorTo: gray +sdk: streamlit +sdk_version: 1.25.0 +pinned: false +--- + +Waste Classifier +============================== + +Waste Detection and Classifier tool diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d832da21db24290172878432f4ca669c25db4d --- /dev/null +++ b/app.py @@ -0,0 +1,82 @@ +""" +Streamlit app +""" +import sys + +import streamlit as st + +from constants import (CLAS_FILEPATH, CLAS_THRESHOLD, CLASSES, DET_FILEPATH, + DET_NAME, DET_THRESHOLD, DEVICE, OUTPUT_IMG_FILEPATH) + +sys.path.append("./efficientdet") + +from PIL import Image + +from efficientdet.efficientdet import plot_results +from trash_detector import detect_trash + + +def initial_config(): + """ + Initial configuration of streamlit page + """ + st.set_page_config( + page_title="Waste Classifier", + page_icon="♻️", + ) + + +def render(): + """ + Render the streamlit app + """ + st.title("Waste classifier") + st.markdown("""Classify your waste into different classes""") + + # Image loader and button + uploaded_file = st.file_uploader( + "Upload image with trash", type=["jpg", "jpeg", "png", "gif", "bmp"] + ) + classify_button = st.button("Classify trash") + + if classify_button: + if not uploaded_file: + st.error("Upload an image") + else: + # Create two columns + col1, col2 = st.columns(2) + + # Column 1: Uploaded image + with col1: + st.write("Uploaded image") + st.image( + uploaded_file, caption="Uploaded Image.", use_column_width=True + ) + + # Column 2: Classified image + with col2: + with st.spinner(text="Classifying the trash..."): + img = Image.open(uploaded_file).convert("RGB") + cls_prob, bboxes_final = detect_trash( + img, + DET_NAME, + DET_FILEPATH, + CLAS_FILEPATH, + DEVICE, + DET_THRESHOLD, + CLAS_THRESHOLD, + ) + # plot and save demo image + plot_results( + img, cls_prob, bboxes_final, CLASSES, OUTPUT_IMG_FILEPATH + ) + output_img = Image.open(OUTPUT_IMG_FILEPATH) + st.write("Classified image") + st.image( + output_img, 
caption="Classified Image.", use_column_width=True + ) + + +if __name__ == "__main__": + initial_config() + render() diff --git a/constants.py b/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4a34c7bf90ed1ec019c113cea38c16e918e504 --- /dev/null +++ b/constants.py @@ -0,0 +1,8 @@ +CLAS_FILEPATH = "models/resnet50-classifier.pkl" +DET_FILEPATH = "models/efficientdet-d2-detector.pth.tar" +CLASSES = ["cardboard", "compost", "glass", "metal", "paper", "plastic", "trash"] +DET_NAME = "tf_efficientdet_d2" +CLAS_THRESHOLD = 0.5 +DET_THRESHOLD = 0.17 +DEVICE = "cpu" +OUTPUT_IMG_FILEPATH = "classified_image.jpg" diff --git a/efficientdet/__init__.py b/efficientdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/efficientdet/effdet/__init__.py b/efficientdet/effdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c2aa4bc6bbb65fb9dd873894862b089aa6a33d18 --- /dev/null +++ b/efficientdet/effdet/__init__.py @@ -0,0 +1,7 @@ +from .efficientdet import EfficientDet +from .bench import DetBenchPredict, DetBenchTrain, unwrap_bench +from .data import create_dataset, create_loader, create_parser, DetectionDatset, SkipSubset +from .evaluator import CocoEvaluator, PascalEvaluator, OpenImagesEvaluator, create_evaluator +from .config import get_efficientdet_config, default_detection_model_configs +from .factory import create_model, create_model_from_config +from .helpers import load_checkpoint, load_pretrained diff --git a/efficientdet/effdet/anchors.py b/efficientdet/effdet/anchors.py new file mode 100644 index 0000000000000000000000000000000000000000..fcafc37491ccb774020b9246b2518034072724b3 --- /dev/null +++ b/efficientdet/effdet/anchors.py @@ -0,0 +1,421 @@ +""" RetinaNet / EfficientDet Anchor Gen + +Adapted for PyTorch from Tensorflow impl at + https://github.com/google/automl/blob/6f6694cec1a48cdb33d5d1551a2d5db8ad227798/efficientdet/anchors.py + +Hacked together by Ross Wightman, original copyright below +""" +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Anchor definition. + +This module is borrowed from TPU RetinaNet implementation: +https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py +""" +from typing import Optional, Tuple, Sequence + +import numpy as np +import torch +import torch.nn as nn +#import torchvision.ops.boxes as tvb +from torchvision.ops.boxes import batched_nms, remove_small_boxes +from typing import List + +from effdet.object_detection import ArgMaxMatcher, FasterRcnnBoxCoder, BoxList, IouSimilarity, TargetAssigner +from .soft_nms import batched_soft_nms + + +# The minimum score to consider a logit for identifying detections. 
+MIN_CLASS_SCORE = -5.0 + +# The score for a dummy detection +_DUMMY_DETECTION_SCORE = -1e5 + +# The maximum number of (anchor,class) pairs to keep for non-max suppression. +MAX_DETECTION_POINTS = 5000 + +# The maximum number of detections per image. +MAX_DETECTIONS_PER_IMAGE = 100 + + +def decode_box_outputs(rel_codes, anchors, output_xyxy: bool=False): + """Transforms relative regression coordinates to absolute positions. + + Network predictions are normalized and relative to a given anchor; this + reverses the transformation and outputs absolute coordinates for the input image. + + Args: + rel_codes: box regression targets. + + anchors: anchors on all feature levels. + + Returns: + outputs: bounding boxes. + + """ + ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2 + xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2 + ha = anchors[:, 2] - anchors[:, 0] + wa = anchors[:, 3] - anchors[:, 1] + + ty, tx, th, tw = rel_codes.unbind(dim=1) + + w = torch.exp(tw) * wa + h = torch.exp(th) * ha + ycenter = ty * ha + ycenter_a + xcenter = tx * wa + xcenter_a + ymin = ycenter - h / 2. + xmin = xcenter - w / 2. + ymax = ycenter + h / 2. + xmax = xcenter + w / 2. + if output_xyxy: + out = torch.stack([xmin, ymin, xmax, ymax], dim=1) + else: + out = torch.stack([ymin, xmin, ymax, xmax], dim=1) + return out + + +def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor): + boxes = boxes.clamp(min=0) + size = torch.cat([size, size], dim=0) + boxes = boxes.min(size) + return boxes + + +def generate_detections( + cls_outputs, box_outputs, anchor_boxes, indices, classes, + img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor], + max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False): + """Generates detections with RetinaNet model outputs and anchors. + + Args: + cls_outputs: a torch tensor with shape [N, 1], which has the highest class + scores on all feature levels. The N is the number of selected + top-K total anchors on all levels. (k being MAX_DETECTION_POINTS) + + box_outputs: a torch tensor with shape [N, 4], which stacks box regression + outputs on all feature levels. The N is the number of selected top-k + total anchors on all levels. (k being MAX_DETECTION_POINTS) + + anchor_boxes: a torch tensor with shape [N, 4], which stacks anchors on all + feature levels. The N is the number of selected top-k total anchors on all levels. + + indices: a torch tensor with shape [N], which is the indices from top-k selection. + + classes: a torch tensor with shape [N], which represents the class + prediction on all selected anchors from top-k selection. + + img_scale: a float tensor representing the scale between original image + and input image for the detector. It is used to rescale detections for + evaluating with the original groundtruth annotations. + + max_det_per_image: an int constant, added as argument to make torchscript happy + + Returns: + detections: detection results in a tensor with shape [MAX_DETECTION_POINTS, 6], + each row representing [x_min, y_min, x_max, y_max, score, class] + """ + assert box_outputs.shape[-1] == 4 + assert anchor_boxes.shape[-1] == 4 + assert cls_outputs.shape[-1] == 1 + + anchor_boxes = anchor_boxes[indices, :] + + # Appply bounding box regression to anchors, boxes are converted to xyxy + # here since PyTorch NMS expects them in that form. 
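    # A worked example of the decode with hypothetical numbers: an anchor
    # [y0, x0, y1, x1] = [0, 0, 100, 100] has center (50, 50) and size 100x100.
    # A regression output [ty, tx, th, tw] = [0.1, 0.0, 0.0, 0.0] moves the
    # center to (60, 50) and leaves the size unchanged (exp(0) = 1), giving
    # [10, 0, 110, 100] in yxyx, i.e. [0, 10, 100, 110] once reordered to xyxy.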
+ boxes = decode_box_outputs(box_outputs.float(), anchor_boxes, output_xyxy=True) + if img_scale is not None and img_size is not None: + boxes = clip_boxes_xyxy(boxes, img_size / img_scale) # clip before NMS better? + + scores = cls_outputs.sigmoid().squeeze(1).float() + if soft_nms: + top_detection_idx, soft_scores = batched_soft_nms( + boxes, scores, classes, method_gaussian=True, iou_threshold=0.3, score_threshold=.001) + scores[top_detection_idx] = soft_scores + else: + top_detection_idx = batched_nms(boxes, scores, classes, iou_threshold=0.5) + + # keep only topk scoring predictions + top_detection_idx = top_detection_idx[:max_det_per_image] + boxes = boxes[top_detection_idx] + scores = scores[top_detection_idx, None] + classes = classes[top_detection_idx, None] + 1 # back to class idx with background class = 0 + + if img_scale is not None: + boxes = boxes * img_scale + + # FIXME add option to convert boxes back to yxyx? Otherwise must be handled downstream if + # that is the preferred output format. + + # stack em and pad out to MAX_DETECTIONS_PER_IMAGE if necessary + num_det = len(top_detection_idx) + detections = torch.cat([boxes, scores, classes.float()], dim=1) + if num_det < max_det_per_image: + detections = torch.cat([ + detections, + torch.zeros((max_det_per_image - num_det, 6), device=detections.device, dtype=detections.dtype) + ], dim=0) + return detections + + +def get_feat_sizes(image_size: Tuple[int, int], max_level: int): + """Get feat widths and heights for all levels. + Args: + image_size: a tuple (H, W) + max_level: maximum feature level. + Returns: + feat_sizes: a list of tuples (height, width) for each level. + """ + feat_size = image_size + feat_sizes = [feat_size] + for _ in range(1, max_level + 1): + feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1) + feat_sizes.append(feat_size) + return feat_sizes + + +class Anchors(nn.Module): + """RetinaNet Anchors class.""" + + def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]): + """Constructs multiscale RetinaNet anchors. + + Args: + min_level: integer number of minimum level of the output feature pyramid. + + max_level: integer number of maximum level of the output feature pyramid. + + num_scales: integer number representing intermediate scales added + on each level. For instances, num_scales=2 adds two additional + anchor scales [2^0, 2^0.5] on each level. + + aspect_ratios: list of tuples representing the aspect ratio anchors added + on each level. For instances, aspect_ratios = + [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level. + + anchor_scale: float number representing the scale of size of the base + anchor to the feature stride 2^level. + + image_size: Sequence specifying input image size of model (H, W). + The image_size should be divided by the largest feature stride 2^max_level. 
+ """ + super(Anchors, self).__init__() + self.min_level = min_level + self.max_level = max_level + self.num_scales = num_scales + self.aspect_ratios = aspect_ratios + if isinstance(anchor_scale, Sequence): + assert len(anchor_scale) == max_level - min_level + 1 + self.anchor_scales = anchor_scale + else: + self.anchor_scales = [anchor_scale] * (max_level - min_level + 1) + + assert isinstance(image_size, Sequence) and len(image_size) == 2 + # FIXME this restriction can likely be relaxed with some additional changes + assert image_size[0] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level (128)' + assert image_size[1] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level (128)' + self.image_size = tuple(image_size) + self.feat_sizes = get_feat_sizes(image_size, max_level) + self.config = self._generate_configs() + self.register_buffer('boxes', self._generate_boxes()) + + @classmethod + def from_config(cls, config): + return cls( + config.min_level, config.max_level, + config.num_scales, config.aspect_ratios, + config.anchor_scale, config.image_size) + + def _generate_configs(self): + """Generate configurations of anchor boxes.""" + anchor_configs = {} + feat_sizes = self.feat_sizes + for level in range(self.min_level, self.max_level + 1): + anchor_configs[level] = [] + for scale_octave in range(self.num_scales): + for aspect in self.aspect_ratios: + anchor_configs[level].append( + ((feat_sizes[0][0] // feat_sizes[level][0], + feat_sizes[0][1] // feat_sizes[level][1]), + scale_octave / float(self.num_scales), aspect, + self.anchor_scales[level - self.min_level])) + return anchor_configs + + def _generate_boxes(self): + """Generates multiscale anchor boxes.""" + boxes_all = [] + for _, configs in self.config.items(): + boxes_level = [] + for config in configs: + stride, octave_scale, aspect, anchor_scale = config + base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale + base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale + if isinstance(aspect, Sequence): + aspect_x = aspect[0] + aspect_y = aspect[1] + else: + aspect_x = np.sqrt(aspect) + aspect_y = 1.0 / aspect_x + anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0 + anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0 + + x = np.arange(stride[1] / 2, self.image_size[1], stride[1]) + y = np.arange(stride[0] / 2, self.image_size[0], stride[0]) + xv, yv = np.meshgrid(x, y) + xv = xv.reshape(-1) + yv = yv.reshape(-1) + + boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2, + yv + anchor_size_y_2, xv + anchor_size_x_2)) + boxes = np.swapaxes(boxes, 0, 1) + boxes_level.append(np.expand_dims(boxes, axis=1)) + + # concat anchors on the same level to the reshape NxAx4 + boxes_level = np.concatenate(boxes_level, axis=1) + boxes_all.append(boxes_level.reshape([-1, 4])) + + anchor_boxes = np.vstack(boxes_all) + anchor_boxes = torch.from_numpy(anchor_boxes).float() + return anchor_boxes + + def get_anchors_per_location(self): + return self.num_scales * len(self.aspect_ratios) + + +class AnchorLabeler(object): + """Labeler for multiscale anchor boxes. + """ + + def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5): + """Constructs anchor labeler to assign labels to anchors. + + Args: + anchors: an instance of class Anchors. + + num_classes: integer number representing number of classes in the dataset. + + match_threshold: float number between 0 and 1 representing the threshold + to assign positive labels for anchors. 
+ """ + similarity_calc = IouSimilarity() + matcher = ArgMaxMatcher( + match_threshold, + unmatched_threshold=match_threshold, + negatives_lower_than_unmatched=True, + force_match_for_each_row=True) + box_coder = FasterRcnnBoxCoder() + + self.target_assigner = TargetAssigner(similarity_calc, matcher, box_coder) + self.anchors = anchors + self.match_threshold = match_threshold + self.num_classes = num_classes + self.indices_cache = {} + + def label_anchors(self, gt_boxes, gt_classes, filter_valid=True): + """Labels anchors with ground truth inputs. + + Args: + gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes. + For each row, it stores [y0, x0, y1, x1] for four corners of a box. + + gt_classes: A integer tensor with shape [N, 1] representing groundtruth classes. + + filter_valid: Filter out any boxes w/ gt class <= -1 before assigning + + Returns: + cls_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. + The values are tensor with shape [height_l, width_l, num_anchors]. The height_l and width_l + represent the dimension of class logits at l-th level. + + box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. + The values are tensor with shape [height_l, width_l, num_anchors * 4]. The height_l and + width_l represent the dimension of bounding box regression output at l-th level. + + num_positives: scalar tensor storing number of positives in an image. + """ + cls_targets_out = [] + box_targets_out = [] + + if filter_valid: + valid_idx = gt_classes > -1 # filter gt targets w/ label <= -1 + gt_boxes = gt_boxes[valid_idx] + gt_classes = gt_classes[valid_idx] + + cls_targets, box_targets, matches = self.target_assigner.assign( + BoxList(self.anchors.boxes), BoxList(gt_boxes), gt_classes) + + # class labels start from 1 and the background class = -1 + cls_targets = (cls_targets - 1).long() + + # Unpack labels. + """Unpacks an array of cls/box into multiple scales.""" + count = 0 + for level in range(self.anchors.min_level, self.anchors.max_level + 1): + feat_size = self.anchors.feat_sizes[level] + steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location() + cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) + box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) + count += steps + + num_positives = (matches.match_results > -1).float().sum() + + return cls_targets_out, box_targets_out, num_positives + + def batch_label_anchors(self, gt_boxes, gt_classes, filter_valid=True): + batch_size = len(gt_boxes) + assert batch_size == len(gt_classes) + num_levels = self.anchors.max_level - self.anchors.min_level + 1 + cls_targets_out = [[] for _ in range(num_levels)] + box_targets_out = [[] for _ in range(num_levels)] + num_positives_out = [] + + anchor_box_list = BoxList(self.anchors.boxes) + for i in range(batch_size): + last_sample = i == batch_size - 1 + + if filter_valid: + valid_idx = gt_classes[i] > -1 # filter gt targets w/ label <= -1 + gt_box_list = BoxList(gt_boxes[i][valid_idx]) + gt_class_i = gt_classes[i][valid_idx] + else: + gt_box_list = BoxList(gt_boxes[i]) + gt_class_i = gt_classes[i] + cls_targets, box_targets, matches = self.target_assigner.assign(anchor_box_list, gt_box_list, gt_class_i) + + # class labels start from 1 and the background class = -1 + cls_targets = (cls_targets - 1).long() + + # Unpack labels. 
+ """Unpacks an array of cls/box into multiple scales.""" + count = 0 + for level in range(self.anchors.min_level, self.anchors.max_level + 1): + level_idx = level - self.anchors.min_level + feat_size = self.anchors.feat_sizes[level] + steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location() + cls_targets_out[level_idx].append( + cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) + box_targets_out[level_idx].append( + box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) + count += steps + if last_sample: + cls_targets_out[level_idx] = torch.stack(cls_targets_out[level_idx]) + box_targets_out[level_idx] = torch.stack(box_targets_out[level_idx]) + + num_positives_out.append((matches.match_results > -1).float().sum()) + if last_sample: + num_positives_out = torch.stack(num_positives_out) + + return cls_targets_out, box_targets_out, num_positives_out + diff --git a/efficientdet/effdet/bench.py b/efficientdet/effdet/bench.py new file mode 100644 index 0000000000000000000000000000000000000000..16c1e98a7126f1886a377fa3bac7e9f6be617919 --- /dev/null +++ b/efficientdet/effdet/bench.py @@ -0,0 +1,143 @@ +""" PyTorch EfficientDet support benches + +Hacked together by Ross Wightman +""" +from typing import Optional, Dict, List +import torch +import torch.nn as nn +from timm.utils import ModelEma +from .anchors import Anchors, AnchorLabeler, generate_detections, MAX_DETECTION_POINTS +from .loss import DetectionLoss + + +def _post_process( + cls_outputs: List[torch.Tensor], + box_outputs: List[torch.Tensor], + num_levels: int, + num_classes: int, + max_detection_points: int = MAX_DETECTION_POINTS, +): + """Selects top-k predictions. + + Post-proc code adapted from Tensorflow version at: https://github.com/google/automl/tree/master/efficientdet + and optimized for PyTorch. + + Args: + cls_outputs: an OrderDict with keys representing levels and values + representing logits in [batch_size, height, width, num_anchors]. + + box_outputs: an OrderDict with keys representing levels and values + representing box regression targets in [batch_size, height, width, num_anchors * 4]. 
+ + num_levels (int): number of feature levels + + num_classes (int): number of output classes + """ + batch_size = cls_outputs[0].shape[0] + cls_outputs_all = torch.cat([ + cls_outputs[level].permute(0, 2, 3, 1).reshape([batch_size, -1, num_classes]) + for level in range(num_levels)], 1) + + box_outputs_all = torch.cat([ + box_outputs[level].permute(0, 2, 3, 1).reshape([batch_size, -1, 4]) + for level in range(num_levels)], 1) + + _, cls_topk_indices_all = torch.topk(cls_outputs_all.reshape(batch_size, -1), dim=1, k=max_detection_points) + indices_all = cls_topk_indices_all // num_classes + classes_all = cls_topk_indices_all % num_classes + + box_outputs_all_after_topk = torch.gather( + box_outputs_all, 1, indices_all.unsqueeze(2).expand(-1, -1, 4)) + + cls_outputs_all_after_topk = torch.gather( + cls_outputs_all, 1, indices_all.unsqueeze(2).expand(-1, -1, num_classes)) + cls_outputs_all_after_topk = torch.gather( + cls_outputs_all_after_topk, 2, classes_all.unsqueeze(2)) + + return cls_outputs_all_after_topk, box_outputs_all_after_topk, indices_all, classes_all + + +@torch.jit.script +def _batch_detection( + batch_size: int, class_out, box_out, anchor_boxes, indices, classes, + img_scale: Optional[torch.Tensor] = None, img_size: Optional[torch.Tensor] = None): + batch_detections = [] + # FIXME we may be able to do this as a batch with some tensor reshaping/indexing, PR welcome + for i in range(batch_size): + img_scale_i = None if img_scale is None else img_scale[i] + img_size_i = None if img_size is None else img_size[i] + detections = generate_detections( + class_out[i], box_out[i], anchor_boxes, indices[i], classes[i], img_scale_i, img_size_i) + batch_detections.append(detections) + return torch.stack(batch_detections, dim=0) + + +class DetBenchPredict(nn.Module): + def __init__(self, model): + super(DetBenchPredict, self).__init__() + self.model = model + self.config = model.config # FIXME remove this when we can use @property (torchscript limitation) + self.num_levels = model.config.num_levels + self.num_classes = model.config.num_classes + self.anchors = Anchors.from_config(model.config) + + def forward(self, x, img_info: Optional[Dict[str, torch.Tensor]] = None): + class_out, box_out = self.model(x) + class_out, box_out, indices, classes = _post_process( + class_out, box_out, num_levels=self.num_levels, num_classes=self.num_classes) + if img_info is None: + img_scale, img_size = None, None + else: + img_scale, img_size = img_info['img_scale'], img_info['img_size'] + return _batch_detection( + x.shape[0], class_out, box_out, self.anchors.boxes, indices, classes, img_scale, img_size) + + +class DetBenchTrain(nn.Module): + def __init__(self, model, create_labeler=True): + super(DetBenchTrain, self).__init__() + self.model = model + self.config = model.config # FIXME remove this when we can use @property (torchscript limitation) + self.num_levels = model.config.num_levels + self.num_classes = model.config.num_classes + self.anchors = Anchors.from_config(model.config) + self.anchor_labeler = None + if create_labeler: + self.anchor_labeler = AnchorLabeler(self.anchors, self.num_classes, match_threshold=0.5) + self.loss_fn = DetectionLoss(model.config) + + def forward(self, x, target: Dict[str, torch.Tensor]): + class_out, box_out = self.model(x) + if self.anchor_labeler is None: + # target should contain pre-computed anchor labels if labeler not present in bench + assert 'label_num_positives' in target + cls_targets = [target[f'label_cls_{l}'] for l in range(self.num_levels)] + 
box_targets = [target[f'label_bbox_{l}'] for l in range(self.num_levels)] + num_positives = target['label_num_positives'] + else: + cls_targets, box_targets, num_positives = self.anchor_labeler.batch_label_anchors( + target['bbox'], target['cls']) + + loss, class_loss, box_loss = self.loss_fn(class_out, box_out, cls_targets, box_targets, num_positives) + output = {'loss': loss, 'class_loss': class_loss, 'box_loss': box_loss} + if not self.training: + # if eval mode, output detections for evaluation + class_out_pp, box_out_pp, indices, classes = _post_process( + class_out, box_out, num_levels=self.num_levels, num_classes=self.num_classes) + output['detections'] = _batch_detection( + x.shape[0], class_out_pp, box_out_pp, self.anchors.boxes, indices, classes, + target['img_scale'], target['img_size']) + return output + + +def unwrap_bench(model): + # Unwrap a model in support bench so that various other fns can access the weights and attribs of the + # underlying model directly + if isinstance(model, ModelEma): # unwrap ModelEma + return unwrap_bench(model.ema) + elif hasattr(model, 'module'): # unwrap DDP + return unwrap_bench(model.module) + elif hasattr(model, 'model'): # unwrap Bench -> model + return unwrap_bench(model.model) + else: + return model diff --git a/efficientdet/effdet/config/__init__.py b/efficientdet/effdet/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0dd74ec181acca5c0aa089fc573e3c43690ad64b --- /dev/null +++ b/efficientdet/effdet/config/__init__.py @@ -0,0 +1,4 @@ +from .config_utils import set_config_readonly, set_config_writeable +from .fpn_config import get_fpn_config +from .model_config import get_efficientdet_config, default_detection_model_configs +from .train_config import default_detection_train_config diff --git a/efficientdet/effdet/config/config_utils.py b/efficientdet/effdet/config/config_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f367cccab787691881e05f6e42354443db6d1874 --- /dev/null +++ b/efficientdet/effdet/config/config_utils.py @@ -0,0 +1,9 @@ +from omegaconf import OmegaConf + + +def set_config_readonly(conf): + OmegaConf.set_readonly(conf, True) + + +def set_config_writeable(conf): + OmegaConf.set_readonly(conf, False) diff --git a/efficientdet/effdet/config/fpn_config.py b/efficientdet/effdet/config/fpn_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e12ed18923632a713fb478fe97ebc75f1e370124 --- /dev/null +++ b/efficientdet/effdet/config/fpn_config.py @@ -0,0 +1,184 @@ +import itertools + +from omegaconf import OmegaConf + + +def bifpn_config(min_level, max_level, weight_method=None): + """BiFPN config. + Adapted from https://github.com/google/automl/blob/56815c9986ffd4b508fe1d68508e268d129715c1/efficientdet/keras/fpn_configs.py + """ + p = OmegaConf.create() + weight_method = weight_method or 'fastattn' + + num_levels = max_level - min_level + 1 + node_ids = {min_level + i: [i] for i in range(num_levels)} + + level_last_id = lambda level: node_ids[level][-1] + level_all_ids = lambda level: node_ids[level] + id_cnt = itertools.count(num_levels) + + p.nodes = [] + for i in range(max_level - 1, min_level - 1, -1): + # top-down path. + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [level_last_id(i), level_last_id(i + 1)], + 'weight_method': weight_method, + }) + node_ids[i].append(next(id_cnt)) + + for i in range(min_level + 1, max_level + 1): + # bottom-up path. 
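        # On the way back up, each new node fuses every node already produced at
        # this level (level_all_ids) with the newest node one level below
        # (level_last_id(i - 1)); 'weight_method' controls how those inputs are
        # combined (plain sum, softmax attention, or fast normalized attention).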
+ p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': level_all_ids(i) + [level_last_id(i - 1)], + 'weight_method': weight_method, + }) + node_ids[i].append(next(id_cnt)) + return p + + +def panfpn_config(min_level, max_level, weight_method=None): + """PAN FPN config. + + This defines FPN layout from Path Aggregation Networks as an alternate to + BiFPN, it does not implement the full PAN spec. + + Paper: https://arxiv.org/abs/1803.01534 + """ + p = OmegaConf.create() + weight_method = weight_method or 'fastattn' + + num_levels = max_level - min_level + 1 + node_ids = {min_level + i: [i] for i in range(num_levels)} + level_last_id = lambda level: node_ids[level][-1] + id_cnt = itertools.count(num_levels) + + p.nodes = [] + for i in range(max_level, min_level - 1, -1): + # top-down path. + offsets = [level_last_id(i), level_last_id(i + 1)] if i != max_level else [level_last_id(i)] + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': offsets, + 'weight_method': weight_method, + }) + node_ids[i].append(next(id_cnt)) + + for i in range(min_level, max_level + 1): + # bottom-up path. + offsets = [level_last_id(i), level_last_id(i - 1)] if i != min_level else [level_last_id(i)] + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': offsets, + 'weight_method': weight_method, + }) + node_ids[i].append(next(id_cnt)) + + return p + + +def qufpn_config(min_level, max_level, weight_method=None): + """A dynamic quad fpn config that can adapt to different min/max levels. + + It extends the idea of BiFPN, and has four paths: + (up_down -> bottom_up) + (bottom_up -> up_down). + + Paper: https://ieeexplore.ieee.org/document/9225379 + Ref code: From contribution to TF EfficientDet + https://github.com/google/automl/blob/eb74c6739382e9444817d2ad97c4582dbe9a9020/efficientdet/keras/fpn_configs.py + """ + p = OmegaConf.create() + weight_method = weight_method or 'fastattn' + quad_method = 'fastattn' + num_levels = max_level - min_level + 1 + node_ids = {min_level + i: [i] for i in range(num_levels)} + level_last_id = lambda level: node_ids[level][-1] + level_all_ids = lambda level: node_ids[level] + level_first_id = lambda level: node_ids[level][0] + id_cnt = itertools.count(num_levels) + + p.nodes = [] + for i in range(max_level - 1, min_level - 1, -1): + # top-down path 1. + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [level_last_id(i), level_last_id(i + 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + node_ids[max_level].append(node_ids[max_level][-1]) + + for i in range(min_level + 1, max_level): + # bottom-up path 2. + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': level_all_ids(i) + [level_last_id(i - 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + + i = max_level + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [level_first_id(i)] + [level_last_id(i - 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + node_ids[min_level].append(node_ids[min_level][-1]) + + for i in range(min_level + 1, max_level + 1, 1): + # bottom-up path 3. + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [ + level_first_id(i), level_last_id(i - 1) if i != min_level + 1 else level_first_id(i - 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + node_ids[min_level].append(node_ids[min_level][-1]) + + for i in range(max_level - 1, min_level, -1): + # top-down path 4. 
+ p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [node_ids[i][0]] + [node_ids[i][-1]] + [level_last_id(i + 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + i = min_level + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [node_ids[i][0]] + [level_last_id(i + 1)], + 'weight_method': weight_method + }) + node_ids[i].append(next(id_cnt)) + node_ids[max_level].append(node_ids[max_level][-1]) + + # NOTE: the order of the quad path is reversed from the original, my code expects the output of + # each FPN repeat to be same as input from backbone, in order of increasing reductions + for i in range(min_level, max_level + 1): + # quad-add path. + p.nodes.append({ + 'reduction': 1 << i, + 'inputs_offsets': [node_ids[i][2], node_ids[i][4]], + 'weight_method': quad_method + }) + node_ids[i].append(next(id_cnt)) + + return p + + +def get_fpn_config(fpn_name, min_level=3, max_level=7): + if not fpn_name: + fpn_name = 'bifpn_fa' + name_to_config = { + 'bifpn_sum': bifpn_config(min_level=min_level, max_level=max_level, weight_method='sum'), + 'bifpn_attn': bifpn_config(min_level=min_level, max_level=max_level, weight_method='attn'), + 'bifpn_fa': bifpn_config(min_level=min_level, max_level=max_level, weight_method='fastattn'), + 'pan_sum': panfpn_config(min_level=min_level, max_level=max_level, weight_method='sum'), + 'pan_fa': panfpn_config(min_level=min_level, max_level=max_level, weight_method='fastattn'), + 'qufpn_sum': qufpn_config(min_level=min_level, max_level=max_level, weight_method='sum'), + 'qufpn_fa': qufpn_config(min_level=min_level, max_level=max_level, weight_method='fastattn'), + } + return name_to_config[fpn_name] diff --git a/efficientdet/effdet/config/model_config.py b/efficientdet/effdet/config/model_config.py new file mode 100644 index 0000000000000000000000000000000000000000..c92ac142818ce69d1533842d8f40c85e2f6421aa --- /dev/null +++ b/efficientdet/effdet/config/model_config.py @@ -0,0 +1,538 @@ +"""EfficientDet Configurations + +Adapted from official impl at https://github.com/google/automl/tree/master/efficientdet + +TODO use a different config system (OmegaConfig -> Hydra?), separate model from train specific hparams +""" + +from omegaconf import OmegaConf +from copy import deepcopy + + +def default_detection_model_configs(): + """Returns a default detection configs.""" + h = OmegaConf.create() + + # model name. + h.name = 'tf_efficientdet_d1' + + h.backbone_name = 'tf_efficientnet_b1' + h.backbone_args = None # FIXME sort out kwargs vs config for backbone creation + + # model specific, input preprocessing parameters + h.image_size = (640, 640) + + # dataset specific head parameters + h.num_classes = 90 + + # feature + anchor config + h.min_level = 3 + h.max_level = 7 + h.num_levels = h.max_level - h.min_level + 1 + h.num_scales = 3 + h.aspect_ratios = [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)] + # ratio w/h: 2.0 means w=1.4, h=0.7. Can be computed with k-mean per dataset. 
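    # Aspect ratios may be given either as (w, h) pairs, as above, or as plain
    # w/h scalars (see the commented-out alternative below); the anchor generator
    # accepts both forms and takes sqrt(ratio) to derive the per-axis scaling.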
+ #h.aspect_ratios = [1.0, 2.0, 0.5] + h.anchor_scale = 4.0 + + # FPN and head config + h.pad_type = 'same' # original TF models require an equivalent of Tensorflow 'SAME' padding + h.act_type = 'swish' + h.norm_layer = None # defaults to batch norm when None + h.norm_kwargs = dict(eps=.001, momentum=.01) + h.box_class_repeats = 3 + h.fpn_cell_repeats = 3 + h.fpn_channels = 88 + h.separable_conv = True + h.apply_bn_for_resampling = True + h.conv_after_downsample = False + h.conv_bn_relu_pattern = False + h.use_native_resize_op = False + h.pooling_type = None + h.redundant_bias = True # original TF models have back to back bias + BN layers, not necessary! + h.head_bn_level_first = False # change order of BN in head repeat list of lists, True for torchscript compat + + h.fpn_name = None + h.fpn_config = None + h.fpn_drop_path_rate = 0. # No stochastic depth in default. NOTE not currently used, unstable training + + # classification loss (used by train bench) + h.alpha = 0.25 + h.gamma = 1.5 + h.label_smoothing = 0. # only supported if new_focal == True + h.new_focal = False # use new focal loss (supports label smoothing but uses more mem, less optimal w/ jit script) + h.jit_loss = False # torchscript jit for loss fn speed improvement, can impact stability and/or increase mem usage + + # localization loss (used by train bench) + h.delta = 0.1 + h.box_loss_weight = 50.0 + + return h + + +efficientdet_model_param_dict = dict( + # Models with PyTorch friendly padding and my PyTorch pretrained backbones, training TBD + efficientdet_d0=dict( + name='efficientdet_d0', + backbone_name='efficientnet_b0', + image_size=(512, 512), + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + backbone_args=dict(drop_path_rate=0.1), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/efficientdet_d0-f3276ba8.pth', + ), + efficientdet_d1=dict( + name='efficientdet_d1', + backbone_name='efficientnet_b1', + image_size=(640, 640), + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/efficientdet_d1-bb7e98fe.pth', + ), + efficientdet_d2=dict( + name='efficientdet_d2', + backbone_name='efficientnet_b2', + image_size=(768, 768), + fpn_channels=112, + fpn_cell_repeats=5, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + backbone_args=dict(drop_path_rate=0.2), + url='', # no pretrained weights yet + ), + efficientdet_d3=dict( + name='efficientdet_d3', + backbone_name='efficientnet_b3', + image_size=(896, 896), + fpn_channels=160, + fpn_cell_repeats=6, + box_class_repeats=4, + pad_type='', + redundant_bias=False, + backbone_args=dict(drop_path_rate=0.2), + url='', # no pretrained weights yet + ), + efficientdet_d4=dict( + name='efficientdet_d4', + backbone_name='efficientnet_b4', + image_size=(1024, 1024), + fpn_channels=224, + fpn_cell_repeats=7, + box_class_repeats=4, + backbone_args=dict(drop_path_rate=0.2), + ), + efficientdet_d5=dict( + name='efficientdet_d5', + backbone_name='efficientnet_b5', + image_size=(1280, 1280), + fpn_channels=288, + fpn_cell_repeats=7, + box_class_repeats=4, + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + + # My own experimental configs with alternate models, training TBD + # Note: any 'timm' model in the EfficientDet family can be used as a backbone here. 
+ resdet50=dict( + name='resdet50', + backbone_name='resnet50', + image_size=(640, 640), + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + act_type='relu', + redundant_bias=False, + separable_conv=False, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/resdet50_416-08676892.pth', + ), + cspresdet50=dict( + name='cspresdet50', + backbone_name='cspresnet50', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + act_type='leaky_relu', + redundant_bias=False, + separable_conv=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + cspresdext50=dict( + name='cspresdext50', + backbone_name='cspresnext50', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + act_type='leaky_relu', + redundant_bias=False, + separable_conv=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + cspresdext50pan=dict( + name='cspresdext50pan', + backbone_name='cspresnext50', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=88, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + act_type='leaky_relu', + fpn_name='pan_fa', # PAN FPN experiment + redundant_bias=False, + separable_conv=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + cspdarkdet53=dict( + name='cspdarkdet53', + backbone_name='cspdarknet53', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + act_type='leaky_relu', + redundant_bias=False, + separable_conv=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + mixdet_m=dict( + name='mixdet_m', + backbone_name='mixnet_m', + image_size=(512, 512), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.1), + url='', # no pretrained weights yet + ), + mixdet_l=dict( + name='mixdet_l', + backbone_name='mixnet_l', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.2), + url='', # no pretrained weights yet + ), + mobiledetv2_110d=dict( + name='mobiledetv2_110d', + backbone_name='mobilenetv2_110d', + image_size=(384, 384), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=48, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + act_type='relu6', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.05), + url='', # no pretrained weights yet + ), + mobiledetv2_120d=dict( + name='mobiledetv2_120d', + backbone_name='mobilenetv2_120d', + image_size=(512, 512), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=56, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + act_type='relu6', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.1), + url='', # no pretrained weights yet + ), + mobiledetv3_large=dict( + name='mobiledetv3_large', + backbone_name='mobilenetv3_large_100', + image_size=(512, 512), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=64, + fpn_cell_repeats=3, 
+ box_class_repeats=3, + pad_type='', + act_type='hard_swish', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.1), + url='', # no pretrained weights yet + ), + efficientdet_q0=dict( + name='efficientdet_q0', + backbone_name='efficientnet_b0', + image_size=(512, 512), + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + fpn_name='qufpn_fa', # quad-fpn + fast attn experiment + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.1), + url='', + ), + efficientdet_w0=dict( + name='efficientdet_w0', # 'wide' + backbone_name='efficientnet_b0', + image_size=(512, 512), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=80, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict( + drop_path_rate=0.1, + feature_location='depthwise'), # features from after DW/SE in IR block + url='', # no pretrained weights yet + ), + efficientdet_es=dict( + name='efficientdet_es', #EdgeTPU-Small + backbone_name='efficientnet_es', + image_size=(512, 512), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=72, + fpn_cell_repeats=3, + box_class_repeats=3, + pad_type='', + act_type='relu', + redundant_bias=False, + head_bn_level_first=True, + separable_conv=False, + backbone_args=dict(drop_path_rate=0.1), + url='', + ), + efficientdet_em=dict( + name='efficientdet_em', # Edge-TPU Medium + backbone_name='efficientnet_em', + image_size=(640, 640), + aspect_ratios=[1.0, 2.0, 0.5], + fpn_channels=96, + fpn_cell_repeats=4, + box_class_repeats=3, + pad_type='', + act_type='relu', + redundant_bias=False, + head_bn_level_first=True, + separable_conv=False, + backbone_args=dict(drop_path_rate=0.2), + url='', # no pretrained weights yet + ), + efficientdet_lite0=dict( + name='efficientdet_lite0', + backbone_name='efficientnet_lite0', + image_size=(512, 512), + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + act_type='relu', + redundant_bias=False, + head_bn_level_first=True, + backbone_args=dict(drop_path_rate=0.1), + url='', + ), + + # Models ported from Tensorflow with pretrained backbones ported from Tensorflow + tf_efficientdet_d0=dict( + name='tf_efficientdet_d0', + backbone_name='tf_efficientnet_b0', + image_size=(512, 512), + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d0_34-f153e0cf.pth', + ), + tf_efficientdet_d1=dict( + name='tf_efficientdet_d1', + backbone_name='tf_efficientnet_b1', + image_size=(640, 640), + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d1_40-a30f94af.pth' + ), + tf_efficientdet_d2=dict( + name='tf_efficientdet_d2', + backbone_name='tf_efficientnet_b2', + image_size=(768, 768), + fpn_channels=112, + fpn_cell_repeats=5, + box_class_repeats=3, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d2_43-8107aa99.pth', + ), + tf_efficientdet_d3=dict( + name='tf_efficientdet_d3', + backbone_name='tf_efficientnet_b3', + image_size=(896, 896), + fpn_channels=160, + fpn_cell_repeats=6, + box_class_repeats=4, + backbone_args=dict(drop_path_rate=0.2), + 
url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d3_47-0b525f35.pth', + ), + tf_efficientdet_d4=dict( + name='tf_efficientdet_d4', + backbone_name='tf_efficientnet_b4', + image_size=(1024, 1024), + fpn_channels=224, + fpn_cell_repeats=7, + box_class_repeats=4, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d4_49-f56376d9.pth', + ), + tf_efficientdet_d5=dict( + name='tf_efficientdet_d5', + backbone_name='tf_efficientnet_b5', + image_size=(1280, 1280), + fpn_channels=288, + fpn_cell_repeats=7, + box_class_repeats=4, + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d5_51-c79f9be6.pth', + ), + tf_efficientdet_d6=dict( + name='tf_efficientdet_d6', + backbone_name='tf_efficientnet_b6', + image_size=(1280, 1280), + fpn_channels=384, + fpn_cell_repeats=8, + box_class_repeats=5, + fpn_name='bifpn_sum', # Use unweighted sum for training stability. + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d6_52-4eda3773.pth' + ), + tf_efficientdet_d7=dict( + name='tf_efficientdet_d7', + backbone_name='tf_efficientnet_b6', + image_size=(1536, 1536), + fpn_channels=384, + fpn_cell_repeats=8, + box_class_repeats=5, + anchor_scale=5.0, + fpn_name='bifpn_sum', # Use unweighted sum for training stability. + backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d7_53-6d1d7a95.pth' + ), + tf_efficientdet_d7x=dict( + name='tf_efficientdet_d7x', + backbone_name='tf_efficientnet_b7', + image_size=(1536, 1536), + fpn_channels=384, + fpn_cell_repeats=8, + box_class_repeats=5, + anchor_scale=4.0, + max_level=8, + fpn_name='bifpn_sum', # Use unweighted sum for training stability. 
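        # max_level=8 adds an extra pyramid level (P3-P8) compared with the other
        # D-series models; given the divisibility assert in Anchors, the 1536x1536
        # input must be a multiple of 2**8 = 256, which it is.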
+ backbone_args=dict(drop_path_rate=0.2), + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_d7x-f390b87c.pth' + ), + + # The lite configs are in TF automl repository but no weights yet and listed as 'not final' + tf_efficientdet_lite0=dict( + name='tf_efficientdet_lite0', + backbone_name='tf_efficientnet_lite0', + image_size=(512, 512), + fpn_channels=64, + fpn_cell_repeats=3, + box_class_repeats=3, + act_type='relu', + redundant_bias=False, + backbone_args=dict(drop_path_rate=0.1), + # unlike other tf_ models, this was not ported from tf automl impl, but trained from tf pretrained efficient lite + # weights using this code, will likely replace if/when official det-lite weights are released + url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/tf_efficientdet_lite0-f5f303a9.pth', + ), + tf_efficientdet_lite1=dict( + name='tf_efficientdet_lite1', + backbone_name='tf_efficientnet_lite1', + image_size=(640, 640), + fpn_channels=88, + fpn_cell_repeats=4, + box_class_repeats=3, + act_type='relu', + backbone_args=dict(drop_path_rate=0.2), + url='', # no pretrained weights yet + ), + tf_efficientdet_lite2=dict( + name='tf_efficientdet_lite2', + backbone_name='tf_efficientnet_lite2', + image_size=(768, 768), + fpn_channels=112, + fpn_cell_repeats=5, + box_class_repeats=3, + act_type='relu', + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + tf_efficientdet_lite3=dict( + name='tf_efficientdet_lite3', + backbone_name='tf_efficientnet_lite3', + image_size=(896, 896), + fpn_channels=160, + fpn_cell_repeats=6, + box_class_repeats=4, + act_type='relu', + backbone_args=dict(drop_path_rate=0.2), + url='', + ), + tf_efficientdet_lite4=dict( + name='tf_efficientdet_lite4', + backbone_name='tf_efficientnet_lite4', + image_size=(1024, 1024), + fpn_channels=224, + fpn_cell_repeats=7, + box_class_repeats=4, + act_type='relu', + backbone_args=dict(drop_path_rate=0.2), + url='', + ), +) + + +def get_efficientdet_config(model_name='tf_efficientdet_d1'): + """Get the default config for EfficientDet based on model name.""" + h = default_detection_model_configs() + h.update(efficientdet_model_param_dict[model_name]) + h.num_levels = h.max_level - h.min_level + 1 + return deepcopy(h) # may be unnecessary, ensure no references to param dict values diff --git a/efficientdet/effdet/config/train_config.py b/efficientdet/effdet/config/train_config.py new file mode 100644 index 0000000000000000000000000000000000000000..88deab0a5f3046c5e43d3c7ac8bb2269ee606875 --- /dev/null +++ b/efficientdet/effdet/config/train_config.py @@ -0,0 +1,34 @@ +from omegaconf import OmegaConf + + +def default_detection_train_config(): + # FIXME currently using args for train config, will revisit, perhaps move to Hydra + h = OmegaConf.create() + + # dataset + h.skip_crowd_during_training = True + + # augmentation + h.input_rand_hflip = True + h.train_scale_min = 0.1 + h.train_scale_max = 2.0 + h.autoaugment_policy = None + + # optimization + h.momentum = 0.9 + h.learning_rate = 0.08 + h.lr_warmup_init = 0.008 + h.lr_warmup_epoch = 1.0 + h.first_lr_drop_epoch = 200.0 + h.second_lr_drop_epoch = 250.0 + h.clip_gradients_norm = 10.0 + h.num_epochs = 300 + + # regularization l2 loss. 
+ h.weight_decay = 4e-5 + + h.lr_decay_method = 'cosine' + h.moving_average_decay = 0.9998 + h.ckpt_var_scope = None + + return h diff --git a/efficientdet/effdet/data/__init__.py b/efficientdet/effdet/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc1ac591550d6f2f25afc3c9de08f28a2c07287 --- /dev/null +++ b/efficientdet/effdet/data/__init__.py @@ -0,0 +1,6 @@ +from .dataset_factory import create_dataset +from .dataset import DetectionDatset, SkipSubset +from .input_config import resolve_input_config +from .loader import create_loader +from .parsers import create_parser +from .transforms import * diff --git a/efficientdet/effdet/data/dataset.py b/efficientdet/effdet/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7296a46fd84aabed897718279dd52eb4e0707921 --- /dev/null +++ b/efficientdet/effdet/data/dataset.py @@ -0,0 +1,145 @@ +""" Detection dataset + +Hacked together by Ross Wightman +""" +import torch.utils.data as data +import numpy as np +import albumentations as A +import torch + +from PIL import Image +from .parsers import create_parser + + +class DetectionDatset(data.Dataset): + """`Object Detection Dataset. Use with parsers for COCO, VOC, and OpenImages. + Args: + parser (string, Parser): + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.ToTensor`` + + """ + + def __init__(self, data_dir, parser=None, parser_kwargs=None, transform=None, transforms=None): + super(DetectionDatset, self).__init__() + parser_kwargs = parser_kwargs or {} + self.data_dir = data_dir + if isinstance(parser, str): + self._parser = create_parser(parser, **parser_kwargs) + else: + assert parser is not None and len(parser.img_ids) + self._parser = parser + self._transform = transform + self._transforms = transforms + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: Tuple (image, annotations (target)). 
+ """ + img_info = self._parser.img_infos[index] + target = dict(img_idx=index, img_size=(img_info['width'], img_info['height'])) + if self._parser.has_labels: + ann = self._parser.get_ann_info(index) + target.update(ann) + img_path = self.data_dir / img_info['file_name'] + img = Image.open(img_path).convert('RGB') + if self.transforms is not None: + img = torch.as_tensor(np.array(img), dtype=torch.uint8) + voc_boxes = [] + for coord in target['bbox']: + xmin = coord[1] + ymin = coord[0] + xmax = coord[3] + ymax = coord[2] + if xmin<1: + xmin = 1 + if ymin<1: + ymin = 1 + if xmax>=img.shape[1]-1: + xmax = img.shape[1]-1 + if ymax>=img.shape[0]-1: + ymax = img.shape[0]-1 + voc_boxes.append([xmin, ymin, xmax, ymax]) + transformed = self.transforms(image=np.array(img), bbox_classes=target['cls'], bboxes=voc_boxes) + img = torch.as_tensor(transformed['image'], dtype=torch.uint8) + target['bbox'] = [] + for coord in transformed['bboxes']: + ymin = int(coord[1]) + xmin = int(coord[0]) + ymax = int(coord[3]) + xmax = int(coord[2]) + target['bbox'].append([ymin, xmin, ymax, xmax]) + target['bbox'] = np.array(target['bbox'], dtype=np.float32) + target['cls'] = np.array(transformed['bbox_classes']) + img = Image.fromarray(np.array(img).astype('uint8'), 'RGB') + target['img_size'] = img.size + + if self.transform is not None: + img, target = self.transform(img, target) + + return img, target + + def __len__(self): + return len(self._parser.img_ids) + + @property + def parser(self): + return self._parser + + @property + def transform(self): + return self._transform + + @transform.setter + def transform(self, t): + self._transform = t + + @property + def transforms(self): + return self._transforms + + @transforms.setter + def transforms(self, t): + self._transforms = t + +class SkipSubset(data.Dataset): + r""" + Subset of a dataset at specified indices. 
+ + Arguments: + dataset (Dataset): The whole Dataset + n (int): skip rate (select every nth) + """ + def __init__(self, dataset, n=2): + self.dataset = dataset + assert n >= 1 + self.indices = np.arange(len(dataset))[::n] + + def __getitem__(self, idx): + return self.dataset[self.indices[idx]] + + def __len__(self): + return len(self.indices) + + @property + def parser(self): + return self.dataset.parser + + @property + def transform(self): + return self.dataset.transform + + @transform.setter + def transform(self, t): + self.dataset.transform = t + + @property + def transforms(self): + return self.dataset.transforms + + @transforms.setter + def transforms(self, t): + self.dataset.transforms = t diff --git a/efficientdet/effdet/data/dataset_config.py b/efficientdet/effdet/data/dataset_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e5f17b4c368384a0bb98cd53fa2e144cf43d54df --- /dev/null +++ b/efficientdet/effdet/data/dataset_config.py @@ -0,0 +1,194 @@ +""" COCO detect-waste dataset configurations + +Updated 2021 Wimlds in Detect Waste in Pomerania +""" +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class CocoCfg: + variant: str = None + parser: str = 'coco' + num_classes: int = 80 + splits: Dict[str, dict] = None + + +@dataclass +class TACOCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 28 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class DetectwasteCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 7 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class BinaryCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 1 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class BinaryMultiCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 1 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class TrashCanCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 8 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class UAVVasteCfg(CocoCfg): + root: str = "" + ann: str = "" + variant: str = '2017' + num_classes: int = 1 + + def add_split(self): + self.splits = { + 'train': {'ann_filename': self.ann+'_train.json', + 'img_dir': self.root, + 'has_labels': True}, + 'val': {'ann_filename': self.ann+'_test.json', + 'img_dir': self.root, + 'has_labels': True} + } + + +@dataclass +class ICRACfg(CocoCfg): + root: str = "" + 
ann: str = ""
+    variant: str = '2017'
+    num_classes: int = 7
+
+    def add_split(self):
+        self.splits = {
+            'train': {'ann_filename': self.ann+'_train.json',
+                      'img_dir': self.root,
+                      'has_labels': True},
+            'val': {'ann_filename': self.ann+'_test.json',
+                    'img_dir': self.root,
+                    'has_labels': True}
+        }
+
+
+@dataclass
+class DrinkWasteCfg(CocoCfg):
+    root: str = ""
+    ann: str = ""
+    variant: str = '2017'
+    num_classes: int = 4
+
+    def add_split(self):
+        self.splits = {
+            'train': {'ann_filename': self.ann+'_train.json',
+                      'img_dir': self.root,
+                      'has_labels': True},
+            'val': {'ann_filename': self.ann+'_test.json',
+                    'img_dir': self.root,
+                    'has_labels': True}
+        }
+
+
+@dataclass
+class MJU_WasteCfg(CocoCfg):
+    root: str = ""
+    ann: str = ""
+    variant: str = '2017'
+    num_classes: int = 1
+
+    def add_split(self):
+        self.splits = {
+            'train': {'ann_filename': self.ann+'_train.json',
+                      'img_dir': self.root,
+                      'has_labels': True},
+            'val': {'ann_filename': self.ann+'_test.json',
+                    'img_dir': self.root,
+                    'has_labels': True}
+        }
+
+
+@dataclass
+class WadeCfg(CocoCfg):
+    root: str = ""
+    ann: str = ""
+    variant: str = '2017'
+    num_classes: int = 1
+
+    def add_split(self):
+        self.splits = {
+            'train': {'ann_filename': self.ann+'_train.json',
+                      'img_dir': self.root,
+                      'has_labels': True},
+            'val': {'ann_filename': self.ann+'_test.json',
+                    'img_dir': self.root,
+                    'has_labels': True}
+        }
diff --git a/efficientdet/effdet/data/dataset_factory.py b/efficientdet/effdet/data/dataset_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..5497e2732637506148928f68e00adaddb79a6c92
--- /dev/null
+++ b/efficientdet/effdet/data/dataset_factory.py
@@ -0,0 +1,85 @@
+""" Dataset factory
+
+Updated 2021 Wimlds in Detect Waste in Pomerania
+"""
+from collections import OrderedDict
+from pathlib import Path
+
+from .dataset_config import *
+from .parsers import *
+from .dataset import DetectionDatset
+from .parsers import create_parser
+
+# list of detect-waste datasets
+waste_datasets_list = ['taco', 'detectwaste', 'binary', 'multi',
+                       'uav', 'mju', 'trashcan', 'wade', 'icra',
+                       'drinkwaste']
+
+
+def create_dataset(name, root, ann, splits=('train', 'val')):
+    if isinstance(splits, str):
+        splits = (splits,)
+    name = name.lower()
+    root = Path(root)
+    dataset_cls = DetectionDatset
+    datasets = OrderedDict()
+    if name.startswith('coco'):
+        if 'coco2014' in name:
+            dataset_cfg = Coco2014Cfg()
+        else:
+            dataset_cfg = Coco2017Cfg()
+        for s in splits:
+            if s not in dataset_cfg.splits:
+                raise RuntimeError(f'{s} split not found in config')
+            split_cfg = dataset_cfg.splits[s]
+            ann_file = root / split_cfg['ann_filename']
+            parser_cfg = CocoParserCfg(
+                ann_filename=ann_file,
+                has_labels=split_cfg['has_labels']
+            )
+            datasets[s] = dataset_cls(
+                data_dir=root / Path(split_cfg['img_dir']),
+                parser=create_parser(dataset_cfg.parser, cfg=parser_cfg),
+            )
+    elif name in waste_datasets_list:
+        if name.startswith('taco'):
+            dataset_cfg = TACOCfg(root=root, ann=ann)
+        elif name.startswith('detectwaste'):
+            dataset_cfg = DetectwasteCfg(root=root, ann=ann)
+        elif name.startswith('binary'):
+            dataset_cfg = BinaryCfg(root=root, ann=ann)
+        elif name.startswith('multi'):
+            dataset_cfg = BinaryMultiCfg(root=root, ann=ann)
+        elif name.startswith('uav'):
+            dataset_cfg = UAVVasteCfg(root=root, ann=ann)
+        elif name.startswith('trashcan'):
+            dataset_cfg = TrashCanCfg(root=root, ann=ann)
+        elif name.startswith('drinkwaste'):
+            dataset_cfg = DrinkWasteCfg(root=root, ann=ann)
+        elif 
name.startswith('mju'): + dataset_cfg = MJU_WasteCfg(root=root, ann=ann) + elif name.startswith('wade'): + dataset_cfg = WadeCfg(root=root, ann=ann) + elif name.startswith('icra'): + dataset_cfg = ICRACfg(root=root, ann=ann) + else: + assert False, f'Unknown dataset parser ({name})' + dataset_cfg.add_split() + for s in splits: + if s not in dataset_cfg.splits: + raise RuntimeError(f'{s} split not found in config') + split_cfg = dataset_cfg.splits[s] + parser_cfg = CocoParserCfg( + ann_filename=split_cfg['ann_filename'], + has_labels=split_cfg['has_labels'] + ) + datasets[s] = dataset_cls( + data_dir=split_cfg['img_dir'], + parser=create_parser(dataset_cfg.parser, cfg=parser_cfg), + ) + else: + assert False, f'Unknown dataset parser ({name})' + + datasets = list(datasets.values()) + return datasets if len(datasets) > 1 else datasets[0] diff --git a/efficientdet/effdet/data/input_config.py b/efficientdet/effdet/data/input_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3d7e96e99161dc442afcb0b239afcb72939d08ea --- /dev/null +++ b/efficientdet/effdet/data/input_config.py @@ -0,0 +1,60 @@ +from .transforms import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + +def resolve_input_config(args, model_config=None, model=None): + if not isinstance(args, dict): + args = vars(args) + input_config = {} + if not model_config and model is not None and hasattr(model, 'config'): + model_config = model.config + + # Resolve input/image size + in_chans = 3 + input_size = (in_chans, 512, 512) + + if 'input_size' in model_config: + input_size = tuple(model_config['input_size']) + elif 'image_size' in model_config: + input_size = (in_chans,) + tuple(model_config['image_size']) + assert isinstance(input_size, tuple) and len(input_size) == 3 + input_config['input_size'] = input_size + + # resolve interpolation method + input_config['interpolation'] = 'bicubic' + if 'interpolation' in args and args['interpolation']: + input_config['interpolation'] = args['interpolation'] + elif 'interpolation' in model_config: + input_config['interpolation'] = model_config['interpolation'] + + # resolve dataset + model mean for normalization + input_config['mean'] = IMAGENET_DEFAULT_MEAN + if 'mean' in args and args['mean'] is not None: + mean = tuple(args['mean']) + if len(mean) == 1: + mean = tuple(list(mean) * in_chans) + else: + assert len(mean) == in_chans + input_config['mean'] = mean + elif 'mean' in model_config: + input_config['mean'] = model_config['mean'] + + # resolve dataset + model std deviation for normalization + input_config['std'] = IMAGENET_DEFAULT_STD + if 'std' in args and args['std'] is not None: + std = tuple(args['std']) + if len(std) == 1: + std = tuple(list(std) * in_chans) + else: + assert len(std) == in_chans + input_config['std'] = std + elif 'std' in model_config: + input_config['std'] = model_config['std'] + + # resolve letterbox fill color + input_config['fill_color'] = 'mean' + if 'fill_color' in args and args['fill_color'] is not None: + input_config['fill_color'] = args['fill_color'] + elif 'fill_color' in model_config: + input_config['fill_color'] = model_config['fill_color'] + + return input_config diff --git a/efficientdet/effdet/data/loader.py b/efficientdet/effdet/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..9956d0e9090e8d94d57a72ce185924638de7871e --- /dev/null +++ b/efficientdet/effdet/data/loader.py @@ -0,0 +1,226 @@ +""" Object detection loader/collate + +Hacked together by / Copyright 2020 Ross Wightman +""" +import 
torch.utils.data +from .transforms import * +from .transforms_albumentation import get_transform +from .random_erasing import RandomErasing +from effdet.anchors import AnchorLabeler +from timm.data.distributed_sampler import OrderedDistributedSampler +import os + +MAX_NUM_INSTANCES = 100 + + +class DetectionFastCollate: + """ A detection specific, optimized collate function w/ a bit of state. + + Optionally performs anchor labelling. Doing this here offloads some work from the + GPU and the main training process thread and increases the load on the dataloader + threads. + + """ + def __init__( + self, + instance_keys=None, + instance_shapes=None, + instance_fill=-1, + max_instances=MAX_NUM_INSTANCES, + anchor_labeler=None, + ): + instance_keys = instance_keys or {'bbox', 'bbox_ignore', 'cls'} + instance_shapes = instance_shapes or dict( + bbox=(max_instances, 4), bbox_ignore=(max_instances, 4), cls=(max_instances,)) + self.instance_info = {k: dict(fill=instance_fill, shape=instance_shapes[k]) for k in instance_keys} + self.max_instances = max_instances + self.anchor_labeler = anchor_labeler + + def __call__(self, batch): + batch_size = len(batch) + target = dict() + labeler_outputs = dict() + img_tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + img_tensor[i] += torch.from_numpy(batch[i][0]) + labeler_inputs = {} + for tk, tv in batch[i][1].items(): + instance_info = self.instance_info.get(tk, None) + if instance_info is not None: + # target tensor is associated with a detection instance + tv = torch.from_numpy(tv).to(dtype=torch.float32) + if self.anchor_labeler is None: + if i == 0: + shape = (batch_size,) + instance_info['shape'] + target_tensor = torch.full(shape, instance_info['fill'], dtype=torch.float32) + target[tk] = target_tensor + else: + target_tensor = target[tk] + num_elem = min(tv.shape[0], self.max_instances) + target_tensor[i, 0:num_elem] = tv[0:num_elem] + else: + # no need to pass gt tensors through when labeler in use + if tk in ('bbox', 'cls'): + labeler_inputs[tk] = tv + else: + # target tensor is an image-level annotation / metadata + if i == 0: + # first batch elem, create destination tensors + if isinstance(tv, (tuple, list)): + # per batch elem sequence + shape = (batch_size, len(tv)) + dtype = torch.float32 if isinstance(tv[0], (float, np.floating)) else torch.int32 + else: + # per batch elem scalar + shape = batch_size, + dtype = torch.float32 if isinstance(tv, (float, np.floating)) else torch.int64 + target_tensor = torch.zeros(shape, dtype=dtype) + target[tk] = target_tensor + else: + target_tensor = target[tk] + target_tensor[i] = torch.tensor(tv, dtype=target_tensor.dtype) + + if self.anchor_labeler is not None: + cls_targets, box_targets, num_positives = self.anchor_labeler.label_anchors( + labeler_inputs['bbox'], labeler_inputs['cls'], filter_valid=False) + if i == 0: + # first batch elem, create destination tensors, separate key per level + for j, (ct, bt) in enumerate(zip(cls_targets, box_targets)): + labeler_outputs[f'label_cls_{j}'] = torch.zeros( + (batch_size,) + ct.shape, dtype=torch.int64) + labeler_outputs[f'label_bbox_{j}'] = torch.zeros( + (batch_size,) + bt.shape, dtype=torch.float32) + labeler_outputs['label_num_positives'] = torch.zeros(batch_size) + for j, (ct, bt) in enumerate(zip(cls_targets, box_targets)): + labeler_outputs[f'label_cls_{j}'][i] = ct + labeler_outputs[f'label_bbox_{j}'][i] = bt + labeler_outputs['label_num_positives'][i] = num_positives + if labeler_outputs: + 
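+            # fold the per-level anchor labels built above (label_cls_*, label_bbox_*,
+            # label_num_positives) into the shared target dict before returning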
target.update(labeler_outputs) + + return img_tensor, target + + +class PrefetchLoader: + + def __init__(self, + loader, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + re_prob=0., + re_mode='pixel', + re_count=1, + ): + self.loader = loader + self.mean = torch.tensor([x * 255 for x in mean]).cuda().view(1, 3, 1, 1) + self.std = torch.tensor([x * 255 for x in std]).cuda().view(1, 3, 1, 1) + if re_prob > 0.: + self.random_erasing = RandomErasing(probability=re_prob, mode=re_mode, max_count=re_count) + else: + self.random_erasing = None + + def __iter__(self): + stream = torch.cuda.Stream() + first = True + + for next_input, next_target in self.loader: + with torch.cuda.stream(stream): + next_input = next_input.cuda(non_blocking=True) + next_input = next_input.float().sub_(self.mean).div_(self.std) + next_target = {k: v.cuda(non_blocking=True) for k, v in next_target.items()} + if self.random_erasing is not None: + next_input = self.random_erasing(next_input, next_target) + + if not first: + yield input, target + else: + first = False + + torch.cuda.current_stream().wait_stream(stream) + input = next_input + target = next_target + + yield input, target + + def __len__(self): + return len(self.loader) + + @property + def sampler(self): + return self.loader.sampler + + @property + def dataset(self): + return self.loader.dataset + + +def create_loader( + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + re_prob=0., + re_mode='pixel', + re_count=1, + interpolation='bilinear', + fill_color='mean', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + pin_mem=False, + anchor_labeler=None, +): + if isinstance(input_size, tuple): + img_size = input_size[-2:] + else: + img_size = input_size + + if is_training: + transforms = get_transform() + transform = transforms_coco_train( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + fill_color=fill_color, + mean=mean, + std=std) + else: + transforms = None + transform = transforms_coco_eval( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + fill_color=fill_color, + mean=mean, + std=std) + dataset.transforms = transforms + dataset.transform = transform + + sampler = None + if distributed: + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + collate_fn = DetectionFastCollate(anchor_labeler=anchor_labeler) + loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + shuffle=sampler is None and is_training, + num_workers=num_workers, + sampler=sampler, + pin_memory=pin_mem, + collate_fn=collate_fn, + ) + if use_prefetcher: + if is_training: + loader = PrefetchLoader(loader, mean=mean, std=std, re_prob=re_prob, re_mode=re_mode, re_count=re_count) + else: + loader = PrefetchLoader(loader, mean=mean, std=std) + + return loader diff --git a/efficientdet/effdet/data/parsers/__init__.py b/efficientdet/effdet/data/parsers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bea708cb8cc1c8a9e7a263dabc438211d9f79c2c --- /dev/null +++ b/efficientdet/effdet/data/parsers/__init__.py @@ -0,0 +1,2 @@ +from .parser_config import OpenImagesParserCfg, CocoParserCfg, VocParserCfg +from .parser_factory import create_parser diff --git 
a/efficientdet/effdet/data/parsers/parser.py b/efficientdet/effdet/data/parsers/parser.py new file mode 100644 index 0000000000000000000000000000000000000000..b593280520fb0eae22f9b127908a52da882ce76c --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser.py @@ -0,0 +1,82 @@ +from numbers import Integral +from typing import List, Union, Dict, Any + + +class Parser: + """ Parser base class. + + The attributes listed below make up a public interface common to all parsers. They can be accessed directly + once the dataset is constructed and annotations are populated. + + Attributes: + + cat_names (list[str]): + list of category (class) names, with background class at position 0. + cat_ids (list[union[str, int]): + list of dataset specific, unique integer or string category ids, does not include background + cat_id_to_label (dict): + map from category id to integer 1-indexed class label + + img_ids (list): + list of dataset specific, unique image ids corresponding to valid samples in dataset + img_ids_invalid (list): + list of image ids corresponding to invalid images, not used as samples + img_infos (list[dict]): + image info, list of info dicts with filename, width, height for each image sample + """ + def __init__( + self, + bbox_yxyx: bool = False, + has_labels: bool = True, + include_masks: bool = False, + include_bboxes_ignore: bool = False, + ignore_empty_gt: bool = False, + min_img_size: int = 32, + ): + """ + Args: + yxyx (bool): output coords in yxyx format, otherwise xyxy + has_labels (bool): dataset has labels (for training validation, False usually for test sets) + include_masks (bool): include segmentation masks in target output (not supported yet for any dataset) + include_bboxes_ignore (bool): include ignored bbox in target output + ignore_empty_gt (bool): ignore images with no ground truth (no negative images) + min_img_size (bool): ignore images with width or height smaller than this number + sub_sample (int): sample every N images from the dataset + """ + # parser config, determines how dataset parsed and validated + self.yxyx = bbox_yxyx + self.has_labels = has_labels + self.include_masks = include_masks + self.include_bboxes_ignore = include_bboxes_ignore + self.ignore_empty_gt = ignore_empty_gt + self.min_img_size = min_img_size + self.label_offset = 1 + + # Category (class) metadata. Populated by _load_annotations() + self.cat_names: List[str] = [] + self.cat_ids: List[Union[str, Integral]] = [] + self.cat_id_to_label: Dict[Union[str, Integral], Integral] = dict() + + # Image metadata. 
Populated by _load_annotations() + self.img_ids: List[Union[str, Integral]] = [] + self.img_ids_invalid: List[Union[str, Integral]] = [] + self.img_infos: List[Dict[str, Any]] = [] + + @property + def cat_dicts(self): + """return category names and labels in format compatible with TF Models Evaluator + list[dict(name=, id=)] + """ + return [ + dict( + name=name, + id=cat_id if not self.cat_id_to_label else self.cat_id_to_label[cat_id] + ) for name, cat_id in zip(self.cat_names, self.cat_ids)] + + @property + def max_label(self): + if self.cat_id_to_label: + return max(self.cat_id_to_label.values()) + else: + assert len(self.cat_ids) and isinstance(self.cat_ids[0], Integral) + return max(self.cat_ids) diff --git a/efficientdet/effdet/data/parsers/parser_coco.py b/efficientdet/effdet/data/parsers/parser_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..58bc2495c38c2b6835626528e37a4da119127d73 --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser_coco.py @@ -0,0 +1,93 @@ +""" COCO dataset parser + +Copyright 2020 Ross Wightman +""" +import numpy as np +from pycocotools.coco import COCO +from .parser import Parser +from .parser_config import CocoParserCfg + + +class CocoParser(Parser): + + def __init__(self, cfg: CocoParserCfg): + super().__init__( + bbox_yxyx=cfg.bbox_yxyx, + has_labels=cfg.has_labels, + include_masks=cfg.include_masks, + include_bboxes_ignore=cfg.include_bboxes_ignore, + ignore_empty_gt=cfg.has_labels and cfg.ignore_empty_gt, + min_img_size=cfg.min_img_size + ) + self.cat_ids_as_labels = True # this is the default for original TF EfficientDet models + self.coco = None + self._load_annotations(cfg.ann_filename) + + def get_ann_info(self, idx): + img_id = self.img_ids[idx] + return self._parse_img_ann(img_id) + + def _load_annotations(self, ann_file): + assert self.coco is None + self.coco = COCO(ann_file) + self.cat_ids = self.coco.getCatIds() + self.cat_names = [c['name'] for c in self.coco.loadCats(ids=self.cat_ids)] + if not self.cat_ids_as_labels: + self.cat_id_to_label = {cat_id: i + self.label_offset for i, cat_id in enumerate(self.cat_ids)} + img_ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values()) + for img_id in sorted(self.coco.imgs.keys()): + info = self.coco.loadImgs([img_id])[0] + if (min(info['width'], info['height']) < self.min_img_size or + (self.ignore_empty_gt and img_id not in img_ids_with_ann)): + self.img_ids_invalid.append(img_id) + continue + self.img_ids.append(img_id) + self.img_infos.append(info) + + def _parse_img_ann(self, img_id): + ann_ids = self.coco.getAnnIds(imgIds=[img_id]) + ann_info = self.coco.loadAnns(ann_ids) + bboxes = [] + bboxes_ignore = [] + cls = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if self.include_masks and ann['area'] <= 0: + continue + if w < 1 or h < 1: + continue + + if self.yxyx: + bbox = [y1, x1, y1 + h, x1 + w] + else: + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + if self.include_bboxes_ignore: + bboxes_ignore.append(bbox) + else: + bboxes.append(bbox) + cls.append(self.cat_id_to_label[ann['category_id']] if self.cat_id_to_label else ann['category_id']) + + if bboxes: + bboxes = np.array(bboxes, ndmin=2, dtype=np.float32) + cls = np.array(cls, dtype=np.int64) + else: + bboxes = np.zeros((0, 4), dtype=np.float32) + cls = np.array([], dtype=np.int64) + + if self.include_bboxes_ignore: + if bboxes_ignore: + bboxes_ignore = np.array(bboxes_ignore, ndmin=2, dtype=np.float32) + else: 
+ bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict(bbox=bboxes, cls=cls) + + if self.include_bboxes_ignore: + ann['bbox_ignore'] = bboxes_ignore + + return ann diff --git a/efficientdet/effdet/data/parsers/parser_config.py b/efficientdet/effdet/data/parsers/parser_config.py new file mode 100644 index 0000000000000000000000000000000000000000..8537d3e1b176e06a7d391552ae3f4fc602013270 --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser_config.py @@ -0,0 +1,49 @@ +""" Dataset parser configs + +Copyright 2020 Ross Wightman +""" +from dataclasses import dataclass + +__all__ = ['CocoParserCfg', 'OpenImagesParserCfg', 'VocParserCfg'] + + +@dataclass +class CocoParserCfg: + ann_filename: str # absolute path + include_masks: bool = False + include_bboxes_ignore: bool = False + has_labels: bool = True + bbox_yxyx: bool = True + min_img_size: int = 32 + ignore_empty_gt: bool = False + + +@dataclass +class VocParserCfg: + split_filename: str + ann_filename: str + img_filename: str = '%.jpg' + keep_difficult: bool = True + classes: list = None + add_background: bool = True + has_labels: bool = True + bbox_yxyx: bool = True + min_img_size: int = 32 + ignore_empty_gt: bool = False + + +@dataclass +class OpenImagesParserCfg: + categories_filename: str + img_info_filename: str + bbox_filename: str + img_label_filename: str = '' + masks_filename: str = '' + img_filename: str = '%s.jpg' # relative to dataset img_dir + task: str = 'obj' + prefix_levels: int = 1 + add_background: bool = True + has_labels: bool = True + bbox_yxyx: bool = True + min_img_size: int = 32 + ignore_empty_gt: bool = False diff --git a/efficientdet/effdet/data/parsers/parser_factory.py b/efficientdet/effdet/data/parsers/parser_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..9dcd46a740a5114902f28a22c9f186acefb81507 --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser_factory.py @@ -0,0 +1,19 @@ +""" Parser factory + +Copyright 2020 Ross Wightman +""" +from .parser_coco import CocoParser +from .parser_voc import VocParser +from .parser_open_images import OpenImagesParser + + +def create_parser(name, **kwargs): + if name == 'coco': + parser = CocoParser(**kwargs) + elif name == 'voc': + parser = VocParser(**kwargs) + elif name == 'openimages': + parser = OpenImagesParser(**kwargs) + else: + assert False, f'Unknown dataset parser ({name})' + return parser diff --git a/efficientdet/effdet/data/parsers/parser_open_images.py b/efficientdet/effdet/data/parsers/parser_open_images.py new file mode 100644 index 0000000000000000000000000000000000000000..3c201ac2df47924ac648c2de245b9ca807c1fbc0 --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser_open_images.py @@ -0,0 +1,211 @@ +""" OpenImages dataset parser + +Copyright 2020 Ross Wightman +""" +import numpy as np +import os +import logging + +from .parser import Parser +from .parser_config import OpenImagesParserCfg + +_logger = logging.getLogger(__name__) + + +class OpenImagesParser(Parser): + + def __init__(self, cfg: OpenImagesParserCfg): + super().__init__( + bbox_yxyx=cfg.bbox_yxyx, + has_labels=cfg.has_labels, + include_masks=False, # FIXME to support someday + include_bboxes_ignore=False, + ignore_empty_gt=cfg.has_labels and cfg.ignore_empty_gt, + min_img_size=cfg.min_img_size + ) + self.img_prefix_levels = cfg.prefix_levels + self.mask_prefix_levels = 1 + self._anns = None # access via get_ann_info() + self._img_to_ann = None + self._load_annotations( + categories_filename=cfg.categories_filename, + 
img_info_filename=cfg.img_info_filename, + img_filename=cfg.img_filename, + masks_filename=cfg.masks_filename, + bbox_filename=cfg.bbox_filename + ) + + def _load_annotations( + self, + categories_filename: str, + img_info_filename: str, + img_filename: str, + masks_filename: str, + bbox_filename: str, + ): + import pandas as pd # For now, blow up on pandas req only when trying to load open images anno + + _logger.info('Loading categories...') + classes_df = pd.read_csv(categories_filename, header=None) + self.cat_ids = classes_df[0].tolist() + self.cat_names = classes_df[1].tolist() + self.cat_id_to_label = {c: i + self.label_offset for i, c in enumerate(self.cat_ids)} + + def _img_filename(img_id): + # build image filenames that are relative to img_dir + filename = img_filename % img_id + if self.img_prefix_levels: + levels = [c for c in img_id[:self.img_prefix_levels]] + filename = os.path.join(*levels, filename) + return filename + + def _mask_filename(mask_path): + # FIXME finish + if self.mask_prefix_levels: + levels = [c for c in mask_path[:self.mask_prefix_levels]] + mask_path = os.path.join(*levels, mask_path) + return mask_path + + def _load_img_info(csv_file, select_img_ids=None): + _logger.info('Read img_info csv...') + img_info_df = pd.read_csv(csv_file, index_col='id') + + _logger.info('Filter images...') + if select_img_ids is not None: + img_info_df = img_info_df.loc[select_img_ids] + img_info_df = img_info_df[ + (img_info_df['width'] >= self.min_img_size) & (img_info_df['height'] >= self.min_img_size)] + + _logger.info('Mapping ids...') + img_info_df['img_id'] = img_info_df.index + img_info_df['file_name'] = img_info_df.index.map(lambda x: _img_filename(x)) + img_info_df = img_info_df[['img_id', 'file_name', 'width', 'height']] + img_sizes = img_info_df[['width', 'height']].values + self.img_infos = img_info_df.to_dict('records') + self.img_ids = img_info_df.index.values.tolist() + img_id_to_idx = {img_id: idx for idx, img_id in enumerate(self.img_ids)} + return img_sizes, img_id_to_idx + + if self.include_masks and self.has_labels: + masks_df = pd.read_csv(masks_filename) + + # NOTE currently using dataset masks anno ImageIDs to form valid img_ids from the dataset + anno_img_ids = sorted(masks_df['ImageID'].unique()) + img_sizes, img_id_to_idx = _load_img_info(img_info_filename, select_img_ids=anno_img_ids) + + masks_df['ImageIdx'] = masks_df['ImageID'].map(img_id_to_idx) + if np.issubdtype(masks_df.ImageIdx.dtype, np.floating): + masks_df = masks_df.dropna(axis='rows') + masks_df['ImageIdx'] = masks_df.ImageIdx.astype(np.int32) + masks_df.sort_values('ImageIdx', inplace=True) + ann_img_idx = masks_df['ImageIdx'].values + img_sizes = img_sizes[ann_img_idx] + masks_df['BoxXMin'] = masks_df['BoxXMin'] * img_sizes[:, 0] + masks_df['BoxXMax'] = masks_df['BoxXMax'] * img_sizes[:, 0] + masks_df['BoxYMin'] = masks_df['BoxYMin'] * img_sizes[:, 1] + masks_df['BoxYMax'] = masks_df['BoxYMax'] * img_sizes[:, 1] + masks_df['LabelIdx'] = masks_df['LabelName'].map(self.cat_id_to_label) + # FIXME remap mask filename with _mask_filename + + self._anns = dict( + bbox=masks_df[['BoxXMin', 'BoxYMin', 'BoxXMax', 'BoxYMax']].values.astype(np.float32), + label=masks_df[['LabelIdx']].values.astype(np.int32), + mask_path=masks_df[['MaskPath']].values + ) + _, ri, rc = np.unique(ann_img_idx, return_index=True, return_counts=True) + self._img_to_ann = list(zip(ri, rc)) # index, count tuples + elif self.has_labels: + _logger.info('Loading bbox...') + bbox_df = pd.read_csv(bbox_filename) + + # NOTE 
currently using dataset box anno ImageIDs to form valid img_ids from the larger dataset. + # FIXME use *imagelabels.csv or imagelabels-boxable.csv for negative examples (without box?) + anno_img_ids = sorted(bbox_df['ImageID'].unique()) + img_sizes, img_id_to_idx = _load_img_info(img_info_filename, select_img_ids=anno_img_ids) + + _logger.info('Process bbox...') + bbox_df['ImageIdx'] = bbox_df['ImageID'].map(img_id_to_idx) + if np.issubdtype(bbox_df.ImageIdx.dtype, np.floating): + bbox_df = bbox_df.dropna(axis='rows') + bbox_df['ImageIdx'] = bbox_df.ImageIdx.astype(np.int32) + bbox_df.sort_values('ImageIdx', inplace=True) + ann_img_idx = bbox_df['ImageIdx'].values + img_sizes = img_sizes[ann_img_idx] + bbox_df['XMin'] = bbox_df['XMin'] * img_sizes[:, 0] + bbox_df['XMax'] = bbox_df['XMax'] * img_sizes[:, 0] + bbox_df['YMin'] = bbox_df['YMin'] * img_sizes[:, 1] + bbox_df['YMax'] = bbox_df['YMax'] * img_sizes[:, 1] + bbox_df['LabelIdx'] = bbox_df['LabelName'].map(self.cat_id_to_label).astype(np.int32) + + self._anns = dict( + bbox=bbox_df[['XMin', 'YMin', 'XMax', 'YMax']].values.astype(np.float32), + label=bbox_df[['LabelIdx', 'IsGroupOf']].values.astype(np.int32), + ) + _, ri, rc = np.unique(ann_img_idx, return_index=True, return_counts=True) + self._img_to_ann = list(zip(ri, rc)) # index, count tuples + else: + _load_img_info(img_info_filename) + + _logger.info('Annotations loaded!') + + def get_ann_info(self, idx): + if not self.has_labels: + return dict() + start_idx, num_ann = self._img_to_ann[idx] + ann_keys = tuple(self._anns.keys()) + ann_values = tuple(self._anns[k][start_idx:start_idx + num_ann] for k in ann_keys) + return self._parse_ann_info(idx, ann_keys, ann_values) + + def _parse_ann_info(self, img_idx, ann_keys, ann_values): + """ + """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + if self.include_masks: + assert 'mask_path' in ann_keys + gt_masks = [] + + for ann in zip(*ann_values): + ann = dict(zip(ann_keys, ann)) + x1, y1, x2, y2 = ann['bbox'] + if x2 - x1 < 1 or y2 - y1 < 1: + continue + label = ann['label'][0] + iscrowd = False + if len(ann['label']) > 1: + iscrowd = ann['label'][1] + if self.yxyx: + bbox = np.array([y1, x1, y2, x2], dtype=np.float32) + else: + bbox = ann['bbox'] + if iscrowd: + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(label) + # if self.include_masks: + # img_info = self.img_infos[img_idx] + # mask_img = SegmentationMask(ann['mask_filename'], img_info['width'], img_info['height']) + # gt_masks.append(mask_img) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, ndmin=2, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if self.include_bboxes_ignore: + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, ndmin=2, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict(bbox=gt_bboxes, cls=gt_labels) + + if self.include_bboxes_ignore: + ann.update(dict(bbox_ignore=gt_bboxes_ignore, cls_ignore=np.array([], dtype=np.int64))) + if self.include_masks: + ann['masks'] = gt_masks + return ann diff --git a/efficientdet/effdet/data/parsers/parser_voc.py b/efficientdet/effdet/data/parsers/parser_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..554d43315d7c56f6a073109eab7d4af06c1cc8a7 --- /dev/null +++ b/efficientdet/effdet/data/parsers/parser_voc.py @@ -0,0 +1,148 @@ +""" Pascal VOC dataset parser + +Copyright 
2020 Ross Wightman +""" +import os +import xml.etree.ElementTree as ET +from collections import defaultdict +import numpy as np + +from .parser import Parser +from .parser_config import VocParserCfg + + +class VocParser(Parser): + + DEFAULT_CLASSES = ( + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + def __init__(self, cfg: VocParserCfg): + super().__init__( + bbox_yxyx=cfg.bbox_yxyx, + has_labels=cfg.has_labels, + include_masks=False, # FIXME to support someday + include_bboxes_ignore=False, + ignore_empty_gt=cfg.has_labels and cfg.ignore_empty_gt, + min_img_size=cfg.min_img_size + ) + self.correct_bbox = 1 + self.keep_difficult = cfg.keep_difficult + + self.anns = None + self.img_id_to_idx = {} + self._load_annotations( + split_filename=cfg.split_filename, + img_filename=cfg.img_filename, + ann_filename=cfg.ann_filename, + classes=cfg.classes, + ) + + def _load_annotations( + self, + split_filename: str, + img_filename: str, + ann_filename: str, + classes=None, + ): + classes = classes or self.DEFAULT_CLASSES + self.cat_names = list(classes) + self.cat_ids = self.cat_names + self.cat_id_to_label = {cat: i + self.label_offset for i, cat in enumerate(self.cat_ids)} + + self.anns = [] + + with open(split_filename) as f: + ids = f.readlines() + for img_id in ids: + img_id = img_id.strip("\n") + filename = img_filename % img_id + xml_path = ann_filename % img_id + tree = ET.parse(xml_path) + root = tree.getroot() + size = root.find('size') + width = int(size.find('width').text) + height = int(size.find('height').text) + if min(width, height) < self.min_img_size: + continue + + anns = [] + for obj_idx, obj in enumerate(root.findall('object')): + name = obj.find('name').text + label = self.cat_id_to_label[name] + difficult = int(obj.find('difficult').text) + bnd_box = obj.find('bndbox') + bbox = [ + int(bnd_box.find('xmin').text), + int(bnd_box.find('ymin').text), + int(bnd_box.find('xmax').text), + int(bnd_box.find('ymax').text) + ] + anns.append(dict(label=label, bbox=bbox, difficult=difficult)) + + if not self.ignore_empty_gt or len(anns): + self.anns.append(anns) + self.img_infos.append(dict(id=img_id, file_name=filename, width=width, height=height)) + self.img_ids.append(img_id) + else: + self.img_ids_invalid.append(img_id) + + def merge(self, other): + assert len(self.cat_ids) == len(other.cat_ids) + self.img_ids.extend(other.img_ids) + self.img_infos.extend(other.img_infos) + self.anns.extend(other.anns) + + def get_ann_info(self, idx): + return self._parse_ann_info(self.anns[idx]) + + def _parse_ann_info(self, ann_info): + bboxes = [] + labels = [] + bboxes_ignore = [] + labels_ignore = [] + for ann in ann_info: + ignore = False + x1, y1, x2, y2 = ann['bbox'] + label = ann['label'] + w = x2 - x1 + h = y2 - y1 + if w < 1 or h < 1: + ignore = True + if self.yxyx: + bbox = [y1, x1, y2, x2] + else: + bbox = ann['bbox'] + if ignore or (ann['difficult'] and not self.keep_difficult): + bboxes_ignore.append(bbox) + labels_ignore.append(label) + else: + bboxes.append(bbox) + labels.append(label) + + if not bboxes: + bboxes = np.zeros((0, 4), dtype=np.float32) + labels = np.zeros((0, ), dtype=np.float32) + else: + bboxes = np.array(bboxes, ndmin=2, dtype=np.float32) - self.correct_bbox + labels = np.array(labels, dtype=np.float32) + + if self.include_bboxes_ignore: + if not bboxes_ignore: + bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + 
labels_ignore = np.zeros((0, ), dtype=np.float32) + else: + bboxes_ignore = np.array(bboxes_ignore, ndmin=2, dtype=np.float32) - self.correct_bbox + labels_ignore = np.array(labels_ignore, dtype=np.float32) + + ann = dict( + bbox=bboxes.astype(np.float32), + cls=labels.astype(np.int64)) + + if self.include_bboxes_ignore: + ann.update(dict( + bbox_ignore=bboxes_ignore.astype(np.float32), + cls_ignore=labels_ignore.astype(np.int64))) + return ann + diff --git a/efficientdet/effdet/data/random_erasing.py b/efficientdet/effdet/data/random_erasing.py new file mode 100644 index 0000000000000000000000000000000000000000..ded751ecf0b22a106f5eccd78fdc4fe3b83a44fd --- /dev/null +++ b/efficientdet/effdet/data/random_erasing.py @@ -0,0 +1,94 @@ +""" Multi-Scale RandomErasing + +Copyright 2020 Ross Wightman +""" +import random +import math +import torch + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): + # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() + # paths, flip the order so normal is run on CPU if this becomes a problem + # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 + if per_pixel: + return torch.empty(patch_size, dtype=dtype, device=device).normal_() + elif rand_color: + return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() + else: + return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) + + +class RandomErasing: + """ Randomly selects a rectangle region in an image and erases its pixels. + 'Random Erasing Data Augmentation' by Zhong et al. + See https://arxiv.org/pdf/1708.04896.pdf + + This variant of RandomErasing is tweaked for multi-scale obj detection training. + Args: + probability: Probability that the Random Erasing operation will be performed. + min_area: Minimum percentage of erased area wrt input image area. + max_area: Maximum percentage of erased area wrt input image area. + min_aspect: Minimum aspect ratio of erased area. + mode: pixel color mode, one of 'const', 'rand', or 'pixel' + 'const' - erase block is constant color of 0 for all channels + 'rand' - erase block is same per-channel random (normal) color + 'pixel' - erase block is per-pixel random (normal) color + max_count: maximum number of erasing blocks per image, area per box is scaled by count. + per-image count is randomly chosen between 1 and this value. 
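+
+    A rough usage sketch (assuming images is the normalized NCHW CUDA batch and target is
+    the dict produced by the detection collate/prefetch pipeline, which carries the
+    'img_scale' and 'img_size' entries used below):
+
+        erase = RandomErasing(probability=0.3, mode='pixel', max_count=2, device='cuda')
+        images = erase(images, target)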
+ """ + + def __init__( + self, + probability=0.5, min_area=0.02, max_area=1/4, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'): + self.probability = probability + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == 'rand': + self.rand_color = True # per block random normal + elif mode == 'pixel': + self.per_pixel = True # per pixel random normal + else: + assert not mode or mode == 'const' + self.device = device + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.probability: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top + h, left:left + w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype, device=self.device) + break + + def __call__(self, input, target): + batch_size, chan, input_h, input_w = input.shape + img_scales = target['img_scale'] + img_size = (target['img_size'] / img_scales.unsqueeze(1)).int() + img_size[:, 0] = img_size[:, 0].clamp(max=input_w) + img_size[:, 1] = img_size[:, 1].clamp(max=input_h) + # skip first slice of batch if num_splits is set (for clean portion of samples) + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_size[i, 1], img_size[i, 0], input.dtype) + return input diff --git a/efficientdet/effdet/data/transforms.py b/efficientdet/effdet/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..c62b7727a4395f23927700efef6d59c1d36af2c5 --- /dev/null +++ b/efficientdet/effdet/data/transforms.py @@ -0,0 +1,275 @@ +""" COCO transforms (quick and dirty) + +Hacked together by Ross Wightman +""" +import torch +from PIL import Image +import numpy as np +import random +import math + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) + + +class ImageToNumpy: + + def __call__(self, pil_img, annotations: dict): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.moveaxis(np_img, 2, 0) # HWC to CHW + return np_img, annotations + + +class ImageToTensor: + + def __init__(self, dtype=torch.float32): + self.dtype = dtype + + def __call__(self, pil_img, annotations: dict): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.moveaxis(np_img, 2, 0) # HWC to CHW + return torch.from_numpy(np_img).to(dtype=self.dtype), annotations + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return 
Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? + return Image.BILINEAR + + +def clip_boxes_(boxes, img_size): + height, width = img_size + clip_upper = np.array([height, width] * 2, dtype=boxes.dtype) + np.clip(boxes, 0, clip_upper, out=boxes) + + +def clip_boxes(boxes, img_size): + clipped_boxes = boxes.copy() + clip_boxes_(clipped_boxes, img_size) + return clipped_boxes + + +def _size_tuple(size): + if isinstance(size, int): + return size, size + else: + assert len(size) == 2 + return size + + +class ResizePad: + + def __init__(self, target_size: int, interpolation: str = 'bilinear', fill_color: tuple = (0, 0, 0)): + self.target_size = _size_tuple(target_size) + self.interpolation = interpolation + self.fill_color = fill_color + + def __call__(self, img, anno: dict): + width, height = img.size + + img_scale_y = self.target_size[0] / height + img_scale_x = self.target_size[1] / width + img_scale = min(img_scale_y, img_scale_x) + scaled_h = int(height * img_scale) + scaled_w = int(width * img_scale) + + new_img = Image.new("RGB", (self.target_size[1], self.target_size[0]), color=self.fill_color) + interp_method = _pil_interp(self.interpolation) + img = img.resize((scaled_w, scaled_h), interp_method) + new_img.paste(img) + + if 'bbox' in anno: + # FIXME haven't tested this path since not currently using dataset annotations for train/eval + bbox = anno['bbox'] + bbox[:, :4] *= img_scale + clip_boxes_(bbox, (scaled_h, scaled_w)) + valid_indices = (bbox[:, :2] < bbox[:, 2:4]).all(axis=1) + anno['bbox'] = bbox[valid_indices, :] + anno['cls'] = anno['cls'][valid_indices] + + anno['img_scale'] = 1. / img_scale # back to original + + return new_img, anno + + +class RandomResizePad: + + def __init__(self, target_size: int, scale: tuple = (0.1, 2.0), interpolation: str = 'bilinear', + fill_color: tuple = (0, 0, 0)): + self.target_size = _size_tuple(target_size) + self.scale = scale + self.interpolation = interpolation + self.fill_color = fill_color + + def _get_params(self, img): + # Select a random scale factor. + scale_factor = random.uniform(*self.scale) + scaled_target_height = scale_factor * self.target_size[0] + scaled_target_width = scale_factor * self.target_size[1] + + # Recompute the accurate scale_factor using rounded scaled image size. 
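+        # Taking the min of the two ratios keeps the resized image inside the scaled
+        # target box; when it still exceeds the real target size, the random offsets
+        # computed below choose the crop window.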
+ width, height = img.size + img_scale_y = scaled_target_height / height + img_scale_x = scaled_target_width / width + img_scale = min(img_scale_y, img_scale_x) + + # Select non-zero random offset (x, y) if scaled image is larger than target size + scaled_h = int(height * img_scale) + scaled_w = int(width * img_scale) + offset_y = scaled_h - self.target_size[0] + offset_x = scaled_w - self.target_size[1] + offset_y = int(max(0.0, float(offset_y)) * random.uniform(0, 1)) + offset_x = int(max(0.0, float(offset_x)) * random.uniform(0, 1)) + return scaled_h, scaled_w, offset_y, offset_x, img_scale + + def __call__(self, img, anno: dict): + scaled_h, scaled_w, offset_y, offset_x, img_scale = self._get_params(img) + + interp_method = _pil_interp(self.interpolation) + img = img.resize((scaled_w, scaled_h), interp_method) + right, lower = min(scaled_w, offset_x + self.target_size[1]), min(scaled_h, offset_y + self.target_size[0]) + img = img.crop((offset_x, offset_y, right, lower)) + new_img = Image.new("RGB", (self.target_size[1], self.target_size[0]), color=self.fill_color) + new_img.paste(img) + + if 'bbox' in anno: + # FIXME not fully tested + bbox = anno['bbox'].copy() # FIXME copy for debugger inspection, back to inplace + bbox[:, :4] *= img_scale + box_offset = np.stack([offset_y, offset_x] * 2) + bbox -= box_offset + clip_boxes_(bbox, (scaled_h, scaled_w)) + valid_indices = (bbox[:, :2] < bbox[:, 2:4]).all(axis=1) + anno['bbox'] = bbox[valid_indices, :] + anno['cls'] = anno['cls'][valid_indices] + + anno['img_scale'] = 1. / img_scale # back to original + + return new_img, anno + + +class RandomFlip: + + def __init__(self, horizontal=True, vertical=False, prob=0.5): + self.horizontal = horizontal + self.vertical = vertical + self.prob = prob + + def _get_params(self): + do_horizontal = random.random() < self.prob if self.horizontal else False + do_vertical = random.random() < self.prob if self.vertical else False + return do_horizontal, do_vertical + + def __call__(self, img, annotations: dict): + do_horizontal, do_vertical = self._get_params() + width, height = img.size + + def _fliph(bbox): + x_max = width - bbox[:, 1] + x_min = width - bbox[:, 3] + bbox[:, 1] = x_min + bbox[:, 3] = x_max + + def _flipv(bbox): + y_max = height - bbox[:, 0] + y_min = height - bbox[:, 2] + bbox[:, 0] = y_min + bbox[:, 2] = y_max + + if do_horizontal and do_vertical: + img = img.transpose(Image.ROTATE_180) + if 'bbox' in annotations: + _fliph(annotations['bbox']) + _flipv(annotations['bbox']) + elif do_horizontal: + img = img.transpose(Image.FLIP_LEFT_RIGHT) + if 'bbox' in annotations: + _fliph(annotations['bbox']) + elif do_vertical: + img = img.transpose(Image.FLIP_TOP_BOTTOM) + if 'bbox' in annotations: + _flipv(annotations['bbox']) + + return img, annotations + + +def resolve_fill_color(fill_color, img_mean=IMAGENET_DEFAULT_MEAN): + if isinstance(fill_color, tuple): + assert len(fill_color) == 3 + fill_color = fill_color + else: + try: + int_color = int(fill_color) + fill_color = (int_color,) * 3 + except ValueError: + assert fill_color == 'mean' + fill_color = tuple([int(round(255 * x)) for x in img_mean]) + return fill_color + + +class Compose: + + def __init__(self, transforms: list): + self.transforms = transforms + + def __call__(self, img, annotations: dict): + for t in self.transforms: + img, annotations = t(img, annotations) + return img, annotations + + +def transforms_coco_eval( + img_size=224, + interpolation='bilinear', + use_prefetcher=False, + fill_color='mean', + 
mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD): + + fill_color = resolve_fill_color(fill_color, mean) + + image_tfl = [ + ResizePad( + target_size=img_size, interpolation=interpolation, fill_color=fill_color), + ImageToNumpy(), + ] + + assert use_prefetcher, "Only supporting prefetcher usage right now" + + image_tf = Compose(image_tfl) + return image_tf + + +def transforms_coco_train( + img_size=224, + interpolation='random', + use_prefetcher=False, + fill_color='mean', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD): + + fill_color = resolve_fill_color(fill_color, mean) + + image_tfl = [ + RandomFlip(horizontal=True, prob=0.5), + RandomResizePad( + target_size=img_size, interpolation=interpolation, fill_color=fill_color), + ImageToNumpy(), + ] + + assert use_prefetcher, "Only supporting prefetcher usage right now" + + image_tf = Compose(image_tfl) + return image_tf diff --git a/efficientdet/effdet/data/transforms_albumentation.py b/efficientdet/effdet/data/transforms_albumentation.py new file mode 100755 index 0000000000000000000000000000000000000000..446bf27a49d4a7b87ef96937da1e0951342c6c76 --- /dev/null +++ b/efficientdet/effdet/data/transforms_albumentation.py @@ -0,0 +1,23 @@ +import albumentations as A + +from albumentations.augmentations.transforms import ( + RandomBrightness, Downscale, RandomFog, RandomRain, RandomSnow) + +from albumentations.augmentations.blur.transforms import Blur + +def get_transform(): + transforms = A.Compose([ + #HorizontalFlip(p=0.5), + #VerticalFlip(p=0.5), + #RandomSizedBBoxSafeCrop(700, 700, erosion_rate=0.0, interpolation=1, always_apply=False, p=0.5), + Blur(blur_limit=7, always_apply=False, p=0.5), + RandomBrightness(limit=0.2, always_apply=False, p=0.5), + #Downscale(scale_min=0.5, scale_max=0.9, interpolation=0, always_apply=False, p=0.5), + #PadIfNeeded(min_height=1024, min_width=1024, pad_height_divisor=None, pad_width_divisor=None, border_mode=4, value=None, mask_value=None, always_apply=False, p=1.0), + #RandomFog(fog_coef_lower=0.3, fog_coef_upper=1, alpha_coef=0.08, always_apply=False, p=0.2), + #RandomRain(slant_lower=-10, slant_upper=10, drop_length=20, drop_width=1, drop_color=(200, 200, 200), p=0.2), + #RandomSnow(snow_point_lower=0.1, snow_point_upper=0.3, brightness_coeff=2.5, always_apply=False, p=0.2) + ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_classes']) + ) + return transforms + \ No newline at end of file diff --git a/efficientdet/effdet/distributed.py b/efficientdet/effdet/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..63f024eb5cb9f5d209b39158d762d0da714d7a0a --- /dev/null +++ b/efficientdet/effdet/distributed.py @@ -0,0 +1,308 @@ +""" PyTorch distributed helpers + +Some of this lifted from Detectron2 with other fns added by myself. Some of the Detectron2 fns +were intended for use with GLOO PG. I am using NCCL here with default PG so not everything will work +as is -RW +""" +import functools +import logging +import numpy as np +import pickle +import torch +import torch.distributed as dist + +_LOCAL_PROCESS_GROUP = None +""" +A torch process group which only includes processes that on the same machine as the current process. +This variable is set when processes are spawned by `launch()` in "engine/launch.py". 
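+When no such launcher sets it, this stays None; get_local_rank() below asserts on it once
+a process group is initialized, while get_rank() and get_world_size() do not depend on it.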
+""" + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + assert _LOCAL_PROCESS_GROUP is not None + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def is_main_process() -> bool: + return get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024 ** 3: + logger = logging.getLogger(__name__) + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( + get_rank(), len(buffer) / (1024 ** 3), device + ) + ) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. 
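+            With the default NCCL backend this falls back to a cached gloo group, so the
+            pickled payload is staged through CPU byte tensors.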
+ Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def gather(data, dst=0, group=None): + """ + Run gather on arbitrary picklable data (not necessarily tensors). + Args: + data: any picklable object + dst (int): destination rank + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + Returns: + list[data]: on dst, a list of data gathered from each rank. Otherwise, + an empty list. + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group=group) == 1: + return [data] + rank = dist.get_rank(group=group) + + tensor = _serialize_to_tensor(data, group) + size_list, tensor = _pad_to_largest_tensor(tensor, group) + + # receiving Tensor from all ranks + if rank == dst: + max_size = max(size_list) + tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list] + dist.gather(tensor, tensor_list, dst=dst, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + return data_list + else: + dist.gather(tensor, [], dst=dst, group=group) + return [] + + +def shared_random_seed(): + """ + Returns: + int: a random number that is the same across all workers. + If workers need a shared RNG, they can use this shared seed to + create one. + All workers must call this function, otherwise it will deadlock. + """ + ints = np.random.randint(2 ** 31) + all_ints = all_gather(ints) + return all_ints[0] + + +def reduce_dict(input_dict, average=True): + """ + Reduce the values in the dictionary from all processes so that process with rank + 0 has the reduced results. + Args: + input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. + average (bool): whether to do average or sum + Returns: + a dict with the same keys as input_dict, after reduction. 
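+
+    A minimal sketch (assuming loss_cls and loss_box are scalar CUDA tensors):
+
+        reduced = reduce_dict({'loss_cls': loss_cls, 'loss_box': loss_box})
+        # on rank 0, reduced now holds the values averaged over all workers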
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +def all_gather_container(container, group=None, cat_dim=0): + group = group or dist.group.WORLD + world_size = dist.get_world_size(group) + + def _do_gather(tensor): + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + dist.all_gather(tensor_list, tensor, group=group) + return torch.cat(tensor_list, dim=cat_dim) + + if isinstance(container, dict): + gathered = dict() + for k, v in container.items(): + v = _do_gather(v) + gathered[k] = v + return gathered + elif isinstance(container, (list, tuple)): + gathered = [_do_gather(v) for v in container] + if isinstance(container, tuple): + gathered = tuple(gathered) + return gathered + else: + # if not a dict, list, tuple, expect a singular tensor + assert isinstance(container, torch.Tensor) + return _do_gather(container) + + +def gather_container(container, dst, group=None, cat_dim=0): + group = group or dist.group.WORLD + world_size = dist.get_world_size(group) + this_rank = dist.get_rank(group) + + def _do_gather(tensor): + if this_rank == dst: + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + else: + tensor_list = None + dist.gather(tensor, tensor_list, dst=dst, group=group) + return torch.cat(tensor_list, dim=cat_dim) + + if isinstance(container, dict): + gathered = dict() + for k, v in container.items(): + v = _do_gather(v) + gathered[k] = v + return gathered + elif isinstance(container, (list, tuple)): + gathered = [_do_gather(v) for v in container] + if isinstance(container, tuple): + gathered = tuple(gathered) + return gathered + else: + # if not a dict, list, tuple, expect a singular tensor + assert isinstance(container, torch.Tensor) + return _do_gather(container) diff --git a/efficientdet/effdet/efficientdet.py b/efficientdet/effdet/efficientdet.py new file mode 100644 index 0000000000000000000000000000000000000000..47dcdc5359b0038857aaf5061756c130e16cf57d --- /dev/null +++ b/efficientdet/effdet/efficientdet.py @@ -0,0 +1,557 @@ +""" PyTorch EfficientDet model + +Based on official Tensorflow version at: https://github.com/google/automl/tree/master/efficientdet +Paper: https://arxiv.org/abs/1911.09070 + +Hacked together by Ross Wightman +""" +import torch +import torch.nn as nn +import logging +import math +from collections import OrderedDict +from typing import List, Callable +from functools import partial + + +from timm import create_model +from timm.models.layers import create_conv2d, drop_path, create_pool2d, Swish, get_act_layer +from .config import get_fpn_config, set_config_writeable, set_config_readonly + +_DEBUG = False + +_ACT_LAYER = Swish + + +class SequentialList(nn.Sequential): + """ This module exists to work around torchscript typing issues list -> list""" + def __init__(self, *args): + super(SequentialList, self).__init__(*args) + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + for module in self: + x = module(x) + return x + + +class ConvBnAct2d(nn.Module): + def __init__(self, in_channels, 
out_channels, kernel_size, stride=1, dilation=1, padding='', bias=False, + norm_layer=nn.BatchNorm2d, act_layer=_ACT_LAYER): + super(ConvBnAct2d, self).__init__() + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=stride, dilation=dilation, padding=padding, bias=bias) + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class SeparableConv2d(nn.Module): + """ Separable Conv + """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, + channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, act_layer=_ACT_LAYER): + super(SeparableConv2d, self).__init__() + self.conv_dw = create_conv2d( + in_channels, int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) + + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class ResampleFeatureMap(nn.Sequential): + + def __init__(self, in_channels, out_channels, reduction_ratio=1., pad_type='', pooling_type='max', + norm_layer=nn.BatchNorm2d, apply_bn=False, conv_after_downsample=False, redundant_bias=False): + super(ResampleFeatureMap, self).__init__() + pooling_type = pooling_type or 'max' + self.in_channels = in_channels + self.out_channels = out_channels + self.reduction_ratio = reduction_ratio + self.conv_after_downsample = conv_after_downsample + + conv = None + if in_channels != out_channels: + conv = ConvBnAct2d( + in_channels, out_channels, kernel_size=1, padding=pad_type, + norm_layer=norm_layer if apply_bn else None, + bias=not apply_bn or redundant_bias, act_layer=None) + + if reduction_ratio > 1: + stride_size = int(reduction_ratio) + if conv is not None and not self.conv_after_downsample: + self.add_module('conv', conv) + self.add_module( + 'downsample', + create_pool2d( + pooling_type, kernel_size=stride_size + 1, stride=stride_size, padding=pad_type)) + if conv is not None and self.conv_after_downsample: + self.add_module('conv', conv) + else: + if conv is not None: + self.add_module('conv', conv) + if reduction_ratio < 1: + scale = int(1 // reduction_ratio) + self.add_module('upsample', nn.UpsamplingNearest2d(scale_factor=scale)) + + # def forward(self, x): + # # here for debugging only + # assert x.shape[1] == self.in_channels + # if self.reduction_ratio > 1: + # if hasattr(self, 'conv') and not self.conv_after_downsample: + # x = self.conv(x) + # x = self.downsample(x) + # if hasattr(self, 'conv') and self.conv_after_downsample: + # x = self.conv(x) + # else: + # if hasattr(self, 'conv'): + # x = self.conv(x) + # if self.reduction_ratio < 1: + # x = self.upsample(x) + # return x + + +class FpnCombine(nn.Module): + def __init__(self, feature_info, fpn_config, fpn_channels, inputs_offsets, target_reduction, pad_type='', + pooling_type='max', norm_layer=nn.BatchNorm2d, apply_bn_for_resampling=False, + conv_after_downsample=False, redundant_bias=False, 
weight_method='attn'): + super(FpnCombine, self).__init__() + self.inputs_offsets = inputs_offsets + self.weight_method = weight_method + + self.resample = nn.ModuleDict() + for idx, offset in enumerate(inputs_offsets): + in_channels = fpn_channels + if offset < len(feature_info): + in_channels = feature_info[offset]['num_chs'] + input_reduction = feature_info[offset]['reduction'] + else: + node_idx = offset - len(feature_info) + input_reduction = fpn_config.nodes[node_idx]['reduction'] + reduction_ratio = target_reduction / input_reduction + self.resample[str(offset)] = ResampleFeatureMap( + in_channels, fpn_channels, reduction_ratio=reduction_ratio, pad_type=pad_type, + pooling_type=pooling_type, norm_layer=norm_layer, apply_bn=apply_bn_for_resampling, + conv_after_downsample=conv_after_downsample, redundant_bias=redundant_bias) + + if weight_method == 'attn' or weight_method == 'fastattn': + self.edge_weights = nn.Parameter(torch.ones(len(inputs_offsets)), requires_grad=True) # WSM + else: + self.edge_weights = None + + def forward(self, x: List[torch.Tensor]): + dtype = x[0].dtype + nodes = [] + for offset, resample in zip(self.inputs_offsets, self.resample.values()): + input_node = x[offset] + input_node = resample(input_node) + nodes.append(input_node) + + if self.weight_method == 'attn': + normalized_weights = torch.softmax(self.edge_weights.to(dtype=dtype), dim=0) + out = torch.stack(nodes, dim=-1) * normalized_weights + elif self.weight_method == 'fastattn': + edge_weights = nn.functional.relu(self.edge_weights.to(dtype=dtype)) + weights_sum = torch.sum(edge_weights) + out = torch.stack( + [(nodes[i] * edge_weights[i]) / (weights_sum + 0.0001) for i in range(len(nodes))], dim=-1) + elif self.weight_method == 'sum': + out = torch.stack(nodes, dim=-1) + else: + raise ValueError('unknown weight_method {}'.format(self.weight_method)) + out = torch.sum(out, dim=-1) + return out + + +class Fnode(nn.Module): + """ A simple wrapper used in place of nn.Sequential for torchscript typing + Handles input type List[Tensor] -> output type Tensor + """ + def __init__(self, combine: nn.Module, after_combine: nn.Module): + super(Fnode, self).__init__() + self.combine = combine + self.after_combine = after_combine + + def forward(self, x: List[torch.Tensor]) -> torch.Tensor: + return self.after_combine(self.combine(x)) + + +class BiFpnLayer(nn.Module): + def __init__(self, feature_info, fpn_config, fpn_channels, num_levels=5, pad_type='', + pooling_type='max', norm_layer=nn.BatchNorm2d, act_layer=_ACT_LAYER, + apply_bn_for_resampling=False, conv_after_downsample=True, conv_bn_relu_pattern=False, + separable_conv=True, redundant_bias=False): + super(BiFpnLayer, self).__init__() + self.num_levels = num_levels + self.conv_bn_relu_pattern = False + + self.feature_info = [] + self.fnode = nn.ModuleList() + for i, fnode_cfg in enumerate(fpn_config.nodes): + logging.debug('fnode {} : {}'.format(i, fnode_cfg)) + reduction = fnode_cfg['reduction'] + combine = FpnCombine( + feature_info, fpn_config, fpn_channels, tuple(fnode_cfg['inputs_offsets']), + target_reduction=reduction, pad_type=pad_type, pooling_type=pooling_type, norm_layer=norm_layer, + apply_bn_for_resampling=apply_bn_for_resampling, conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias, weight_method=fnode_cfg['weight_method']) + + after_combine = nn.Sequential() + conv_kwargs = dict( + in_channels=fpn_channels, out_channels=fpn_channels, kernel_size=3, padding=pad_type, + bias=False, norm_layer=norm_layer, 
act_layer=act_layer) + if not conv_bn_relu_pattern: + conv_kwargs['bias'] = redundant_bias + conv_kwargs['act_layer'] = None + after_combine.add_module('act', act_layer(inplace=True)) + after_combine.add_module( + 'conv', SeparableConv2d(**conv_kwargs) if separable_conv else ConvBnAct2d(**conv_kwargs)) + + self.fnode.append(Fnode(combine=combine, after_combine=after_combine)) + self.feature_info.append(dict(num_chs=fpn_channels, reduction=reduction)) + + self.feature_info = self.feature_info[-num_levels::] + + def forward(self, x: List[torch.Tensor]): + for fn in self.fnode: + x.append(fn(x)) + return x[-self.num_levels::] + + +class BiFpn(nn.Module): + + def __init__(self, config, feature_info): + super(BiFpn, self).__init__() + self.num_levels = config.num_levels + norm_layer = config.norm_layer or nn.BatchNorm2d + if config.norm_kwargs: + norm_layer = partial(norm_layer, **config.norm_kwargs) + act_layer = get_act_layer(config.act_type) or _ACT_LAYER + fpn_config = config.fpn_config or get_fpn_config( + config.fpn_name, min_level=config.min_level, max_level=config.max_level) + + self.resample = nn.ModuleDict() + for level in range(config.num_levels): + if level < len(feature_info): + in_chs = feature_info[level]['num_chs'] + reduction = feature_info[level]['reduction'] + else: + # Adds a coarser level by downsampling the last feature map + reduction_ratio = 2 + self.resample[str(level)] = ResampleFeatureMap( + in_channels=in_chs, + out_channels=config.fpn_channels, + pad_type=config.pad_type, + pooling_type=config.pooling_type, + norm_layer=norm_layer, + reduction_ratio=reduction_ratio, + apply_bn=config.apply_bn_for_resampling, + conv_after_downsample=config.conv_after_downsample, + redundant_bias=config.redundant_bias, + ) + in_chs = config.fpn_channels + reduction = int(reduction * reduction_ratio) + feature_info.append(dict(num_chs=in_chs, reduction=reduction)) + + self.cell = SequentialList() + for rep in range(config.fpn_cell_repeats): + logging.debug('building cell {}'.format(rep)) + fpn_layer = BiFpnLayer( + feature_info=feature_info, + fpn_config=fpn_config, + fpn_channels=config.fpn_channels, + num_levels=config.num_levels, + pad_type=config.pad_type, + pooling_type=config.pooling_type, + norm_layer=norm_layer, + act_layer=act_layer, + separable_conv=config.separable_conv, + apply_bn_for_resampling=config.apply_bn_for_resampling, + conv_after_downsample=config.conv_after_downsample, + conv_bn_relu_pattern=config.conv_bn_relu_pattern, + redundant_bias=config.redundant_bias, + ) + self.cell.add_module(str(rep), fpn_layer) + feature_info = fpn_layer.feature_info + + def forward(self, x: List[torch.Tensor]): + for resample in self.resample.values(): + x.append(resample(x[-1])) + x = self.cell(x) + return x + + +class HeadNet(nn.Module): + + def __init__(self, config, num_outputs): + super(HeadNet, self).__init__() + self.num_levels = config.num_levels + self.bn_level_first = getattr(config, 'head_bn_level_first', False) + norm_layer = config.norm_layer or nn.BatchNorm2d + if config.norm_kwargs: + norm_layer = partial(norm_layer, **config.norm_kwargs) + act_layer = get_act_layer(config.act_type) or _ACT_LAYER + + # Build convolution repeats + conv_fn = SeparableConv2d if config.separable_conv else ConvBnAct2d + conv_kwargs = dict( + in_channels=config.fpn_channels, out_channels=config.fpn_channels, kernel_size=3, + padding=config.pad_type, bias=config.redundant_bias, act_layer=None, norm_layer=None) + self.conv_rep = nn.ModuleList([conv_fn(**conv_kwargs) for _ in 
range(config.box_class_repeats)]) + + # Build batchnorm repeats. There is a unique batchnorm per feature level for each repeat. + # This can be organized with repeats first or feature levels first in module lists, the original models + # and weights were setup with repeats first, levels first is required for efficient torchscript usage. + self.bn_rep = nn.ModuleList() + if self.bn_level_first: + for _ in range(self.num_levels): + self.bn_rep.append(nn.ModuleList([ + norm_layer(config.fpn_channels) for _ in range(config.box_class_repeats)])) + else: + for _ in range(config.box_class_repeats): + self.bn_rep.append(nn.ModuleList([ + nn.Sequential(OrderedDict([('bn', norm_layer(config.fpn_channels))])) + for _ in range(self.num_levels)])) + + self.act = act_layer(inplace=True) + + # Prediction (output) layer. Has bias with special init reqs, see init fn. + num_anchors = len(config.aspect_ratios) * config.num_scales + predict_kwargs = dict( + in_channels=config.fpn_channels, out_channels=num_outputs * num_anchors, kernel_size=3, + padding=config.pad_type, bias=True, norm_layer=None, act_layer=None) + self.predict = conv_fn(**predict_kwargs) + + @torch.jit.ignore() + def toggle_bn_level_first(self): + """ Toggle the batchnorm layers between feature level first vs repeat first access pattern + Limitations in torchscript require feature levels to be iterated over first. + + This function can be used to allow loading weights in the original order, and then toggle before + jit scripting the model. + """ + with torch.no_grad(): + new_bn_rep = nn.ModuleList() + for i in range(len(self.bn_rep[0])): + bn_first = nn.ModuleList() + for r in self.bn_rep.children(): + m = r[i] + # NOTE original rep first model def has extra Sequential container with 'bn', this was + # flattened in the level first definition. + bn_first.append(m[0] if isinstance(m, nn.Sequential) else nn.Sequential(OrderedDict([('bn', m)]))) + new_bn_rep.append(bn_first) + self.bn_level_first = not self.bn_level_first + self.bn_rep = new_bn_rep + + @torch.jit.ignore() + def _forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + outputs = [] + for level in range(self.num_levels): + x_level = x[level] + for conv, bn in zip(self.conv_rep, self.bn_rep): + x_level = conv(x_level) + x_level = bn[level](x_level) # this is not allowed in torchscript + x_level = self.act(x_level) + outputs.append(self.predict(x_level)) + return outputs + + def _forward_level_first(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + outputs = [] + for level, bn_rep in enumerate(self.bn_rep): # iterating over first bn dim first makes TS happy + x_level = x[level] + for conv, bn in zip(self.conv_rep, bn_rep): + x_level = conv(x_level) + x_level = bn(x_level) + x_level = self.act(x_level) + outputs.append(self.predict(x_level)) + return outputs + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + if self.bn_level_first: + return self._forward_level_first(x) + else: + return self._forward(x) + + +def _init_weight(m, n='', ): + """ Weight initialization as per Tensorflow official implementations. 
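+
+    Summary of the scheme implemented below: separable convs in the box/class heads use
+    fan-in variance scaling and plain convs there use a normal(0, 0.01) init; all other
+    (non-head) convs use glorot/xavier uniform. The class-net prediction bias is filled
+    with -log((1 - 0.01) / 0.01) so the initial predicted foreground probability is
+    roughly 0.01 (the RetinaNet prior).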
+ """ + + def _fan_in_out(w, groups=1): + dimensions = w.dim() + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") + num_input_fmaps = w.size(1) + num_output_fmaps = w.size(0) + receptive_field_size = 1 + if w.dim() > 2: + receptive_field_size = w[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + fan_out //= groups + return fan_in, fan_out + + def _glorot_uniform(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., (fan_in + fan_out) / 2.) # fan avg + limit = math.sqrt(3.0 * gain) + w.data.uniform_(-limit, limit) + + def _variance_scaling(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., fan_in) # fan in + # gain /= max(1., (fan_in + fan_out) / 2.) # fan + + # should it be normal or trunc normal? using normal for now since no good trunc in PT + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + # std = math.sqrt(gain) / .87962566103423978 + # w.data.trunc_normal(std=std) + std = math.sqrt(gain) + w.data.normal_(std=std) + + if isinstance(m, SeparableConv2d): + if 'box_net' in n or 'class_net' in n: + _variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups) + _variance_scaling(m.conv_pw.weight) + if m.conv_pw.bias is not None: + if 'class_net.predict' in n: + m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv_pw.bias.data.zero_() + else: + _glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups) + _glorot_uniform(m.conv_pw.weight) + if m.conv_pw.bias is not None: + m.conv_pw.bias.data.zero_() + elif isinstance(m, ConvBnAct2d): + if 'box_net' in n or 'class_net' in n: + m.conv.weight.data.normal_(std=.01) + if m.conv.bias is not None: + if 'class_net.predict' in n: + m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv.bias.data.zero_() + else: + _glorot_uniform(m.conv.weight) + if m.conv.bias is not None: + m.conv.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + # looks like all bn init the same? 
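+        # reset batchnorm affine parameters to identity: weight=1, bias=0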
+ m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +def _init_weight_alt(m, n='', ): + """ Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition + NOTE: this will likely be removed after some experimentation + """ + if isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + if 'class_net.predict' in n: + m.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +def get_feature_info(backbone): + if isinstance(backbone.feature_info, Callable): + # old accessor for timm versions <= 0.1.30, efficientnet and mobilenetv3 and related nets only + feature_info = [dict(num_chs=f['num_chs'], reduction=f['reduction']) + for i, f in enumerate(backbone.feature_info())] + else: + # new feature info accessor, timm >= 0.2, all models supported + feature_info = backbone.feature_info.get_dicts(keys=['num_chs', 'reduction']) + return feature_info + + +class EfficientDet(nn.Module): + + def __init__(self, config, pretrained_backbone=True, alternate_init=False): + super(EfficientDet, self).__init__() + self.config = config + set_config_readonly(self.config) + self.backbone = create_model( + config.backbone_name, features_only=True, out_indices=(2, 3, 4), + pretrained=pretrained_backbone, **config.backbone_args) + feature_info = get_feature_info(self.backbone) + self.fpn = BiFpn(self.config, feature_info) + self.class_net = HeadNet(self.config, num_outputs=self.config.num_classes) + self.box_net = HeadNet(self.config, num_outputs=4) + + for n, m in self.named_modules(): + if 'backbone' not in n: + if alternate_init: + _init_weight_alt(m, n) + else: + _init_weight(m, n) + + @torch.jit.ignore() + def reset_head(self, num_classes=None, aspect_ratios=None, num_scales=None, alternate_init=False): + reset_class_head = False + reset_box_head = False + set_config_writeable(self.config) + if num_classes is not None: + reset_class_head = True + self.config.num_classes = num_classes + if aspect_ratios is not None: + reset_box_head = True + self.config.aspect_ratios = aspect_ratios + if num_scales is not None: + reset_box_head = True + self.config.num_scales = num_scales + set_config_readonly(self.config) + + if reset_class_head: + self.class_net = HeadNet(self.config, num_outputs=self.config.num_classes) + for n, m in self.class_net.named_modules(prefix='class_net'): + if alternate_init: + _init_weight_alt(m, n) + else: + _init_weight(m, n) + + if reset_box_head: + self.box_net = HeadNet(self.config, num_outputs=4) + for n, m in self.box_net.named_modules(prefix='box_net'): + if alternate_init: + _init_weight_alt(m, n) + else: + _init_weight(m, n) + + @torch.jit.ignore() + def toggle_head_bn_level_first(self): + """ Toggle the head batchnorm layers between being access with feature_level first vs repeat + """ + self.class_net.toggle_bn_level_first() + self.box_net.toggle_bn_level_first() + + def forward(self, x): + x = self.backbone(x) + x = self.fpn(x) + x_class = self.class_net(x) + x_box = self.box_net(x) + return x_class, x_box diff --git a/efficientdet/effdet/evaluation/README.md b/efficientdet/effdet/evaluation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3546caa9900f652968bffb7bb8593a1cdd824228 --- /dev/null +++ b/efficientdet/effdet/evaluation/README.md @@ -0,0 +1,7 @@ +# Tensorflow Models 
Evaluation
+
+The code in this folder has been extracted and adapted from evaluation/evaluator code at https://github.com/tensorflow/models/tree/master/research/object_detection/utils
+
+Original code is licensed Apache 2.0, Copyright Google Inc.
+https://github.com/tensorflow/models/blob/master/LICENSE
+ 
\ No newline at end of file
diff --git a/efficientdet/effdet/evaluation/__init__.py b/efficientdet/effdet/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/efficientdet/effdet/evaluation/detection_evaluator.py b/efficientdet/effdet/evaluation/detection_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..385204cd088544592d4291de9c1436f620335162
--- /dev/null
+++ b/efficientdet/effdet/evaluation/detection_evaluator.py
@@ -0,0 +1,590 @@
+from abc import ABCMeta
+from abc import abstractmethod
+#import collections
+import logging
+import unicodedata
+import numpy as np
+
+from .fields import InputDataFields, DetectionResultFields
+from .object_detection_evaluation import ObjectDetectionEvaluation
+
+
+def create_category_index(categories):
+    """Creates dictionary of COCO compatible categories keyed by category id.
+    Args:
+        categories: a list of dicts, each of which has the following keys:
+            'id': (required) an integer id uniquely identifying this category.
+            'name': (required) string representing category name e.g., 'cat', 'dog', 'pizza'.
+    Returns:
+        category_index: a dict containing the same entries as categories, but keyed
+            by the 'id' field of each category.
+    """
+    category_index = {}
+    for cat in categories:
+        category_index[cat['id']] = cat
+    return category_index
+
+
+class DetectionEvaluator(metaclass=ABCMeta):
+    """Interface for object detection evaluation classes.
+    Example usage of the Evaluator:
+    ------------------------------
+    evaluator = DetectionEvaluator(categories)
+    # Detections and groundtruth for image 1.
+    evaluator.add_single_ground_truth_image_info(...)
+    evaluator.add_single_detected_image_info(...)
+    # Detections and groundtruth for image 2.
+    evaluator.add_single_ground_truth_image_info(...)
+    evaluator.add_single_detected_image_info(...)
+    metrics_dict = evaluator.evaluate()
+    """
+
+    def __init__(self, categories):
+        """Constructor.
+        Args:
+            categories: A list of dicts, each of which has the following keys -
+                'id': (required) an integer id uniquely identifying this category.
+                'name': (required) string representing category name e.g., 'cat', 'dog'.
+        """
+        self._categories = categories
+
+    def observe_result_dict_for_single_example(self, eval_dict):
+        """Observes an evaluation result dict for a single example.
+        When executing eagerly, once all observations have been observed by this
+        method you can use `.evaluate()` to get the final metrics.
+        When using `tf.estimator.Estimator` for evaluation this function is used by
+        `get_estimator_eval_metric_ops()` to construct the metric update op.
+        Args:
+            eval_dict: A dictionary that holds tensors for evaluating an object
+                detection model, returned from
+                eval_util.result_dict_for_single_example().
+        Returns:
+            None when executing eagerly, or an update_op that can be used to update
+            the eval metrics in `tf.estimator.EstimatorSpec`.
+        """
+        raise NotImplementedError('Not implemented for this evaluator!')
+
+    @abstractmethod
+    def add_single_ground_truth_image_info(self, image_id, gt_dict):
+        """Adds groundtruth for a single image to be used for evaluation.
+ Args: + image_id: A unique string/integer identifier for the image. + gt_dict: A dictionary of groundtruth numpy arrays required for evaluations. + """ + pass + + @abstractmethod + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary of detection numpy arrays required for evaluation. + """ + pass + + @abstractmethod + def evaluate(self): + """Evaluates detections and returns a dictionary of metrics.""" + pass + + @abstractmethod + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + pass + + +class ObjectDetectionEvaluator(DetectionEvaluator): + """A class to evaluation detections.""" + + def __init__(self, + categories, + matching_iou_threshold=0.5, + recall_lower_bound=0.0, + recall_upper_bound=1.0, + evaluate_corlocs=False, + evaluate_precision_recall=False, + metric_prefix=None, + use_weighted_mean_ap=False, + evaluate_masks=False, + group_of_weight=0.0): + """Constructor. + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + matching_iou_threshold: IOU threshold to use for matching groundtruth boxes to detection boxes. + recall_lower_bound: lower bound of recall operating area. + recall_upper_bound: upper bound of recall operating area. + evaluate_corlocs: (optional) boolean which determines if corloc scores are to be returned or not. + evaluate_precision_recall: (optional) boolean which determines if + precision and recall values are to be returned or not. + metric_prefix: (optional) string prefix for metric name; if None, no prefix is used. + use_weighted_mean_ap: (optional) boolean which determines if the mean + average precision is computed directly from the scores and tp_fp_labels of all classes. + evaluate_masks: If False, evaluation will be performed based on boxes. If + True, mask evaluation will be performed instead. + group_of_weight: Weight of group-of boxes.If set to 0, detections of the + correct class within a group-of box are ignored. If weight is > 0, then + if at least one detection falls within a group-of box with + matching_iou_threshold, weight group_of_weight is added to true + positives. Consequently, if no detection falls within a group-of box, + weight group_of_weight is added to false negatives. + Raises: + ValueError: If the category ids are not 1-indexed. 
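+
+        Example (hypothetical categories, shown only as a usage sketch):
+            categories = [{'id': 1, 'name': 'plastic'}, {'id': 2, 'name': 'glass'}]
+            evaluator = ObjectDetectionEvaluator(categories, matching_iou_threshold=0.5)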
+ """ + super(ObjectDetectionEvaluator, self).__init__(categories) + self._num_classes = max([cat['id'] for cat in categories]) + if min(cat['id'] for cat in categories) < 1: + raise ValueError('Classes should be 1-indexed.') + self._matching_iou_threshold = matching_iou_threshold + self._recall_lower_bound = recall_lower_bound + self._recall_upper_bound = recall_upper_bound + self._use_weighted_mean_ap = use_weighted_mean_ap + self._label_id_offset = 1 + self._evaluate_masks = evaluate_masks + self._group_of_weight = group_of_weight + self._evaluation = ObjectDetectionEvaluation( + num_gt_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + recall_lower_bound=self._recall_lower_bound, + recall_upper_bound=self._recall_upper_bound, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + group_of_weight=self._group_of_weight) + self._image_ids = set([]) + self._evaluate_corlocs = evaluate_corlocs + self._evaluate_precision_recall = evaluate_precision_recall + self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' + self._build_metric_names() + + def _build_metric_names(self): + """Builds a list with metric names.""" + if self._recall_lower_bound > 0.0 or self._recall_upper_bound < 1.0: + self._metric_names = [ + self._metric_prefix + 'Precision/mAP@{}IOU@[{:.1f},{:.1f}]Recall'.format( + self._matching_iou_threshold, self._recall_lower_bound, self._recall_upper_bound) + ] + else: + self._metric_names = [ + self._metric_prefix + 'Precision/mAP@{}IOU'.format(self._matching_iou_threshold) + ] + if self._evaluate_corlocs: + self._metric_names.append( + self._metric_prefix + 'Precision/meanCorLoc@{}IOU'.format(self._matching_iou_threshold)) + + category_index = create_category_index(self._categories) + for idx in range(self._num_classes): + if idx + self._label_id_offset in category_index: + category_name = category_index[idx + self._label_id_offset]['name'] + category_name = unicodedata.normalize('NFKD', category_name) + self._metric_names.append( + self._metric_prefix + 'PerformanceByCategory/AP@{}IOU/{}'.format( + self._matching_iou_threshold, category_name)) + if self._evaluate_corlocs: + self._metric_names.append( + self._metric_prefix + 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( + self._matching_iou_threshold, category_name)) + + def add_single_ground_truth_image_info(self, image_id, gt_dict): + """Adds groundtruth for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + gt_dict: A dictionary containing - + InputDataFields.gt_boxes: float32 numpy array + of shape [num_boxes, 4] containing `num_boxes` groundtruth boxes of + the format [ymin, xmin, ymax, xmax] in absolute image coordinates. + InputDataFields.gt_classes: integer numpy array + of shape [num_boxes] containing 1-indexed groundtruth classes for the boxes. + InputDataFields.gt_difficult: Optional length M numpy boolean array + denoting whether a ground truth box is a difficult instance or not. + This field is optional to support the case that no boxes are difficult. + InputDataFields.gt_instance_masks: Optional numpy array of shape + [num_boxes, height, width] with values in {0, 1}. + Raises: + ValueError: On adding groundtruth for an image more than once. Will also + raise error if instance masks are not in groundtruth dictionary. 
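+
+        Example (hypothetical values; the field name constants come from fields.py):
+            gt_dict = {
+                InputDataFields.gt_boxes: np.array([[10., 10., 120., 200.]], dtype=np.float32),
+                InputDataFields.gt_classes: np.array([1]),
+            }
+            evaluator.add_single_ground_truth_image_info('image_0', gt_dict)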
+ """ + if image_id in self._image_ids: + return + + gt_classes = gt_dict[InputDataFields.gt_classes] - self._label_id_offset + # If the key is not present in the gt_dict or the array is empty + # (unless there are no annotations for the groundtruth on this image) + # use values from the dictionary or insert None otherwise. + if (InputDataFields.gt_difficult in gt_dict and + (gt_dict[InputDataFields.gt_difficult].size or not gt_classes.size)): + gt_difficult = gt_dict[InputDataFields.gt_difficult] + else: + gt_difficult = None + # FIXME disable difficult flag warning, will support flag eventually + # if not len(self._image_ids) % 1000: + # logging.warning('image %s does not have groundtruth difficult flag specified', image_id) + gt_masks = None + if self._evaluate_masks: + if InputDataFields.gt_instance_masks not in gt_dict: + raise ValueError('Instance masks not in groundtruth dictionary.') + gt_masks = gt_dict[InputDataFields.gt_instance_masks] + self._evaluation.add_single_ground_truth_image_info( + image_key=image_id, + gt_boxes=gt_dict[InputDataFields.gt_boxes], + gt_class_labels=gt_classes, + gt_is_difficult_list=gt_difficult, + gt_masks=gt_masks) + self._image_ids.update([image_id]) + + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + DetectionResultFields.detection_boxes: float32 numpy + array of shape [num_boxes, 4] containing `num_boxes` detection boxes + of the format [ymin, xmin, ymax, xmax] in absolute image coordinates. + DetectionResultFields.detection_scores: float32 numpy + array of shape [num_boxes] containing detection scores for the boxes. + DetectionResultFields.detection_classes: integer numpy + array of shape [num_boxes] containing 1-indexed detection classes for the boxes. + DetectionResultFields.detection_masks: uint8 numpy array + of shape [num_boxes, height, width] containing `num_boxes` masks of + values ranging between 0 and 1. + Raises: + ValueError: If detection masks are not in detections dictionary. + """ + detection_classes = detections_dict[DetectionResultFields.detection_classes] - self._label_id_offset + detection_masks = None + if self._evaluate_masks: + if DetectionResultFields.detection_masks not in detections_dict: + raise ValueError('Detection masks not in detections dictionary.') + detection_masks = detections_dict[DetectionResultFields.detection_masks] + self._evaluation.add_single_detected_image_info( + image_key=image_id, + detected_boxes=detections_dict[DetectionResultFields.detection_boxes], + detected_scores=detections_dict[DetectionResultFields.detection_scores], + detected_class_labels=detection_classes, + detected_masks=detection_masks) + + def evaluate(self): + """Compute evaluation result. + Returns: + A dictionary of metrics with the following fields - + 1. summary_metrics: + '_Precision/mAP@IOU': mean + average precision at the specified IOU threshold. + 2. per_category_ap: category specific results with keys of the form + '_PerformanceByCategory/ + mAP@IOU/category'. 
+        """
+        metrics = self._evaluation.evaluate()
+        pascal_metrics = {self._metric_names[0]: metrics['mean_ap']}
+        if self._evaluate_corlocs:
+            pascal_metrics[self._metric_names[1]] = metrics['mean_corloc']
+        category_index = create_category_index(self._categories)
+        for idx in range(metrics['per_class_ap'].size):
+            if idx + self._label_id_offset in category_index:
+                category_name = category_index[idx + self._label_id_offset]['name']
+                category_name = unicodedata.normalize('NFKD', category_name)
+                display_name = self._metric_prefix + 'PerformanceByCategory/AP@{}IOU/{}'.format(
+                    self._matching_iou_threshold, category_name)
+                pascal_metrics[display_name] = metrics['per_class_ap'][idx]
+
+                # Optionally add precision and recall values
+                if self._evaluate_precision_recall:
+                    display_name = self._metric_prefix + 'PerformanceByCategory/Precision@{}IOU/{}'.format(
+                        self._matching_iou_threshold, category_name)
+                    pascal_metrics[display_name] = metrics['per_class_precision'][idx]
+                    display_name = self._metric_prefix + 'PerformanceByCategory/Recall@{}IOU/{}'.format(
+                        self._matching_iou_threshold, category_name)
+                    pascal_metrics[display_name] = metrics['per_class_recall'][idx]
+
+                # Optionally add CorLoc metrics
+                if self._evaluate_corlocs:
+                    display_name = self._metric_prefix + 'PerformanceByCategory/CorLoc@{}IOU/{}'.format(
+                        self._matching_iou_threshold, category_name)
+                    pascal_metrics[display_name] = metrics['per_class_corloc'][idx]
+
+        return pascal_metrics
+
+    def clear(self):
+        """Clears the state to prepare for a fresh evaluation."""
+        self._evaluation = ObjectDetectionEvaluation(
+            num_gt_classes=self._num_classes,
+            matching_iou_threshold=self._matching_iou_threshold,
+            use_weighted_mean_ap=self._use_weighted_mean_ap,
+            label_id_offset=self._label_id_offset)
+        self._image_ids.clear()
+
+
+class PascalDetectionEvaluator(ObjectDetectionEvaluator):
+    """A class to evaluate detections using PASCAL metrics."""
+
+    def __init__(self, categories, matching_iou_threshold=0.5):
+        super(PascalDetectionEvaluator, self).__init__(
+            categories,
+            matching_iou_threshold=matching_iou_threshold,
+            evaluate_corlocs=False,
+            metric_prefix='PascalBoxes',
+            use_weighted_mean_ap=False)
+
+
+class WeightedPascalDetectionEvaluator(ObjectDetectionEvaluator):
+    """A class to evaluate detections using weighted PASCAL metrics.
+    Weighted PASCAL metrics computes the mean average precision as the average
+    precision given the scores and tp_fp_labels of all classes. In comparison,
+    PASCAL metrics computes the mean average precision as the mean of the
+    per-class average precisions.
+    This definition is very similar to the mean of the per-class average
+    precisions weighted by class frequency. However, they are typically not the
+    same as the average precision is not a linear function of the scores and
+    tp_fp_labels.
+ """ + + def __init__(self, categories, matching_iou_threshold=0.5): + super(WeightedPascalDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold=matching_iou_threshold, + evaluate_corlocs=False, + metric_prefix='WeightedPascalBoxes', + use_weighted_mean_ap=True) + + +class PrecisionAtRecallDetectionEvaluator(ObjectDetectionEvaluator): + """A class to evaluation detections using precision@recall metrics.""" + + def __init__(self, + categories, + matching_iou_threshold=0.5, + recall_lower_bound=0., + recall_upper_bound=1.0): + super(PrecisionAtRecallDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold=matching_iou_threshold, + recall_lower_bound=recall_lower_bound, + recall_upper_bound=recall_upper_bound, + evaluate_corlocs=False, + metric_prefix='PrecisionAtRecallBoxes', + use_weighted_mean_ap=False) + + +class OpenImagesDetectionEvaluator(ObjectDetectionEvaluator): + """A class to evaluation detections using Open Images V2 metrics. + Open Images V2 introduce group_of type of bounding boxes and this metric + handles those boxes appropriately. + """ + + def __init__(self, + categories, + matching_iou_threshold=0.5, + evaluate_masks=False, + evaluate_corlocs=False, + metric_prefix='OpenImagesV5', + group_of_weight=0.0): + """Constructor. + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + matching_iou_threshold: IOU threshold to use for matching groundtruth + boxes to detection boxes. + evaluate_masks: if True, evaluator evaluates masks. + evaluate_corlocs: if True, additionally evaluates and returns CorLoc. + metric_prefix: Prefix name of the metric. + group_of_weight: Weight of the group-of bounding box. If set to 0 (default + for Open Images V2 detection protocol), detections of the correct class + within a group-of box are ignored. If weight is > 0, then if at least + one detection falls within a group-of box with matching_iou_threshold, + weight group_of_weight is added to true positives. Consequently, if no + detection falls within a group-of box, weight group_of_weight is added + to false negatives. + """ + + super(OpenImagesDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold, + evaluate_corlocs, + metric_prefix=metric_prefix, + group_of_weight=group_of_weight, + evaluate_masks=evaluate_masks) + + def add_single_ground_truth_image_info(self, image_id, gt_dict): + """Adds groundtruth for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + gt_dict: A dictionary containing - + InputDataFields.gt_boxes: float32 numpy array + of shape [num_boxes, 4] containing `num_boxes` groundtruth boxes of + the format [ymin, xmin, ymax, xmax] in absolute image coordinates. + InputDataFields.gt_classes: integer numpy array + of shape [num_boxes] containing 1-indexed groundtruth classes for the boxes. + InputDataFields.gt_group_of: Optional length M + numpy boolean array denoting whether a groundtruth box contains a group of instances. + Raises: + ValueError: On adding groundtruth for an image more than once. 
+ """ + if image_id in self._image_ids: + return + + gt_classes = (gt_dict[InputDataFields.gt_classes] - self._label_id_offset) + # If the key is not present in the gt_dict or the array is empty + # (unless there are no annotations for the groundtruth on this image) + # use values from the dictionary or insert None otherwise. + if (InputDataFields.gt_group_of in gt_dict and + (gt_dict[InputDataFields.gt_group_of].size or not gt_classes.size)): + gt_group_of = gt_dict[InputDataFields.gt_group_of] + else: + gt_group_of = None + # FIXME disable warning for now, will add group_of flag eventually + # if not len(self._image_ids) % 1000: + # logging.warning('image %s does not have groundtruth group_of flag specified', image_id) + if self._evaluate_masks: + gt_masks = gt_dict[InputDataFields.gt_instance_masks] + else: + gt_masks = None + + self._evaluation.add_single_ground_truth_image_info( + image_id, + gt_dict[InputDataFields.gt_boxes], + gt_classes, + gt_is_difficult_list=None, + gt_is_group_of_list=gt_group_of, + gt_masks=gt_masks) + self._image_ids.update([image_id]) + + +class OpenImagesChallengeEvaluator(OpenImagesDetectionEvaluator): + """A class implements Open Images Challenge metrics. + Both Detection and Instance Segmentation evaluation metrics are implemented. + Open Images Challenge Detection metric has two major changes in comparison + with Open Images V2 detection metric: + - a custom weight might be specified for detecting an object contained in a group-of box. + - verified image-level labels should be explicitly provided for evaluation: in case an + image has neither positive nor negative image level label of class c, all detections of + this class on this image will be ignored. + + Open Images Challenge Instance Segmentation metric allows to measure performance + of models in case of incomplete annotations: some instances are + annotations only on box level and some - on image-level. In addition, + image-level labels are taken into account as in detection metric. + + Open Images Challenge Detection metric default parameters: + evaluate_masks = False + group_of_weight = 1.0 + + Open Images Challenge Instance Segmentation metric default parameters: + evaluate_masks = True + (group_of_weight will not matter) + """ + + def __init__( + self, + categories, + evaluate_masks=False, + matching_iou_threshold=0.5, + evaluate_corlocs=False, + group_of_weight=1.0): + """Constructor. + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + evaluate_masks: set to true for instance segmentation metric and to false + for detection metric. + matching_iou_threshold: IOU threshold to use for matching groundtruth + boxes to detection boxes. + evaluate_corlocs: if True, additionally evaluates and returns CorLoc. + group_of_weight: Weight of group-of boxes. If set to 0, detections of the + correct class within a group-of box are ignored. If weight is > 0, then + if at least one detection falls within a group-of box with + matching_iou_threshold, weight group_of_weight is added to true + positives. Consequently, if no detection falls within a group-of box, + weight group_of_weight is added to false negatives. 
+ """ + if not evaluate_masks: + metrics_prefix = 'OpenImagesDetectionChallenge' + else: + metrics_prefix = 'OpenImagesInstanceSegmentationChallenge' + + super(OpenImagesChallengeEvaluator, self).__init__( + categories, + matching_iou_threshold, + evaluate_masks=evaluate_masks, + evaluate_corlocs=evaluate_corlocs, + group_of_weight=group_of_weight, + metric_prefix=metrics_prefix) + + self._evaluatable_labels = {} + + def add_single_ground_truth_image_info(self, image_id, gt_dict): + """Adds groundtruth for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + gt_dict: A dictionary containing - + InputDataFields.gt_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` groundtruth boxes of the format [ymin, xmin, ymax, xmax] + in absolute image coordinates. + InputDataFields.gt_classes: integer numpy array of shape [num_boxes] + containing 1-indexed groundtruth classes for the boxes. + InputDataFields.gt_image_classes: integer 1D + numpy array containing all classes for which labels are verified. + InputDataFields.gt_group_of: Optional length M + numpy boolean array denoting whether a groundtruth box contains a group of instances. + Raises: + ValueError: On adding groundtruth for an image more than once. + """ + super(OpenImagesChallengeEvaluator, + self).add_single_ground_truth_image_info(image_id, gt_dict) + input_fields = InputDataFields + gt_classes = gt_dict[input_fields.gt_classes] - self._label_id_offset + image_classes = np.array([], dtype=int) + if input_fields.gt_image_classes in gt_dict: + image_classes = gt_dict[input_fields.gt_image_classes] + elif input_fields.gt_labeled_classes in gt_dict: + image_classes = gt_dict[input_fields.gt_labeled_classes] + image_classes -= self._label_id_offset + self._evaluatable_labels[image_id] = np.unique( + np.concatenate((image_classes, gt_classes))) + + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + DetectionResultFields.detection_boxes: float32 numpy + array of shape [num_boxes, 4] containing `num_boxes` detection boxes + of the format [ymin, xmin, ymax, xmax] in absolute image coordinates. + DetectionResultFields.detection_scores: float32 numpy + array of shape [num_boxes] containing detection scores for the boxes. + DetectionResultFields.detection_classes: integer numpy + array of shape [num_boxes] containing 1-indexed detection classes for + the boxes. + Raises: + ValueError: If detection masks are not in detections dictionary. + """ + if image_id not in self._image_ids: + # Since for the correct work of evaluator it is assumed that groundtruth + # is inserted first we make sure to break the code if is it not the case. 
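+            # No groundtruth or verified image-level labels are recorded for this image,
+            # so self._evaluatable_labels stays empty and every detection on it is
+            # filtered out by the isin() check below.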
+ self._image_ids.update([image_id]) + self._evaluatable_labels[image_id] = np.array([]) + + detection_classes = detections_dict[DetectionResultFields.detection_classes] - self._label_id_offset + allowed_classes = np.where(np.isin(detection_classes, self._evaluatable_labels[image_id])) + detection_classes = detection_classes[allowed_classes] + detected_boxes = detections_dict[DetectionResultFields.detection_boxes][allowed_classes] + detected_scores = detections_dict[DetectionResultFields.detection_scores][allowed_classes] + + if self._evaluate_masks: + detection_masks = detections_dict[DetectionResultFields.detection_masks][allowed_classes] + else: + detection_masks = None + self._evaluation.add_single_detected_image_info( + image_key=image_id, + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detection_classes, + detected_masks=detection_masks) + + def clear(self): + """Clears stored data.""" + + super(OpenImagesChallengeEvaluator, self).clear() + self._evaluatable_labels.clear() + diff --git a/efficientdet/effdet/evaluation/fields.py b/efficientdet/effdet/evaluation/fields.py new file mode 100644 index 0000000000000000000000000000000000000000..d029b77dc5c4ec79aba1a6021760981fc23d3096 --- /dev/null +++ b/efficientdet/effdet/evaluation/fields.py @@ -0,0 +1,105 @@ + +class InputDataFields(object): + """Names for the input tensors. + Holds the standard data field names to use for identifying input tensors. This + should be used by the decoder to identify keys for the returned tensor_dict + containing input tensors. And it should be used by the model to identify the + tensors it needs. + Attributes: + image: image. + image_additional_channels: additional channels. + key: unique key corresponding to image. + filename: original filename of the dataset (without common path). + gt_image_classes: image-level class labels. + gt_image_confidences: image-level class confidences. + gt_labeled_classes: image-level annotation that indicates the + classes for which an image has been labeled. + gt_boxes: coordinates of the ground truth boxes in the image. + gt_classes: box-level class labels. + gt_confidences: box-level class confidences. The shape should be + the same as the shape of gt_classes. + gt_label_types: box-level label types (e.g. explicit negative). + gt_is_crowd: [DEPRECATED, use gt_group_of instead] + is the groundtruth a single object or a crowd. + gt_area: area of a groundtruth segment. + gt_difficult: is a `difficult` object + gt_group_of: is a `group_of` objects, e.g. multiple objects of the + same class, forming a connected group, where instances are heavily + occluding each other. + gt_instance_masks: ground truth instance masks. + gt_instance_boundaries: ground truth instance boundaries. + gt_instance_classes: instance mask-level class labels. + gt_label_weights: groundtruth label weights. + gt_weights: groundtruth weight factor for bounding boxes. 
+ image_height: height of images, used to decode + image_width: width of images, used to decode + """ + image = 'image' + key = 'image_id' + filename = 'filename' + gt_boxes = 'bbox' + gt_classes = 'cls' + gt_confidences = 'confidences' + gt_label_types = 'label_types' + gt_image_classes = 'img_cls' + gt_image_confidences = 'img_confidences' + gt_labeled_classes = 'labeled_cls' + gt_is_crowd = 'is_crowd' + gt_area = 'area' + gt_difficult = 'difficult' + gt_group_of = 'group_of' + gt_instance_masks = 'instance_masks' + gt_instance_boundaries = 'instance_boundaries' + gt_instance_classes = 'instance_classes' + image_height = 'img_height' + image_width = 'img_width' + image_size = 'img_size' + + +class DetectionResultFields(object): + """Naming conventions for storing the output of the detector. + Attributes: + source_id: source of the original image. + key: unique key corresponding to image. + detection_boxes: coordinates of the detection boxes in the image. + detection_scores: detection scores for the detection boxes in the image. + detection_multiclass_scores: class score distribution (including background) + for detection boxes in the image including background class. + detection_classes: detection-level class labels. + detection_masks: contains a segmentation mask for each detection box. + """ + + key = 'image_id' + detection_boxes = 'bbox' + detection_scores = 'score' + detection_classes = 'cls' + detection_masks = 'masks' + + +class BoxListFields(object): + """Naming conventions for BoxLists. + Attributes: + boxes: bounding box coordinates. + classes: classes per bounding box. + scores: scores per bounding box. + weights: sample weights per bounding box. + objectness: objectness score per bounding box. + masks: masks per bounding box. + boundaries: boundaries per bounding box. + keypoints: keypoints per bounding box. + keypoint_heatmaps: keypoint heatmaps per bounding box. + is_crowd: is_crowd annotation per bounding box. + """ + boxes = 'boxes' + classes = 'classes' + scores = 'scores' + weights = 'weights' + confidences = 'confidences' + objectness = 'objectness' + masks = 'masks' + boundaries = 'boundaries' + keypoints = 'keypoints' + keypoint_visibilities = 'keypoint_visibilities' + keypoint_heatmaps = 'keypoint_heatmaps' + is_crowd = 'is_crowd' + group_of = 'group_of' diff --git a/efficientdet/effdet/evaluation/metrics.py b/efficientdet/effdet/evaluation/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..01a73ef028e9e9da8d2fe5c51d5736ca958e938b --- /dev/null +++ b/efficientdet/effdet/evaluation/metrics.py @@ -0,0 +1,148 @@ +import numpy as np + + +def compute_precision_recall(scores, labels, num_gt): + """Compute precision and recall. + Args: + scores: A float numpy array representing detection score + labels: A float numpy array representing weighted true/false positive labels + num_gt: Number of ground truth instances + Raises: + ValueError: if the input is not of the correct format + Returns: + precision: Fraction of positive instances over detected ones. This value is + None if no ground truth labels are present. + recall: Fraction of detected positive instance over all positive instances. + This value is None if no ground truth labels are present. 
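+
+    Example (hypothetical values): scores=[0.9, 0.4], labels=[1., 0.], num_gt=2
+        yields precision=[1.0, 0.5] and recall=[0.5, 0.5].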
+ """ + if not isinstance(labels, np.ndarray) or len(labels.shape) != 1: + raise ValueError("labels must be single dimension numpy array") + + if labels.dtype != np.float and labels.dtype != np.bool: + raise ValueError("labels type must be either bool or float") + + if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: + raise ValueError("scores must be single dimension numpy array") + + if num_gt < np.sum(labels): + raise ValueError("Number of true positives must be smaller than num_gt.") + + if len(scores) != len(labels): + raise ValueError("scores and labels must be of the same size.") + + if num_gt == 0: + return None, None + + sorted_indices = np.argsort(scores) + sorted_indices = sorted_indices[::-1] + true_positive_labels = labels[sorted_indices] + false_positive_labels = (true_positive_labels <= 0).astype(float) + cum_true_positives = np.cumsum(true_positive_labels) + cum_false_positives = np.cumsum(false_positive_labels) + precision = cum_true_positives.astype(float) / (cum_true_positives + cum_false_positives) + recall = cum_true_positives.astype(float) / num_gt + return precision, recall + + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + Precision is modified to ensure that it does not decrease as recall + decrease. + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + Raises: + ValueError: if the input is not of the correct format + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. + """ + if precision is None: + if recall is not None: + raise ValueError("If precision is None, recall must also be None") + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance(recall, np.ndarray): + raise ValueError("precision and recall must be numpy array") + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError("input must be float numpy array.") + if len(precision) != len(recall): + raise ValueError("precision and recall must be of the same size.") + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError("Precision must be in the range of [0, 1].") + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError("recall must be in the range of [0, 1].") + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError("recall must be a non-decreasing array") + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum((recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + Returns nans if there are no ground truth images for a class. 
+ Args: + num_gt_imgs_per_class: 1D array, representing number of images containing + at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number of + images that are correctly detected at least one object instance of a particular class + Returns: + corloc_per_class: A float numpy array represents the corloc score of each class + """ + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) + + +def compute_median_rank_at_k(tp_fp_list, k): + """Computes MedianRank@k, where k is the top-scoring labels. + Args: + tp_fp_list: a list of numpy arrays; each numpy array corresponds to the all + detection on a single image, where the detections are sorted by score in + descending order. Further, each numpy array element can have boolean or + float values. True positive elements have either value >0.0 or True; + any other value is considered false positive. + k: number of top-scoring proposals to take. + Returns: + median_rank: median rank of all true positive proposals among top k by score. + """ + ranks = [] + for i in range(len(tp_fp_list)): + ranks.append(np.where(tp_fp_list[i][0:min(k, tp_fp_list[i].shape[0])] > 0)[0]) + concatenated_ranks = np.concatenate(ranks) + return np.median(concatenated_ranks) + + +def compute_recall_at_k(tp_fp_list, num_gt, k): + """Computes Recall@k, MedianRank@k, where k is the top-scoring labels. + Args: + tp_fp_list: a list of numpy arrays; each numpy array corresponds to the all + detection on a single image, where the detections are sorted by score in + descending order. Further, each numpy array element can have boolean or + float values. True positive elements have either value >0.0 or True; + any other value is considered false positive. + num_gt: number of groundtruth anotations. + k: number of top-scoring proposals to take. + Returns: + recall: recall evaluated on the top k by score detections. + """ + + tp_fp_eval = [] + for i in range(len(tp_fp_list)): + tp_fp_eval.append(tp_fp_list[i][0:min(k, tp_fp_list[i].shape[0])]) + + tp_fp_eval = np.concatenate(tp_fp_eval) + + return np.sum(tp_fp_eval) / num_gt diff --git a/efficientdet/effdet/evaluation/np_box_list.py b/efficientdet/effdet/evaluation/np_box_list.py new file mode 100644 index 0000000000000000000000000000000000000000..60e3b44c32a72f088ca3b736d24a62aac252d795 --- /dev/null +++ b/efficientdet/effdet/evaluation/np_box_list.py @@ -0,0 +1,696 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Bounding Box List operations for Numpy BoxLists. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" +import numpy as np + + +class BoxList(object): + """Box collection. 
+ BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a + given list correspond to a single image. + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + Args: + data: a numpy array of shape [N, 4] representing box coordinates + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError('Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data.keys() if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + Args: + field: a string parameter used to speficy a related field to be accessed. + field_data: a numpy array of [N, ...] representing the data associated + with the field. + Raises: + ValueError: if the field is already exist or the dimension of the field + data does not matches the number of boxes. + """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box collection. + Args: + field: a string parameter used to speficy a related field to be accessed. + Returns: + a numpy 1-d array representing data of an associated field + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError('field {} does not exist'.format(field)) + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + def _is_valid_boxes(self, data): + """Check whether data fullfills the format of N*[ymin, xmin, ymax, xmin]. + Args: + data: a numpy array of shape [N, 4] representing box coordinates + Returns: + a boolean indicating whether all ymax of boxes are equal or greater than + ymin, and all xmax of boxes are equal or greater than xmin. + """ + if data.shape[0] > 0: + for i in range(data.shape[0]): + if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: + return False + return True + + +def area(boxes): + """Computes area of boxes. 
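A minimal sketch of constructing a BoxList and attaching an extra field; the coordinates are invented and the snippet is not part of the upstream file.

import numpy as np

# Rows are [y_min, x_min, y_max, x_max]; float32 or float64 is required.
boxes = np.array([[0.0, 0.0, 1.0, 1.0],
                  [0.5, 0.5, 2.0, 2.0]], dtype=np.float32)
boxlist = BoxList(boxes)
boxlist.add_field('scores', np.array([0.9, 0.4]))
print(boxlist.num_boxes(), boxlist.get_extra_fields())   # 2 ['scores']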
+ + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum(np.zeros(all_pairs_max_ymin.shape), all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum(np.zeros(all_pairs_max_xmin.shape), all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - intersect + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas + + +class SortOrder(object): + """Enum class for sort order. + + Attributes: + ascend: ascend order. + descend: descend order. + """ + ASCEND = 1 + DESCEND = 2 + + +def area_boxlist(boxlist): + """Computes area of boxes. + + Args: + boxlist: BoxList holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + y_min, x_min, y_max, x_max = boxlist.get_coordinates() + return (y_max - y_min) * (x_max - x_min) + + +def intersection_boxlist(boxlist1, boxlist2): + """Compute pairwise intersection areas between boxes. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + return intersection(boxlist1.get(), boxlist2.get()) + + +def iou_boxlist(boxlist1, boxlist2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + return iou(boxlist1.get(), boxlist2.get()) + + +def ioa_boxlist(boxlist1, boxlist2): + """Computes pairwise intersection-over-area between box collections. 
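The free functions above operate directly on [N, 4] arrays rather than BoxList objects; a quick sketch with invented boxes, not part of the upstream file.

import numpy as np

query = np.array([[0.0, 0.0, 2.0, 2.0]], dtype=np.float32)
others = np.array([[1.0, 1.0, 3.0, 3.0],
                   [0.0, 0.0, 2.0, 2.0]], dtype=np.float32)
print(iou(query, others))   # [[~0.14, 1.0]]: intersection 1 over union 7, then an identical box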
+ + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + return ioa(boxlist1.get(), boxlist2.get()) + + +def gather_boxlist(boxlist, indices, fields=None): + """Gather boxes from BoxList according to indices and return new BoxList. + + By default, gather returns boxes corresponding to the input index list, as + well as all additional fields stored in the boxlist (indexing into the + first dimension). However one can optionally only gather from a + subset of fields. + + Args: + boxlist: BoxList holding N boxes + indices: a 1-d numpy array of type int_ + fields: (optional) list of fields to also gather from. If None (default), + all fields are gathered from. Pass an empty fields list to only gather the box coordinates. + + Returns: + subboxlist: a BoxList corresponding to the subset of the input BoxList specified by indices + + Raises: + ValueError: if specified field is not contained in boxlist or if the indices are not of type int_ + """ + if indices.size: + if np.amax(indices) >= boxlist.num_boxes() or np.amin(indices) < 0: + raise ValueError('indices are out of valid range.') + subboxlist = BoxList(boxlist.get()[indices, :]) + if fields is None: + fields = boxlist.get_extra_fields() + for field in fields: + extra_field_data = boxlist.get_field(field) + subboxlist.add_field(field, extra_field_data[indices, ...]) + return subboxlist + + +def sort_by_field_boxlist(boxlist, field, order=SortOrder.DESCEND): + """Sort boxes and associated fields according to a scalar field. + + A common use case is reordering the boxes according to descending scores. + + Args: + boxlist: BoxList holding N boxes. + field: A BoxList field for sorting and reordering the BoxList. + order: (Optional) 'descend' or 'ascend'. Default is descend. + + Returns: + sorted_boxlist: A sorted BoxList with the field in the specified order. + + Raises: + ValueError: if specified field does not exist or is not of single dimension. + ValueError: if the order is not either descend or ascend. + """ + if not boxlist.has_field(field): + raise ValueError('Field ' + field + ' does not exist') + if len(boxlist.get_field(field).shape) != 1: + raise ValueError('Field ' + field + 'should be single dimension.') + if order != SortOrder.DESCEND and order != SortOrder.ASCEND: + raise ValueError('Invalid sort order') + + field_to_sort = boxlist.get_field(field) + sorted_indices = np.argsort(field_to_sort) + if order == SortOrder.DESCEND: + sorted_indices = sorted_indices[::-1] + return gather_boxlist(boxlist, sorted_indices) + + +def non_max_suppression(boxlist, max_output_size=10000, iou_threshold=1.0, score_threshold=-10.0): + """Non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning + away boxes that have high IOU (intersection over union) overlap (> thresh) + with already selected boxes. In each iteration, the detected bounding box with + highest score in the available pool is selected. + + Args: + boxlist: BoxList holding N boxes. Must contain a 'scores' field + representing detection scores. All scores belong to the same class. + max_output_size: maximum number of retained boxes + iou_threshold: intersection over union threshold. + score_threshold: minimum score threshold. 
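gather_boxlist and sort_by_field_boxlist are typically combined to take the top-scoring boxes; a sketch with invented data, not part of the upstream file.

import numpy as np

boxlist = BoxList(np.array([[0., 0., 1., 1.],
                            [0., 0., 2., 2.]], dtype=np.float32))
boxlist.add_field('scores', np.array([0.2, 0.9]))
top1 = gather_boxlist(sort_by_field_boxlist(boxlist, 'scores'), np.array([0]))
print(top1.get_field('scores'))   # [0.9]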
Remove the boxes with scores less than + this value. Default value is set to -10. A very low threshold to pass pretty + much all the boxes, unless the user sets a different score threshold. + + Returns: + a BoxList holding M boxes where M <= max_output_size + Raises: + ValueError: if 'scores' field does not exist + ValueError: if threshold is not in [0, 1] + ValueError: if max_output_size < 0 + """ + if not boxlist.has_field('scores'): + raise ValueError('Field scores does not exist') + if iou_threshold < 0. or iou_threshold > 1.0: + raise ValueError('IOU threshold must be in [0, 1]') + if max_output_size < 0: + raise ValueError('max_output_size must be bigger than 0.') + + boxlist = filter_scores_greater_than(boxlist, score_threshold) + if boxlist.num_boxes() == 0: + return boxlist + + boxlist = sort_by_field_boxlist(boxlist, 'scores') + + # Prevent further computation if NMS is disabled. + if iou_threshold == 1.0: + if boxlist.num_boxes() > max_output_size: + selected_indices = np.arange(max_output_size) + return gather_boxlist(boxlist, selected_indices) + else: + return boxlist + + boxes = boxlist.get() + num_boxes = boxlist.num_boxes() + # is_index_valid is True only for all remaining valid boxes, + is_index_valid = np.full(num_boxes, 1, dtype=bool) + selected_indices = [] + num_output = 0 + for i in range(num_boxes): + if num_output < max_output_size: + if is_index_valid[i]: + num_output += 1 + selected_indices.append(i) + is_index_valid[i] = False + valid_indices = np.where(is_index_valid)[0] + if valid_indices.size == 0: + break + + intersect_over_union = iou(np.expand_dims(boxes[i, :], axis=0), boxes[valid_indices, :]) + intersect_over_union = np.squeeze(intersect_over_union, axis=0) + is_index_valid[valid_indices] = np.logical_and( + is_index_valid[valid_indices], + intersect_over_union <= iou_threshold) + return gather_boxlist(boxlist, np.array(selected_indices)) + + +def multi_class_non_max_suppression(boxlist, score_thresh, iou_thresh, max_output_size): + """Multi-class version of non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning + away boxes that have high IOU (intersection over union) overlap (> thresh) + with already selected boxes. It operates independently for each class for + which scores are provided (via the scores field of the input box_list), + pruning boxes with score less than a provided threshold prior to + applying NMS. + + Args: + boxlist: BoxList holding N boxes. Must contain a 'scores' field + representing detection scores. This scores field is a tensor that can + be 1 dimensional (in the case of a single class) or 2-dimensional, which + which case we assume that it takes the shape [num_boxes, num_classes]. + We further assume that this rank is known statically and that + scores.shape[1] is also known (i.e., the number of classes is fixed + and known at graph construction time). + score_thresh: scalar threshold for score (low scoring boxes are removed). + iou_thresh: scalar threshold for IOU (boxes that that high IOU overlap + with previously selected boxes are removed). + max_output_size: maximum number of retained boxes per class. + + Returns: + a BoxList holding M boxes with a rank-1 scores field representing + corresponding scores for each box with scores sorted in decreasing order + and a rank-1 classes field representing a class label for each box. + Raises: + ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have + a valid scores field. 
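A sketch of class-agnostic NMS with the function above; the two heavily overlapping boxes collapse to the higher-scoring one. The data is invented and the snippet is not part of the upstream file.

import numpy as np

boxlist = BoxList(np.array([[0.0, 0.0, 2.0, 2.0],
                            [0.1, 0.1, 2.0, 2.0],
                            [5.0, 5.0, 6.0, 6.0]], dtype=np.float32))
boxlist.add_field('scores', np.array([0.9, 0.8, 0.7]))
kept = non_max_suppression(boxlist, max_output_size=10, iou_threshold=0.5, score_threshold=0.0)
print(kept.num_boxes())   # expected 2: one of the overlapping pair plus the distant box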
+ """ + if not 0 <= iou_thresh <= 1.0: + raise ValueError('thresh must be between 0 and 1') + if not isinstance(boxlist, BoxList): + raise ValueError('boxlist must be a BoxList') + if not boxlist.has_field('scores'): + raise ValueError('input boxlist must have \'scores\' field') + scores = boxlist.get_field('scores') + if len(scores.shape) == 1: + scores = np.reshape(scores, [-1, 1]) + elif len(scores.shape) == 2: + if scores.shape[1] is None: + raise ValueError('scores field must have statically defined second dimension') + else: + raise ValueError('scores field must be of rank 1 or 2') + num_boxes = boxlist.num_boxes() + num_scores = scores.shape[0] + num_classes = scores.shape[1] + + if num_boxes != num_scores: + raise ValueError('Incorrect scores field length: actual vs expected.') + + selected_boxes_list = [] + for class_idx in range(num_classes): + boxlist_and_class_scores = BoxList(boxlist.get()) + class_scores = np.reshape(scores[0:num_scores, class_idx], [-1]) + boxlist_and_class_scores.add_field('scores', class_scores) + boxlist_filt = filter_scores_greater_than(boxlist_and_class_scores, score_thresh) + nms_result = non_max_suppression( + boxlist_filt, max_output_size=max_output_size, iou_threshold=iou_thresh, score_threshold=score_thresh) + nms_result.add_field('classes', np.zeros_like(nms_result.get_field('scores')) + class_idx) + selected_boxes_list.append(nms_result) + selected_boxes = concatenate_boxlist(selected_boxes_list) + sorted_boxes = sort_by_field_boxlist(selected_boxes, 'scores') + return sorted_boxes + + +def scale(boxlist, y_scale, x_scale): + """Scale box coordinates in x and y dimensions. + + Args: + boxlist: BoxList holding N boxes + y_scale: float + x_scale: float + + Returns: + boxlist: BoxList holding N boxes + """ + y_min, x_min, y_max, x_max = np.array_split(boxlist.get(), 4, axis=1) + y_min = y_scale * y_min + y_max = y_scale * y_max + x_min = x_scale * x_min + x_max = x_scale * x_max + scaled_boxlist = BoxList(np.hstack([y_min, x_min, y_max, x_max])) + + fields = boxlist.get_extra_fields() + for field in fields: + extra_field_data = boxlist.get_field(field) + scaled_boxlist.add_field(field, extra_field_data) + + return scaled_boxlist + + +def clip_to_window(boxlist, window, filter_nonoverlapping=True): + """Clip bounding boxes to a window. + + This op clips input bounding boxes (represented by bounding box + corners) to a window, optionally filtering out boxes that do not + overlap at all with the window. + + Args: + boxlist: BoxList holding M_in boxes + window: a numpy array of shape [4] representing the [y_min, x_min, y_max, x_max] + window to which the op should clip boxes. + filter_nonoverlapping: whether to filter out boxes that do not overlap at all with the window. 
+ + Returns: + a BoxList holding M_out boxes where M_out <= M_in + """ + y_min, x_min, y_max, x_max = np.array_split(boxlist.get(), 4, axis=1) + win_y_min = window[0] + win_x_min = window[1] + win_y_max = window[2] + win_x_max = window[3] + y_min_clipped = np.fmax(np.fmin(y_min, win_y_max), win_y_min) + y_max_clipped = np.fmax(np.fmin(y_max, win_y_max), win_y_min) + x_min_clipped = np.fmax(np.fmin(x_min, win_x_max), win_x_min) + x_max_clipped = np.fmax(np.fmin(x_max, win_x_max), win_x_min) + clipped = BoxList(np.hstack([y_min_clipped, x_min_clipped, y_max_clipped, x_max_clipped])) + clipped = _copy_extra_fields(clipped, boxlist) + if filter_nonoverlapping: + areas = area(clipped) + nonzero_area_indices = np.reshape(np.nonzero(np.greater(areas, 0.0)), [-1]).astype(np.int32) + clipped = gather_boxlist(clipped, nonzero_area_indices) + return clipped + + +def prune_non_overlapping_boxes(boxlist1, boxlist2, minoverlap=0.0): + """Prunes the boxes in boxlist1 that overlap less than thresh with boxlist2. + + For each box in boxlist1, we want its IOA to be more than minoverlap with + at least one of the boxes in boxlist2. If it does not, we remove it. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + minoverlap: Minimum required overlap between boxes, to count them as overlapping. + + Returns: + A pruned boxlist with size [N', 4]. + """ + intersection_over_area = ioa(boxlist2, boxlist1) # [M, N] tensor + intersection_over_area = np.amax(intersection_over_area, axis=0) # [N] tensor + keep_bool = np.greater_equal(intersection_over_area, np.array(minoverlap)) + keep_inds = np.nonzero(keep_bool)[0] + new_boxlist1 = gather_boxlist(boxlist1, keep_inds) + return new_boxlist1 + + +def prune_outside_window(boxlist, window): + """Prunes bounding boxes that fall outside a given window. + + This function prunes bounding boxes that even partially fall outside the given + window. See also ClipToWindow which only prunes bounding boxes that fall + completely outside the window, and clips any bounding boxes that partially + overflow. + + Args: + boxlist: a BoxList holding M_in boxes. + window: a numpy array of size 4, representing [ymin, xmin, ymax, xmax] of the window. + + Returns: + pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in. + valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes in the input tensor. + """ + + y_min, x_min, y_max, x_max = np.array_split(boxlist.get(), 4, axis=1) + win_y_min = window[0] + win_x_min = window[1] + win_y_max = window[2] + win_x_max = window[3] + coordinate_violations = np.hstack([ + np.less(y_min, win_y_min), np.less(x_min, win_x_min), + np.greater(y_max, win_y_max), np.greater(x_max, win_x_max)]) + valid_indices = np.reshape(np.where(np.logical_not(np.max(coordinate_violations, axis=1))), [-1]) + return gather_boxlist(boxlist, valid_indices), valid_indices + + +def concatenate_boxlist(boxlists, fields=None): + """Concatenate list of BoxLists. + + This op concatenates a list of input BoxLists into a larger BoxList. It also + handles concatenation of BoxList fields as long as the field tensor shapes + are equal except for the first dimension. + + Args: + boxlists: list of BoxList objects + fields: optional list of fields to also concatenate. By default, all + fields from the first BoxList in the list are included in the concatenation. 
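A sketch of clip_to_window with invented coordinates, not part of the upstream file. Note that, as transcribed, the filter_nonoverlapping branch passes a BoxList to the array-level area(), where area_boxlist() appears to be intended, so the sketch disables that branch.

import numpy as np

boxlist = BoxList(np.array([[-1.0, -1.0, 0.5, 0.5]], dtype=np.float32))
window = np.array([0.0, 0.0, 2.0, 2.0])
clipped = clip_to_window(boxlist, window, filter_nonoverlapping=False)
print(clipped.get())   # [[0. 0. 0.5 0.5]]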
+ + Returns: + a BoxList with number of boxes equal to + sum([boxlist.num_boxes() for boxlist in BoxList]) + Raises: + ValueError: if boxlists is invalid (i.e., is not a list, is empty, or + contains non BoxList objects), or if requested fields are not contained in all boxlists + """ + if not isinstance(boxlists, list): + raise ValueError('boxlists should be a list') + if not boxlists: + raise ValueError('boxlists should have nonzero length') + for boxlist in boxlists: + if not isinstance(boxlist, BoxList): + raise ValueError('all elements of boxlists should be BoxList objects') + concatenated = BoxList(np.vstack([boxlist.get() for boxlist in boxlists])) + if fields is None: + fields = boxlists[0].get_extra_fields() + for field in fields: + first_field_shape = boxlists[0].get_field(field).shape + first_field_shape = first_field_shape[1:] + for boxlist in boxlists: + if not boxlist.has_field(field): + raise ValueError('boxlist must contain all requested fields') + field_shape = boxlist.get_field(field).shape + field_shape = field_shape[1:] + if field_shape != first_field_shape: + raise ValueError('field %s must have same shape for all boxlists ' + 'except for the 0th dimension.' % field) + concatenated_field = np.concatenate([boxlist.get_field(field) for boxlist in boxlists], axis=0) + concatenated.add_field(field, concatenated_field) + return concatenated + + +def filter_scores_greater_than(boxlist, thresh): + """Filter to keep only boxes with score exceeding a given threshold. + + This op keeps the collection of boxes whose corresponding scores are + greater than the input threshold. + + Args: + boxlist: BoxList holding N boxes. Must contain a 'scores' field representing detection scores. + thresh: scalar threshold + + Returns: + a BoxList holding M boxes where M <= N + + Raises: + ValueError: if boxlist not a BoxList object or if it does not have a scores field + """ + if not isinstance(boxlist, BoxList): + raise ValueError('boxlist must be a BoxList') + if not boxlist.has_field('scores'): + raise ValueError('input boxlist must have \'scores\' field') + scores = boxlist.get_field('scores') + if len(scores.shape) > 2: + raise ValueError('Scores should have rank 1 or 2') + if len(scores.shape) == 2 and scores.shape[1] != 1: + raise ValueError('Scores should have rank 1 or have shape ' + 'consistent with [None, 1]') + high_score_indices = np.reshape(np.where(np.greater(scores, thresh)), [-1]).astype(np.int32) + return gather_boxlist(boxlist, high_score_indices) + + +def change_coordinate_frame(boxlist, window): + """Change coordinate frame of the boxlist to be relative to window's frame. + + Given a window of the form [ymin, xmin, ymax, xmax], + changes bounding box coordinates from boxlist to be relative to this window + (e.g., the min corner maps to (0,0) and the max corner maps to (1,1)). + + An example use case is data augmentation: where we are given groundtruth + boxes (boxlist) and would like to randomly crop the image to some + window (window). In this case we need to change the coordinate frame of + each groundtruth box to be relative to this new window. + + Args: + boxlist: A BoxList object holding N boxes. + window: a size 4 1-D numpy array. + + Returns: + Returns a BoxList object with N boxes. 
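concatenate_boxlist and filter_scores_greater_than are often used together when merging per-class or per-image results; a sketch with invented data, not part of the upstream file.

import numpy as np

a = BoxList(np.array([[0., 0., 1., 1.]], dtype=np.float32))
a.add_field('scores', np.array([0.3]))
b = BoxList(np.array([[1., 1., 2., 2.]], dtype=np.float32))
b.add_field('scores', np.array([0.8]))
merged = concatenate_boxlist([a, b])
kept = filter_scores_greater_than(merged, thresh=0.5)
print(kept.get_field('scores'))   # [0.8]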
+ """ + win_height = window[2] - window[0] + win_width = window[3] - window[1] + boxlist_new = scale( + BoxList(boxlist.get() - [window[0], window[1], window[0], window[1]]), 1.0 / win_height, 1.0 / win_width) + _copy_extra_fields(boxlist_new, boxlist) + + return boxlist_new + + +def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from): + """Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to. + + Args: + boxlist_to_copy_to: BoxList to which extra fields are copied. + boxlist_to_copy_from: BoxList from which fields are copied. + + Returns: + boxlist_to_copy_to with extra fields. + """ + for field in boxlist_to_copy_from.get_extra_fields(): + boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field)) + return boxlist_to_copy_to + + +def _update_valid_indices_by_removing_high_iou_boxes( + selected_indices, is_index_valid, intersect_over_union, threshold): + max_iou = np.max(intersect_over_union[:, selected_indices], axis=1) + return np.logical_and(is_index_valid, max_iou <= threshold) diff --git a/efficientdet/effdet/evaluation/np_mask_list.py b/efficientdet/effdet/evaluation/np_mask_list.py new file mode 100644 index 0000000000000000000000000000000000000000..22cdb8770ffb6ce1e4f0233ca814273ca29bbf8f --- /dev/null +++ b/efficientdet/effdet/evaluation/np_mask_list.py @@ -0,0 +1,478 @@ +import numpy as np +from .np_box_list import * + +EPSILON = 1e-7 + + +class MaskList(BoxList): + """Convenience wrapper for BoxList with masks. + + BoxMaskList extends the np_box_list.BoxList to contain masks as well. + In particular, its constructor receives both boxes and masks. Note that the + masks correspond to the full image. + """ + + def __init__(self, box_data, mask_data): + """Constructs box collection. + + Args: + box_data: a numpy array of shape [N, 4] representing box coordinates + mask_data: a numpy array of shape [N, height, width] representing masks + with values are in {0,1}. The masks correspond to the full + image. The height and the width will be equal to image height and width. + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + ValueError: if mask data is not a numpy array + ValueError: if invalid dimension for mask data + """ + super(MaskList, self).__init__(box_data) + if not isinstance(mask_data, np.ndarray): + raise ValueError('Mask data must be a numpy array.') + if len(mask_data.shape) != 3: + raise ValueError('Invalid dimensions for mask data.') + if mask_data.dtype != np.uint8: + raise ValueError('Invalid data type for mask data: uint8 is required.') + if mask_data.shape[0] != box_data.shape[0]: + raise ValueError('There should be the same number of boxes and masks.') + self.data['masks'] = mask_data + + def get_masks(self): + """Convenience function for accessing masks. + + Returns: + a numpy array of shape [N, height, width] representing masks + """ + return self.get_field('masks') + + +def boxlist_to_masklist(boxlist): + """Converts a BoxList containing 'masks' into a BoxMaskList. + + Args: + boxlist: An np_box_list.BoxList object. + + Returns: + An BoxMaskList object. + + Raises: + ValueError: If boxlist does not contain `masks` as a field. 
+ """ + if not boxlist.has_field('masks'): + raise ValueError('boxlist does not contain mask field.') + masklist = MaskList(box_data=boxlist.get(), mask_data=boxlist.get_field('masks')) + extra_fields = boxlist.get_extra_fields() + for key in extra_fields: + if key != 'masks': + masklist.data[key] = boxlist.get_field(key) + return masklist + + +def area_mask(masks): + """Computes area of masks. + + Args: + masks: Numpy array with shape [N, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + + Returns: + a numpy array with shape [N*1] representing mask areas. + + Raises: + ValueError: If masks.dtype is not np.uint8 + """ + if masks.dtype != np.uint8: + raise ValueError('Masks type should be np.uint8') + return np.sum(masks, axis=(1, 2), dtype=np.float32) + + +def intersection_mask(masks1, masks2): + """Compute pairwise intersection areas between masks. + + Args: + masks1: a numpy array with shape [N, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + masks2: a numpy array with shape [M, height, width] holding M masks. Masks + values are of type np.uint8 and values are in {0,1}. + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area. + + Raises: + ValueError: If masks1 and masks2 are not of type np.uint8. + """ + if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: + raise ValueError('masks1 and masks2 should be of type np.uint8') + n = masks1.shape[0] + m = masks2.shape[0] + answer = np.zeros([n, m], dtype=np.float32) + for i in np.arange(n): + for j in np.arange(m): + answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) + return answer + + +def iou_mask(masks1, masks2): + """Computes pairwise intersection-over-union between mask collections. + + Args: + masks1: a numpy array with shape [N, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + masks2: a numpy array with shape [M, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + + Raises: + ValueError: If masks1 and masks2 are not of type np.uint8. + """ + if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: + raise ValueError('masks1 and masks2 should be of type np.uint8') + intersect = intersection(masks1, masks2) + area1 = area(masks1) + area2 = area(masks2) + union = np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - intersect + return intersect / np.maximum(union, EPSILON) + + +def ioa_mask(masks1, masks2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as + their intersection area over mask2's area. Note that ioa is not symmetric, + that is, IOA(mask1, mask2) != IOA(mask2, mask1). + + Args: + masks1: a numpy array with shape [N, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + masks2: a numpy array with shape [M, height, width] holding N masks. Masks + values are of type np.uint8 and values are in {0,1}. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + + Raises: + ValueError: If masks1 and masks2 are not of type np.uint8. 
+ """ + if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: + raise ValueError('masks1 and masks2 should be of type np.uint8') + intersect = intersection(masks1, masks2) + areas = np.expand_dims(area(masks2), axis=0) + return intersect / (areas + EPSILON) + + +def area_masklist(masklist): + """Computes area of masks. + + Args: + masklist: BoxMaskList holding N boxes and masks + + Returns: + a numpy array with shape [N*1] representing mask areas + """ + return area_mask(masklist.get_masks()) + + +def intersection_masklist(masklist1, masklist2): + """Compute pairwise intersection areas between masks. + + Args: + masklist1: BoxMaskList holding N boxes and masks + masklist2: BoxMaskList holding M boxes and masks + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + return intersection_mask(masklist1.get_masks(), masklist2.get_masks()) + + +def iou_masklist(masklist1, masklist2): + """Computes pairwise intersection-over-union between box and mask collections. + + Args: + masklist1: BoxMaskList holding N boxes and masks + masklist2: BoxMaskList holding M boxes and masks + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + return iou_mask(masklist1.get_masks(), masklist2.get_masks()) + + +def ioa_masklist(masklist1, masklist2): + """Computes pairwise intersection-over-area between box and mask collections. + + Intersection-over-area (ioa) between two masks mask1 and mask2 is defined as + their intersection area over mask2's area. Note that ioa is not symmetric, + that is, IOA(mask1, mask2) != IOA(mask2, mask1). + + Args: + masklist1: BoxMaskList holding N boxes and masks + masklist2: BoxMaskList holding M boxes and masks + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + return ioa_mask(masklist1.get_masks(), masklist2.get_masks()) + + +def gather_masklist(masklist, indices, fields=None): + """Gather boxes from BoxMaskList according to indices. + + By default, gather returns boxes corresponding to the input index list, as + well as all additional fields stored in the masklist (indexing into the + first dimension). However one can optionally only gather from a + subset of fields. + + Args: + masklist: BoxMaskList holding N boxes + indices: a 1-d numpy array of type int_ + fields: (optional) list of fields to also gather from. If None (default), all fields + are gathered from. Pass an empty fields list to only gather the box coordinates. + + Returns: + submasklist: a BoxMaskList corresponding to the subset of the input masklist specified by indices + + Raises: + ValueError: if specified field is not contained in masklist or if the indices are not of type int_ + """ + if fields is not None: + if 'masks' not in fields: + fields.append('masks') + return boxlist_to_masklist(gather_boxlist(boxlist=masklist, indices=indices, fields=fields)) + + +def sort_by_field_masklist(masklist, field, order=SortOrder.DESCEND): + """Sort boxes and associated fields according to a scalar field. + + A common use case is reordering the boxes according to descending scores. + + Args: + masklist: BoxMaskList holding N boxes. + field: A BoxMaskList field for sorting and reordering the BoxMaskList. + order: (Optional) 'descend' or 'ascend'. Default is descend. + + Returns: + sorted_masklist: A sorted BoxMaskList with the field in the specified order. 
+ """ + return boxlist_to_masklist(sort_by_field_boxlist(boxlist=masklist, field=field, order=order)) + + +def non_max_suppression_mask(masklist, max_output_size=10000, iou_threshold=1.0, score_threshold=-10.0): + """Non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning + away boxes that have high IOU (intersection over union) overlap (> thresh) + with already selected boxes. In each iteration, the detected bounding box with + highest score in the available pool is selected. + + Args: + masklist: BoxMaskList holding N boxes. Must contain a 'scores' field representing + detection scores. All scores belong to the same class. + max_output_size: maximum number of retained boxes + iou_threshold: intersection over union threshold. + score_threshold: minimum score threshold. Remove the boxes with scores + less than this value. Default value is set to -10. A very + low threshold to pass pretty much all the boxes, unless + the user sets a different score threshold. + + Returns: + an BoxMaskList holding M boxes where M <= max_output_size + + Raises: + ValueError: if 'scores' field does not exist + ValueError: if threshold is not in [0, 1] + ValueError: if max_output_size < 0 + """ + if not masklist.has_field('scores'): + raise ValueError('Field scores does not exist') + if iou_threshold < 0. or iou_threshold > 1.0: + raise ValueError('IOU threshold must be in [0, 1]') + if max_output_size < 0: + raise ValueError('max_output_size must be bigger than 0.') + + masklist = filter_scores_greater_than(masklist, score_threshold) + if masklist.num_boxes() == 0: + return masklist + + masklist = sort_by_field_boxlist(masklist, 'scores') + + # Prevent further computation if NMS is disabled. + if iou_threshold == 1.0: + if masklist.num_boxes() > max_output_size: + selected_indices = np.arange(max_output_size) + return gather_masklist(masklist, selected_indices) + else: + return masklist + + masks = masklist.get_masks() + num_masks = masklist.num_boxes() + + # is_index_valid is True only for all remaining valid boxes, + is_index_valid = np.full(num_masks, 1, dtype=bool) + selected_indices = [] + num_output = 0 + for i in range(num_masks): + if num_output < max_output_size: + if is_index_valid[i]: + num_output += 1 + selected_indices.append(i) + is_index_valid[i] = False + valid_indices = np.where(is_index_valid)[0] + if valid_indices.size == 0: + break + + intersect_over_union = iou_mask(np.expand_dims(masks[i], axis=0), masks[valid_indices]) + intersect_over_union = np.squeeze(intersect_over_union, axis=0) + is_index_valid[valid_indices] = np.logical_and( + is_index_valid[valid_indices], + intersect_over_union <= iou_threshold) + return gather_masklist(masklist, np.array(selected_indices)) + + +def multi_class_non_max_suppression_mask(masklist, score_thresh, iou_thresh, max_output_size): + """Multi-class version of non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning away boxes that have + high IOU (intersection over union) overlap (> thresh) with already selected boxes. It + operates independently for each class for which scores are provided (via the scores field + of the input box_list), pruning boxes with score less than a provided threshold prior to + applying NMS. + + Args: + masklist: BoxMaskList holding N boxes. Must contain a 'scores' field representing detection + scores. 
This scores field is a tensor that can be 1 dimensional (in the case of a + single class) or 2-dimensional, in which case we assume that it takes the shape + [num_boxes, num_classes]. We further assume that this rank is known statically and + that scores.shape[1] is also known (i.e., the number of classes is fixed and known + at graph construction time). + score_thresh: scalar threshold for score (low scoring boxes are removed). + iou_thresh: scalar threshold for IOU (boxes that that high IOU overlap with previously + selected boxes are removed). + max_output_size: maximum number of retained boxes per class. + + Returns: + a masklist holding M boxes with a rank-1 scores field representing + corresponding scores for each box with scores sorted in decreasing order + and a rank-1 classes field representing a class label for each box. + Raises: + ValueError: if iou_thresh is not in [0, 1] or if input masklist does not have a valid scores field. + """ + if not 0 <= iou_thresh <= 1.0: + raise ValueError('thresh must be between 0 and 1') + if not isinstance(masklist, MaskList): + raise ValueError('masklist must be a masklist') + if not masklist.has_field('scores'): + raise ValueError('input masklist must have \'scores\' field') + scores = masklist.get_field('scores') + if len(scores.shape) == 1: + scores = np.reshape(scores, [-1, 1]) + elif len(scores.shape) == 2: + if scores.shape[1] is None: + raise ValueError('scores field must have statically defined second dimension') + else: + raise ValueError('scores field must be of rank 1 or 2') + + num_boxes = masklist.num_boxes() + num_scores = scores.shape[0] + num_classes = scores.shape[1] + + if num_boxes != num_scores: + raise ValueError('Incorrect scores field length: actual vs expected.') + + selected_boxes_list = [] + for class_idx in range(num_classes): + masklist_and_class_scores = MaskList(box_data=masklist.get(), mask_data=masklist.get_masks()) + class_scores = np.reshape(scores[0:num_scores, class_idx], [-1]) + masklist_and_class_scores.add_field('scores', class_scores) + masklist_filt = filter_scores_greater_than(masklist_and_class_scores, score_thresh) + nms_result = non_max_suppression( + masklist_filt, + max_output_size=max_output_size, + iou_threshold=iou_thresh, + score_threshold=score_thresh) + nms_result.add_field('classes', np.zeros_like(nms_result.get_field('scores')) + class_idx) + selected_boxes_list.append(nms_result) + selected_boxes = concatenate_boxlist(selected_boxes_list) + sorted_boxes = sort_by_field_boxlist(selected_boxes, 'scores') + return boxlist_to_masklist(boxlist=sorted_boxes) + + +def prune_non_overlapping_masklist(masklist1, masklist2, minoverlap=0.0): + """Prunes the boxes in list1 that overlap less than thresh with list2. + + For each mask in masklist1, we want its IOA to be more than minoverlap + with at least one of the masks in masklist2. If it does not, we remove + it. If the masks are not full size image, we do the pruning based on boxes. + + Args: + masklist1: BoxMaskList holding N boxes and masks. + masklist2: BoxMaskList holding M boxes and masks. + minoverlap: Minimum required overlap between boxes, to count them as overlapping. + + Returns: + A pruned masklist with size [N', 4]. 
+ """ + intersection_over_area = ioa_masklist(masklist2, masklist1) # [M, N] tensor + intersection_over_area = np.amax(intersection_over_area, axis=0) # [N] tensor + keep_bool = np.greater_equal(intersection_over_area, np.array(minoverlap)) + keep_inds = np.nonzero(keep_bool)[0] + new_masklist1 = gather_masklist(masklist1, keep_inds) + return new_masklist1 + + +def concatenate_masklist(masklists, fields=None): + """Concatenate list of masklists. + + This op concatenates a list of input masklists into a larger + masklist. It also + handles concatenation of masklist fields as long as the field tensor + shapes are equal except for the first dimension. + + Args: + masklists: list of BoxMaskList objects + fields: optional list of fields to also concatenate. By default, all + fields from the first BoxMaskList in the list are included in the concatenation. + + Returns: + a masklist with number of boxes equal to sum([masklist.num_boxes() for masklist in masklist]) + Raises: + ValueError: if masklists is invalid (i.e., is not a list, is empty, or contains non + masklist objects), or if requested fields are not contained in all masklists + """ + if fields is not None: + if 'masks' not in fields: + fields.append('masks') + return boxlist_to_masklist(concatenate_boxlist(boxlists=masklists, fields=fields)) + + +def filter_scores_greater_than_masklist(masklist, thresh): + """Filter to keep only boxes and masks with score exceeding a given threshold. + + This op keeps the collection of boxes and masks whose corresponding scores are + greater than the input threshold. + + Args: + masklist: BoxMaskList holding N boxes and masks. Must contain a + 'scores' field representing detection scores. + thresh: scalar threshold + + Returns: + a BoxMaskList holding M boxes and masks where M <= N + + Raises: + ValueError: if masklist not a BoxMaskList object or if it does not have a scores field + """ + if not isinstance(masklist, MaskList): + raise ValueError('masklist must be a BoxMaskList') + if not masklist.has_field('scores'): + raise ValueError('input masklist must have \'scores\' field') + scores = masklist.get_field('scores') + if len(scores.shape) > 2: + raise ValueError('Scores should have rank 1 or 2') + if len(scores.shape) == 2 and scores.shape[1] != 1: + raise ValueError('Scores should have rank 1 or have shape consistent with [None, 1]') + high_score_indices = np.reshape(np.where(np.greater(scores, thresh)), [-1]).astype(np.int32) + return gather_masklist(masklist, high_score_indices) diff --git a/efficientdet/effdet/evaluation/object_detection_evaluation.py b/efficientdet/effdet/evaluation/object_detection_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..ee9211196f0493e0837f72b35b3542a1d882ef45 --- /dev/null +++ b/efficientdet/effdet/evaluation/object_detection_evaluation.py @@ -0,0 +1,273 @@ +import logging + +import numpy as np + +from effdet.evaluation.metrics import compute_precision_recall, compute_average_precision, compute_cor_loc +from effdet.evaluation.per_image_evaluation import PerImageEvaluation + + +class ObjectDetectionEvaluation: + """Internal implementation of Pascal object detection metrics.""" + + def __init__(self, + num_gt_classes, + matching_iou_threshold=0.5, + nms_iou_threshold=1.0, + nms_max_output_boxes=10000, + recall_lower_bound=0.0, + recall_upper_bound=1.0, + use_weighted_mean_ap=False, + label_id_offset=0, + group_of_weight=0.0, + per_image_eval_class=PerImageEvaluation): + """Constructor. 
+ Args: + num_gt_classes: Number of ground-truth classes. + matching_iou_threshold: IOU threshold used for matching detected boxes to ground-truth boxes. + nms_iou_threshold: IOU threshold used for non-maximum suppression. + nms_max_output_boxes: Maximum number of boxes returned by non-maximum suppression. + recall_lower_bound: lower bound of recall operating area + recall_upper_bound: upper bound of recall operating area + use_weighted_mean_ap: (optional) boolean which determines if the mean + average precision is computed directly from the scores and tp_fp_labels of all classes. + label_id_offset: The label id offset. + group_of_weight: Weight of group-of boxes.If set to 0, detections of the + correct class within a group-of box are ignored. If weight is > 0, then + if at least one detection falls within a group-of box with + matching_iou_threshold, weight group_of_weight is added to true + positives. Consequently, if no detection falls within a group-of box, + weight group_of_weight is added to false negatives. + per_image_eval_class: The class that contains functions for computing per image metrics. + Raises: + ValueError: if num_gt_classes is smaller than 1. + """ + if num_gt_classes < 1: + raise ValueError('Need at least 1 groundtruth class for evaluation.') + + self.per_image_eval = per_image_eval_class( + num_gt_classes=num_gt_classes, + matching_iou_threshold=matching_iou_threshold, + nms_iou_threshold=nms_iou_threshold, + nms_max_output_boxes=nms_max_output_boxes, + group_of_weight=group_of_weight) + self.recall_lower_bound = recall_lower_bound + self.recall_upper_bound = recall_upper_bound + self.group_of_weight = group_of_weight + self.num_class = num_gt_classes + self.use_weighted_mean_ap = use_weighted_mean_ap + self.label_id_offset = label_id_offset + + self.gt_boxes = {} + self.gt_class_labels = {} + self.gt_masks = {} + self.gt_is_difficult_list = {} + self.gt_is_group_of_list = {} + self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=float) + self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) + + self._initialize_detections() + + def _initialize_detections(self): + """Initializes internal data structures.""" + self.detection_keys = set() + self.scores_per_class = [[] for _ in range(self.num_class)] + self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] + self.num_images_correctly_detected_per_class = np.zeros(self.num_class) + self.average_precision_per_class = np.empty(self.num_class, dtype=float) + self.average_precision_per_class.fill(np.nan) + self.precisions_per_class = [np.nan] * self.num_class + self.recalls_per_class = [np.nan] * self.num_class + self.sum_tp_class = [np.nan] * self.num_class + + self.corloc_per_class = np.ones(self.num_class, dtype=float) + + def clear_detections(self): + self._initialize_detections() + + def add_single_ground_truth_image_info( + self, image_key, gt_boxes, gt_class_labels, + gt_is_difficult_list=None, gt_is_group_of_list=None, gt_masks=None): + """Adds groundtruth for a single image to be used for evaluation. + Args: + image_key: A unique string/integer identifier for the image. + gt_boxes: float32 numpy array of shape [num_boxes, 4] containing + `num_boxes` groundtruth boxes of the format [ymin, xmin, ymax, xmax] in absolute image coordinates. + gt_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed groundtruth classes for the boxes. + gt_is_difficult_list: A length M numpy boolean array denoting + whether a ground truth box is a difficult instance or not. 
To support + the case that no boxes are difficult, it is by default set as None. + gt_is_group_of_list: A length M numpy boolean array denoting + whether a ground truth box is a group-of box or not. To support the case + that no boxes are groups-of, it is by default set as None. + gt_masks: uint8 numpy array of shape [num_boxes, height, width] + containing `num_boxes` groundtruth masks. The mask values range from 0 to 1. + """ + if image_key in self.gt_boxes: + logging.warning('image %s has already been added to the ground truth database.', image_key) + return + + self.gt_boxes[image_key] = gt_boxes + self.gt_class_labels[image_key] = gt_class_labels + self.gt_masks[image_key] = gt_masks + if gt_is_difficult_list is None: + num_boxes = gt_boxes.shape[0] + gt_is_difficult_list = np.zeros(num_boxes, dtype=bool) + gt_is_difficult_list = gt_is_difficult_list.astype(dtype=bool) + self.gt_is_difficult_list[image_key] = gt_is_difficult_list + if gt_is_group_of_list is None: + num_boxes = gt_boxes.shape[0] + gt_is_group_of_list = np.zeros(num_boxes, dtype=bool) + if gt_masks is None: + num_boxes = gt_boxes.shape[0] + mask_presence_indicator = np.zeros(num_boxes, dtype=bool) + else: + mask_presence_indicator = (np.sum(gt_masks, axis=(1, 2)) == 0).astype(dtype=bool) + + gt_is_group_of_list = gt_is_group_of_list.astype(dtype=bool) + self.gt_is_group_of_list[image_key] = gt_is_group_of_list + + # ignore boxes without masks + masked_gt_is_difficult_list = gt_is_difficult_list | mask_presence_indicator + for class_index in range(self.num_class): + num_gt_instances = np.sum( + gt_class_labels[~masked_gt_is_difficult_list & ~gt_is_group_of_list] == class_index) + num_groupof_gt_instances = self.group_of_weight * np.sum( + gt_class_labels[gt_is_group_of_list & ~masked_gt_is_difficult_list] == class_index) + self.num_gt_instances_per_class[class_index] += num_gt_instances + num_groupof_gt_instances + if np.any(gt_class_labels == class_index): + self.num_gt_imgs_per_class[class_index] += 1 + + def add_single_detected_image_info( + self, image_key, detected_boxes, detected_scores, detected_class_labels, detected_masks=None): + """Adds detections for a single image to be used for evaluation. + Args: + image_key: A unique string/integer identifier for the image. + detected_boxes: float32 numpy array of shape [num_boxes, 4] containing + `num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + detected_scores: float32 numpy array of shape [num_boxes] containing + detection scores for the boxes. + detected_class_labels: integer numpy array of shape [num_boxes] containing + 0-indexed detection classes for the boxes. + detected_masks: np.uint8 numpy array of shape [num_boxes, height, width] + containing `num_boxes` detection masks with values ranging between 0 and 1. + Raises: + ValueError: if the number of boxes, scores and class labels differ in length. + """ + if len(detected_boxes) != len(detected_scores) or len(detected_boxes) != len(detected_class_labels): + raise ValueError( + 'detected_boxes, detected_scores and ' + 'detected_class_labels should all have same lengths. 
Got' + '[%d, %d, %d]' % len(detected_boxes), len(detected_scores), + len(detected_class_labels)) + + if image_key in self.detection_keys: + logging.warning('image %s has already been added to the detection result database', image_key) + return + + self.detection_keys.add(image_key) + if image_key in self.gt_boxes: + gt_boxes = self.gt_boxes[image_key] + gt_class_labels = self.gt_class_labels[image_key] + # Masks are popped instead of look up. The reason is that we do not want + # to keep all masks in memory which can cause memory overflow. + gt_masks = self.gt_masks.pop(image_key) + gt_is_difficult_list = self.gt_is_difficult_list[image_key] + gt_is_group_of_list = self.gt_is_group_of_list[image_key] + else: + gt_boxes = np.empty(shape=[0, 4], dtype=float) + gt_class_labels = np.array([], dtype=int) + if detected_masks is None: + gt_masks = None + else: + gt_masks = np.empty(shape=[0, 1, 1], dtype=float) + gt_is_difficult_list = np.array([], dtype=bool) + gt_is_group_of_list = np.array([], dtype=bool) + scores, tp_fp_labels, is_class_correctly_detected_in_image = \ + self.per_image_eval.compute_object_detection_metrics( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + gt_boxes=gt_boxes, + gt_class_labels=gt_class_labels, + gt_is_difficult_list=gt_is_difficult_list, + gt_is_group_of_list=gt_is_group_of_list, + detected_masks=detected_masks, + gt_masks=gt_masks) + + for i in range(self.num_class): + if scores[i].shape[0] > 0: + self.scores_per_class[i].append(scores[i]) + self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) + self.num_images_correctly_detected_per_class += is_class_correctly_detected_in_image + + def evaluate(self): + """Compute evaluation result. + Returns: + A dict with the following fields - + average_precision: float numpy array of average precision for each class. 
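A sketch of the intended calling convention for ObjectDetectionEvaluation: register groundtruth and detections per image key, then call evaluate(). The image key, boxes, and labels are invented, and the snippet assumes the rest of the per-image evaluation machinery shipped with this package; it is not part of the upstream file.

import numpy as np

evaluator = ObjectDetectionEvaluation(num_gt_classes=1, matching_iou_threshold=0.5)
evaluator.add_single_ground_truth_image_info(
    image_key='img0',
    gt_boxes=np.array([[10.0, 10.0, 50.0, 50.0]], dtype=np.float32),
    gt_class_labels=np.array([0], dtype=int))
evaluator.add_single_detected_image_info(
    image_key='img0',
    detected_boxes=np.array([[12.0, 12.0, 48.0, 48.0]], dtype=np.float32),
    detected_scores=np.array([0.9], dtype=np.float32),
    detected_class_labels=np.array([0], dtype=int))
metrics = evaluator.evaluate()
print(metrics['mean_ap'])   # expected 1.0: the lone detection matches its box at IoU > 0.5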
+ mean_ap: mean average precision of all classes, float scalar + precisions: List of precisions, each precision is a float numpy array + recalls: List of recalls, each recall is a float numpy array + corloc: numpy float array + mean_corloc: Mean CorLoc score for each class, float scalar + """ + if (self.num_gt_instances_per_class == 0).any(): + logging.warning( + 'The following classes have no ground truth examples: %s', + np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + self.label_id_offset) + + if self.use_weighted_mean_ap: + all_scores = np.array([], dtype=float) + all_tp_fp_labels = np.array([], dtype=bool) + for class_index in range(self.num_class): + if self.num_gt_instances_per_class[class_index] == 0: + continue + if not self.scores_per_class[class_index]: + scores = np.array([], dtype=float) + tp_fp_labels = np.array([], dtype=float) + else: + scores = np.concatenate(self.scores_per_class[class_index]) + tp_fp_labels = np.concatenate(self.tp_fp_labels_per_class[class_index]) + if self.use_weighted_mean_ap: + all_scores = np.append(all_scores, scores) + all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) + precision, recall = compute_precision_recall( + scores, tp_fp_labels, self.num_gt_instances_per_class[class_index]) + recall_within_bound_indices = [ + index for index, value in enumerate(recall) if + value >= self.recall_lower_bound and value <= self.recall_upper_bound + ] + recall_within_bound = recall[recall_within_bound_indices] + precision_within_bound = precision[recall_within_bound_indices] + + self.precisions_per_class[class_index] = precision_within_bound + self.recalls_per_class[class_index] = recall_within_bound + self.sum_tp_class[class_index] = tp_fp_labels.sum() + average_precision = compute_average_precision(precision_within_bound, recall_within_bound) + self.average_precision_per_class[class_index] = average_precision + logging.debug('average_precision: %f', average_precision) + + self.corloc_per_class = compute_cor_loc( + self.num_gt_imgs_per_class, self.num_images_correctly_detected_per_class) + + if self.use_weighted_mean_ap: + num_gt_instances = np.sum(self.num_gt_instances_per_class) + precision, recall = compute_precision_recall(all_scores, all_tp_fp_labels, num_gt_instances) + recall_within_bound_indices = [ + index for index, value in enumerate(recall) if + value >= self.recall_lower_bound and value <= self.recall_upper_bound + ] + recall_within_bound = recall[recall_within_bound_indices] + precision_within_bound = precision[recall_within_bound_indices] + mean_ap = compute_average_precision(precision_within_bound, recall_within_bound) + else: + mean_ap = np.nanmean(self.average_precision_per_class) + mean_corloc = np.nanmean(self.corloc_per_class) + + return dict( + per_class_ap=self.average_precision_per_class, mean_ap=mean_ap, + per_class_precision=self.precisions_per_class, + per_class_recall=self.recalls_per_class, + per_class_corlocs=self.corloc_per_class, mean_corloc=mean_corloc) diff --git a/efficientdet/effdet/evaluation/per_image_evaluation.py b/efficientdet/effdet/evaluation/per_image_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..e904027c5214b82d5729c5132914daff244c4c98 --- /dev/null +++ b/efficientdet/effdet/evaluation/per_image_evaluation.py @@ -0,0 +1,538 @@ +from .np_mask_list import * +from .metrics import * + + +class PerImageEvaluation: + """Evaluate detection result of a single image.""" + + def __init__(self, + num_gt_classes, + matching_iou_threshold=0.5, + 
nms_iou_threshold=0.3, + nms_max_output_boxes=50, + group_of_weight=0.0): + """Initialized PerImageEvaluation by evaluation parameters. + Args: + num_gt_classes: Number of ground truth object classes + matching_iou_threshold: A ratio of area intersection to union, which is + the threshold to consider whether a detection is true positive or not + nms_iou_threshold: IOU threshold used in Non Maximum Suppression. + nms_max_output_boxes: Number of maximum output boxes in NMS. + group_of_weight: Weight of the group-of boxes. + """ + self.matching_iou_threshold = matching_iou_threshold + self.nms_iou_threshold = nms_iou_threshold + self.nms_max_output_boxes = nms_max_output_boxes + self.num_gt_classes = num_gt_classes + self.group_of_weight = group_of_weight + + def compute_object_detection_metrics( + self, detected_boxes, detected_scores, detected_class_labels, + gt_boxes, gt_class_labels, gt_is_difficult_list, gt_is_group_of_list, + detected_masks=None, gt_masks=None): + """Evaluates detections as being tp, fp or weighted from a single image. + The evaluation is done in two stages: + 1. All detections are matched to non group-of boxes; true positives are + determined and detections matched to difficult boxes are ignored. + 2. Detections that are determined as false positives are matched against + group-of boxes and weighted if matched. + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing the + confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], repreneting + the class labels of the detected N object instances. + gt_boxes: A float numpy array of shape [M, 4], representing M + regions of object instances in ground truth + gt_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + gt_is_difficult_list: A boolean numpy array of length M denoting + whether a ground truth box is a difficult instance or not + gt_is_group_of_list: A boolean numpy array of length M denoting + whether a ground truth box has group-of tag + detected_masks: (optional) A uint8 numpy array of shape [N, height, + width]. If not None, the metrics will be computed based on masks. + gt_masks: (optional) A uint8 numpy array of shape [M, height, + width]. Can have empty masks, i.e. where all values are 0. + Returns: + scores: A list of C float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class label c + tp_fp_labels: A list of C boolean numpy arrays. 
Each numpy array + is of shape [K, 1], representing K True/False positive label of + object instances detected with class label c + is_class_correctly_detected_in_image: a numpy integer array of + shape [C, 1], indicating whether the correponding class has a least + one instance being correctly detected in the image + """ + detected_boxes, detected_scores, detected_class_labels, detected_masks = ( + self._remove_invalid_boxes(detected_boxes, detected_scores, detected_class_labels, detected_masks)) + + scores, tp_fp_labels = self._compute_tp_fp( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + gt_boxes=gt_boxes, + gt_class_labels=gt_class_labels, + gt_is_difficult_list=gt_is_difficult_list, + gt_is_group_of_list=gt_is_group_of_list, + detected_masks=detected_masks, + gt_masks=gt_masks) + + is_class_correctly_detected_in_image = self._compute_cor_loc( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + gt_boxes=gt_boxes, + gt_class_labels=gt_class_labels, + detected_masks=detected_masks, + gt_masks=gt_masks) + + return scores, tp_fp_labels, is_class_correctly_detected_in_image + + def _compute_cor_loc( + self, detected_boxes, detected_scores, detected_class_labels, + gt_boxes, gt_class_labels, detected_masks=None, gt_masks=None): + """Compute CorLoc score for object detection result. + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing the + confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], repreneting + the class labels of the detected N object instances. + gt_boxes: A float numpy array of shape [M, 4], representing M + regions of object instances in ground truth + gt_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + detected_masks: (optional) A uint8 numpy array of shape [N, height, width]. + If not None, the scores will be computed based on masks. + gt_masks: (optional) A uint8 numpy array of shape [M, height, width]. + Returns: + is_class_correctly_detected_in_image: a numpy integer array of + shape [C, 1], indicating whether the correponding class has a least + one instance being correctly detected in the image + Raises: + ValueError: If detected masks is not None but groundtruth masks are None, + or the other way around. 
+ """ + if (detected_masks is not None and gt_masks is None) or ( + detected_masks is None and gt_masks is not None): + raise ValueError( + 'If `detected_masks` is provided, then `gt_masks` should also be provided.') + + is_class_correctly_detected_in_image = np.zeros( + self.num_gt_classes, dtype=int) + for i in range(self.num_gt_classes): + (gt_boxes_at_ith_class, gt_masks_at_ith_class, + detected_boxes_at_ith_class, detected_scores_at_ith_class, + detected_masks_at_ith_class) = self._get_ith_class_arrays( + detected_boxes, detected_scores, detected_masks, + detected_class_labels, gt_boxes, gt_masks, + gt_class_labels, i) + is_class_correctly_detected_in_image[i] = ( + self._compute_is_class_correctly_detected_in_image( + detected_boxes=detected_boxes_at_ith_class, + detected_scores=detected_scores_at_ith_class, + gt_boxes=gt_boxes_at_ith_class, + detected_masks=detected_masks_at_ith_class, + gt_masks=gt_masks_at_ith_class)) + + return is_class_correctly_detected_in_image + + def _compute_is_class_correctly_detected_in_image( + self, detected_boxes, detected_scores, gt_boxes, detected_masks=None, gt_masks=None): + """Compute CorLoc score for a single class. + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected box coordinates + detected_scores: A 1-d numpy array of length N representing classification score + gt_boxes: A numpy array of shape [M, 4] representing ground truth box coordinates + detected_masks: (optional) A np.uint8 numpy array of shape [N, height, width]. + If not None, the scores will be computed based on masks. + gt_masks: (optional) A np.uint8 numpy array of shape [M, height, width]. + Returns: + is_class_correctly_detected_in_image: An integer 1 or 0 denoting whether a + class is correctly detected in the image or not + """ + if detected_boxes.size > 0: + if gt_boxes.size > 0: + max_score_id = np.argmax(detected_scores) + mask_mode = False + if detected_masks is not None and gt_masks is not None: + mask_mode = True + if mask_mode: + detected_boxlist = MaskList( + box_data=np.expand_dims(detected_boxes[max_score_id], axis=0), + mask_data=np.expand_dims(detected_masks[max_score_id], axis=0)) + gt_boxlist = MaskList(box_data=gt_boxes, mask_data=gt_masks) + iou = iou_masklist(detected_boxlist, gt_boxlist) + else: + detected_boxlist = BoxList(np.expand_dims(detected_boxes[max_score_id, :], axis=0)) + gt_boxlist = BoxList(gt_boxes) + iou = iou_boxlist(detected_boxlist, gt_boxlist) + if np.max(iou) >= self.matching_iou_threshold: + return 1 + return 0 + + def _compute_tp_fp( + self, detected_boxes, detected_scores, detected_class_labels, + gt_boxes, gt_class_labels, gt_is_difficult_list, gt_is_group_of_list, detected_masks=None, gt_masks=None): + """Labels true/false positives of detections of an image across all classes. + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing the + confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], representing + the class labels of the detected N object instances. 
+ gt_boxes: A float numpy array of shape [M, 4], representing M + regions of object instances in ground truth + gt_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + gt_is_difficult_list: A boolean numpy array of length M denoting + whether a ground truth box is a difficult instance or not + gt_is_group_of_list: A boolean numpy array of length M denoting + whether a ground truth box has group-of tag + detected_masks: (optional) A np.uint8 numpy array of shape [N, height, + width]. If not None, the scores will be computed based on masks. + gt_masks: (optional) A np.uint8 numpy array of shape [M, height, width]. + Returns: + result_scores: A list of float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class label c + result_tp_fp_labels: A list of boolean numpy array. Each numpy array is of + shape [K, 1], representing K True/False positive label of object + instances detected with class label c + Raises: + ValueError: If detected masks is not None but groundtruth masks are None, + or the other way around. + """ + if detected_masks is not None and gt_masks is None: + raise ValueError( + 'Detected masks is available but groundtruth masks is not.') + if detected_masks is None and gt_masks is not None: + raise ValueError( + 'Groundtruth masks is available but detected masks is not.') + + result_scores = [] + result_tp_fp_labels = [] + for i in range(self.num_gt_classes): + gt_is_difficult_list_at_ith_class = ( + gt_is_difficult_list[gt_class_labels == i]) + gt_is_group_of_list_at_ith_class = ( + gt_is_group_of_list[gt_class_labels == i]) + (gt_boxes_at_ith_class, gt_masks_at_ith_class, + detected_boxes_at_ith_class, detected_scores_at_ith_class, + detected_masks_at_ith_class) = self._get_ith_class_arrays( + detected_boxes, detected_scores, detected_masks, + detected_class_labels, gt_boxes, gt_masks, + gt_class_labels, i) + scores, tp_fp_labels = self._compute_tp_fp_for_single_class( + detected_boxes=detected_boxes_at_ith_class, + detected_scores=detected_scores_at_ith_class, + gt_boxes=gt_boxes_at_ith_class, + gt_is_difficult_list=gt_is_difficult_list_at_ith_class, + gt_is_group_of_list=gt_is_group_of_list_at_ith_class, + detected_masks=detected_masks_at_ith_class, + gt_masks=gt_masks_at_ith_class) + result_scores.append(scores) + result_tp_fp_labels.append(tp_fp_labels) + return result_scores, result_tp_fp_labels + + def _get_overlaps_and_scores_mask_mode( + self, detected_boxes, detected_scores, detected_masks, + gt_boxes, gt_masks, gt_is_group_of_list): + """Computes overlaps and scores between detected and groudntruth masks. + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected box coordinates + detected_scores: A 1-d numpy array of length N representing classification score + detected_masks: A uint8 numpy array of shape [N, height, width]. If not + None, the scores will be computed based on masks. + gt_boxes: A numpy array of shape [M, 4] representing ground truth box coordinates + gt_masks: A uint8 numpy array of shape [M, height, width]. + gt_is_group_of_list: A boolean numpy array of length M denoting + whether a ground truth box has group-of tag. If a groundtruth box is + group-of box, every detection matching this box is ignored. + Returns: + iou: A float numpy array of size [num_detected_boxes, num_gt_boxes]. If + gt_non_group_of_boxlist.num_boxes() == 0 it will be None. + ioa: A float numpy array of size [num_detected_boxes, num_gt_boxes]. 
If + gt_group_of_boxlist.num_boxes() == 0 it will be None. + scores: The score of the detected boxlist. + num_boxes: Number of non-maximum suppressed detected boxes. + """ + detected_boxlist = MaskList(box_data=detected_boxes, mask_data=detected_masks) + detected_boxlist.add_field('scores', detected_scores) + detected_boxlist = non_max_suppression(detected_boxlist, self.nms_max_output_boxes, self.nms_iou_threshold) + gt_non_group_of_boxlist = MaskList( + box_data=gt_boxes[~gt_is_group_of_list], mask_data=gt_masks[~gt_is_group_of_list]) + gt_group_of_boxlist = MaskList( + box_data=gt_boxes[gt_is_group_of_list], mask_data=gt_masks[gt_is_group_of_list]) + iou_b = iou_masklist(detected_boxlist, gt_non_group_of_boxlist) + ioa_b = np.transpose(ioa_masklist(gt_group_of_boxlist, detected_boxlist)) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou_b, ioa_b, scores, num_boxes + + def _get_overlaps_and_scores_box_mode( + self, detected_boxes, detected_scores, gt_boxes, gt_is_group_of_list): + """Computes overlaps and scores between detected and groudntruth boxes. + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected box coordinates + detected_scores: A 1-d numpy array of length N representing classification score + gt_boxes: A numpy array of shape [M, 4] representing ground truth box coordinates + gt_is_group_of_list: A boolean numpy array of length M denoting + whether a ground truth box has group-of tag. If a groundtruth box is + group-of box, every detection matching this box is ignored. + Returns: + iou: A float numpy array of size [num_detected_boxes, num_gt_boxes]. If + gt_non_group_of_boxlist.num_boxes() == 0 it will be None. + ioa: A float numpy array of size [num_detected_boxes, num_gt_boxes]. If + gt_group_of_boxlist.num_boxes() == 0 it will be None. + scores: The score of the detected boxlist. + num_boxes: Number of non-maximum suppressed detected boxes. + """ + detected_boxlist = BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + detected_boxlist = non_max_suppression(detected_boxlist, self.nms_max_output_boxes, self.nms_iou_threshold) + gt_non_group_of_boxlist = BoxList(gt_boxes[~gt_is_group_of_list]) + gt_group_of_boxlist = BoxList(gt_boxes[gt_is_group_of_list]) + iou_b = iou_boxlist(detected_boxlist, gt_non_group_of_boxlist) + ioa_b = np.transpose(ioa_boxlist(gt_group_of_boxlist, detected_boxlist)) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou_b, ioa_b, scores, num_boxes + + def _compute_tp_fp_for_single_class( + self, detected_boxes, detected_scores, gt_boxes, + gt_is_difficult_list, gt_is_group_of_list, detected_masks=None, gt_masks=None): + """Labels boxes detected with the same class from the same image as tp/fp. + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected box coordinates + detected_scores: A 1-d numpy array of length N representing classification score + gt_boxes: A numpy array of shape [M, 4] representing ground truth box coordinates + gt_is_difficult_list: A boolean numpy array of length M denoting + whether a ground truth box is a difficult instance or not. If a + groundtruth box is difficult, every detection matching this box is ignored. + gt_is_group_of_list: A boolean numpy array of length M denoting + whether a ground truth box has group-of tag. If a groundtruth box is + group-of box, every detection matching this box is ignored. 
+ detected_masks: (optional) A uint8 numpy array of shape [N, height, + width]. If not None, the scores will be computed based on masks. + gt_masks: (optional) A uint8 numpy array of shape [M, height, width]. + Returns: + Two arrays of the same size, containing all boxes that were evaluated as + being true positives or false positives; if a box matched to a difficult + box or to a group-of box, it is ignored. + scores: A numpy array representing the detection scores. + tp_fp_labels: a boolean numpy array indicating whether a detection is a true positive. + """ + if detected_boxes.size == 0: + return np.array([], dtype=float), np.array([], dtype=bool) + + mask_mode = False + if detected_masks is not None and gt_masks is not None: + mask_mode = True + + iou_b = np.ndarray([0, 0]) + ioa_b = np.ndarray([0, 0]) + iou_m = np.ndarray([0, 0]) + ioa_m = np.ndarray([0, 0]) + if mask_mode: + # For Instance Segmentation Evaluation on Open Images V5, not all boxed + # instances have corresponding segmentation annotations. Those boxes that + # dont have segmentation annotations are represented as empty masks in + # gt_masks nd array. + mask_presence_indicator = (np.sum(gt_masks, axis=(1, 2)) > 0) + + iou_m, ioa_m, scores, num_detected_boxes = self._get_overlaps_and_scores_mask_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_masks=detected_masks, + gt_boxes=gt_boxes[mask_presence_indicator, :], + gt_masks=gt_masks[mask_presence_indicator, :], + gt_is_group_of_list=gt_is_group_of_list[mask_presence_indicator]) + + if sum(mask_presence_indicator) < len(mask_presence_indicator): + # Not all masks are present - some masks are empty + iou_b, ioa_b, _, num_detected_boxes = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + gt_boxes=gt_boxes[~mask_presence_indicator, :], + gt_is_group_of_list=gt_is_group_of_list[~mask_presence_indicator]) + num_detected_boxes = detected_boxes.shape[0] + else: + mask_presence_indicator = np.zeros(gt_is_group_of_list.shape, dtype=bool) + iou_b, ioa_b, scores, num_detected_boxes = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + gt_boxes=gt_boxes, + gt_is_group_of_list=gt_is_group_of_list) + + if gt_boxes.size == 0: + return scores, np.zeros(num_detected_boxes, dtype=bool) + + tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_box = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_difficult = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_group_of = np.zeros(num_detected_boxes, dtype=bool) + + def compute_match_iou(iou_matrix, gt_nongroup_of_is_difficult_list, is_box): + """Computes TP/FP for non group-of box matching. + The function updates the following local variables: + tp_fp_labels - if a box is matched to group-of + is_matched_to_difficult - the detections that were processed at this are + matched to difficult box. + is_matched_to_box - the detections that were processed at this stage are marked as is_box. + Args: + iou_matrix: intersection-over-union matrix [num_gt_boxes]x[num_det_boxes]. + gt_nongroup_of_is_difficult_list: boolean that specifies if gt box is difficult. + is_box: boolean that specifies if currently boxes or masks are processed. 
+ """ + max_overlap_gt_ids = np.argmax(iou_matrix, axis=1) + is_gt_detected = np.zeros(iou_matrix.shape[1], dtype=bool) + for i in range(num_detected_boxes): + gt_id = max_overlap_gt_ids[i] + is_evaluatable = ( + not tp_fp_labels[i] and + not is_matched_to_difficult[i] and + iou_matrix[i, gt_id] >= self.matching_iou_threshold and + not is_matched_to_group_of[i]) + if is_evaluatable: + if not gt_nongroup_of_is_difficult_list[gt_id]: + if not is_gt_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_detected[gt_id] = True + is_matched_to_box[i] = is_box + else: + is_matched_to_difficult[i] = True + + def compute_match_ioa(ioa_matrix, is_box): + """Computes TP/FP for group-of box matching. + The function updates the following local variables: + is_matched_to_group_of - if a box is matched to group-of + is_matched_to_box - the detections that were processed at this stage are marked as is_box. + Args: + ioa_matrix: intersection-over-area matrix [num_gt_boxes]x[num_det_boxes]. + is_box: boolean that specifies if currently boxes or masks are processed. + Returns: + scores_group_of: of detections matched to group-of boxes[num_groupof_matched]. + tp_fp_labels_group_of: boolean array of size [num_groupof_matched], all values are True. + """ + scores_group_of = np.zeros(ioa_matrix.shape[1], dtype=float) + tp_fp_labels_group_of = self.group_of_weight * np.ones(ioa_matrix.shape[1], dtype=float) + max_overlap_group_of_gt_ids = np.argmax(ioa_matrix, axis=1) + for i in range(num_detected_boxes): + gt_id = max_overlap_group_of_gt_ids[i] + is_evaluatable = ( + not tp_fp_labels[i] and + not is_matched_to_difficult[i] and + ioa_matrix[i, gt_id] >= self.matching_iou_threshold and + not is_matched_to_group_of[i]) + if is_evaluatable: + is_matched_to_group_of[i] = True + is_matched_to_box[i] = is_box + scores_group_of[gt_id] = max(scores_group_of[gt_id], scores[i]) + selector = np.where((scores_group_of > 0) & (tp_fp_labels_group_of > 0)) + scores_group_of = scores_group_of[selector] + tp_fp_labels_group_of = tp_fp_labels_group_of[selector] + + return scores_group_of, tp_fp_labels_group_of + + # The evaluation is done in two stages: + # 1. Evaluate all objects that actually have instance level masks. + # 2. Evaluate all objects that are not already evaluated as boxes. + if iou_m.shape[1] > 0: + gt_is_difficult_mask_list = gt_is_difficult_list[mask_presence_indicator] + gt_is_group_of_mask_list = gt_is_group_of_list[mask_presence_indicator] + compute_match_iou(iou_m, gt_is_difficult_mask_list[~gt_is_group_of_mask_list], is_box=False) + + scores_mask_group_of = np.ndarray([0], dtype=float) + tp_fp_labels_mask_group_of = np.ndarray([0], dtype=float) + if ioa_m.shape[1] > 0: + scores_mask_group_of, tp_fp_labels_mask_group_of = compute_match_ioa(ioa_m, is_box=False) + + # Tp-fp evaluation for non-group of boxes (if any). + if iou_b.shape[1] > 0: + gt_is_difficult_box_list = gt_is_difficult_list[~mask_presence_indicator] + gt_is_group_of_box_list = gt_is_group_of_list[~mask_presence_indicator] + compute_match_iou(iou_b, gt_is_difficult_box_list[~gt_is_group_of_box_list], is_box=True) + + scores_box_group_of = np.ndarray([0], dtype=float) + tp_fp_labels_box_group_of = np.ndarray([0], dtype=float) + if ioa_b.shape[1] > 0: + scores_box_group_of, tp_fp_labels_box_group_of = compute_match_ioa(ioa_b, is_box=True) + + if mask_mode: + # Note: here crowds are treated as ignore regions. 
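+            # keep only detections not matched to difficult boxes, group-of regions, or
+            # box-only ground truth (instances without mask annotations); group-of matches
+            # are appended separately with group_of_weight as their soft tp/fp label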
+ valid_entries = (~is_matched_to_difficult & ~is_matched_to_group_of & ~is_matched_to_box) + return np.concatenate((scores[valid_entries], scores_mask_group_of)),\ + np.concatenate((tp_fp_labels[valid_entries].astype(float), tp_fp_labels_mask_group_of)) + else: + valid_entries = (~is_matched_to_difficult & ~is_matched_to_group_of) + return np.concatenate((scores[valid_entries], scores_box_group_of)),\ + np.concatenate((tp_fp_labels[valid_entries].astype(float), tp_fp_labels_box_group_of)) + + def _get_ith_class_arrays( + self, detected_boxes, detected_scores, detected_masks, detected_class_labels, + gt_boxes, gt_masks, gt_class_labels, class_index): + """Returns numpy arrays belonging to class with index `class_index`. + Args: + detected_boxes: A numpy array containing detected boxes. + detected_scores: A numpy array containing detected scores. + detected_masks: A numpy array containing detected masks. + detected_class_labels: A numpy array containing detected class labels. + gt_boxes: A numpy array containing groundtruth boxes. + gt_masks: A numpy array containing groundtruth masks. + gt_class_labels: A numpy array containing groundtruth class labels. + class_index: An integer index. + Returns: + gt_boxes_at_ith_class: A numpy array containing groundtruth boxes labeled as ith class. + gt_masks_at_ith_class: A numpy array containing groundtruth masks labeled as ith class. + detected_boxes_at_ith_class: A numpy array containing detected boxes corresponding to the ith class. + detected_scores_at_ith_class: A numpy array containing detected scores corresponding to the ith class. + detected_masks_at_ith_class: A numpy array containing detected masks corresponding to the ith class. + """ + selected_groundtruth = (gt_class_labels == class_index) + gt_boxes_at_ith_class = gt_boxes[selected_groundtruth] + if gt_masks is not None: + gt_masks_at_ith_class = gt_masks[selected_groundtruth] + else: + gt_masks_at_ith_class = None + selected_detections = (detected_class_labels == class_index) + detected_boxes_at_ith_class = detected_boxes[selected_detections] + detected_scores_at_ith_class = detected_scores[selected_detections] + if detected_masks is not None: + detected_masks_at_ith_class = detected_masks[selected_detections] + else: + detected_masks_at_ith_class = None + return (gt_boxes_at_ith_class, gt_masks_at_ith_class, + detected_boxes_at_ith_class, detected_scores_at_ith_class, + detected_masks_at_ith_class) + + def _remove_invalid_boxes( + self, detected_boxes, detected_scores, detected_class_labels, detected_masks=None): + """Removes entries with invalid boxes. + A box is invalid if either its xmax is smaller than its xmin, or its ymax is smaller than its ymin. + Args: + detected_boxes: A float numpy array of size [num_boxes, 4] containing box + coordinates in [ymin, xmin, ymax, xmax] format. + detected_scores: A float numpy array of size [num_boxes]. + detected_class_labels: A int32 numpy array of size [num_boxes]. + detected_masks: A uint8 numpy array of size [num_boxes, height, width]. + Returns: + valid_detected_boxes: A float numpy array of size [num_valid_boxes, 4] + containing box coordinates in [ymin, xmin, ymax, xmax] format. + valid_detected_scores: A float numpy array of size [num_valid_boxes]. + valid_detected_class_labels: A int32 numpy array of size [num_valid_boxes]. + valid_detected_masks: A uint8 numpy array of size [num_valid_boxes, height, width]. 
+ """ + valid_indices = np.logical_and( + detected_boxes[:, 0] < detected_boxes[:, 2], detected_boxes[:, 1] < detected_boxes[:, 3]) + detected_boxes = detected_boxes[valid_indices] + detected_scores = detected_scores[valid_indices] + detected_class_labels = detected_class_labels[valid_indices] + if detected_masks is not None: + detected_masks = detected_masks[valid_indices] + return [detected_boxes, detected_scores, detected_class_labels, detected_masks] + + diff --git a/efficientdet/effdet/evaluator.py b/efficientdet/effdet/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..f32e673ea33e11a9bb45cc6d40c53667d4485408 --- /dev/null +++ b/efficientdet/effdet/evaluator.py @@ -0,0 +1,195 @@ +import torch +import torch.distributed as dist +import abc +import json +import logging +import time +import numpy as np + +from .distributed import synchronize, is_main_process, all_gather_container +from pycocotools.cocoeval import COCOeval + +# FIXME experimenting with speedups for OpenImages eval, it's slow +#import pyximport; py_importer, pyx_importer = pyximport.install(pyimport=True) +import effdet.evaluation.detection_evaluator as tfm_eval +#pyximport.uninstall(py_importer, pyx_importer) + +_logger = logging.getLogger(__name__) + + +__all__ = ['CocoEvaluator', 'PascalEvaluator', 'OpenImagesEvaluator', 'create_evaluator'] + + +class Evaluator: + + def __init__(self, distributed=False, pred_yxyx=False): + self.distributed = distributed + self.distributed_device = None + self.pred_yxyx = pred_yxyx + self.img_indices = [] + self.predictions = [] + + def add_predictions(self, detections, target): + if self.distributed: + if self.distributed_device is None: + # cache for use later to broadcast end metric + self.distributed_device = detections.device + synchronize() + detections = all_gather_container(detections) + img_indices = all_gather_container(target['img_idx']) + if not is_main_process(): + return + else: + img_indices = target['img_idx'] + + detections = detections.cpu().numpy() + img_indices = img_indices.cpu().numpy() + for img_idx, img_dets in zip(img_indices, detections): + self.img_indices.append(img_idx) + self.predictions.append(img_dets) + + def _coco_predictions(self): + # generate coco-style predictions + coco_predictions = [] + coco_ids = [] + for img_idx, img_dets in zip(self.img_indices, self.predictions): + img_id = self._dataset.img_ids[img_idx] + coco_ids.append(img_id) + if self.pred_yxyx: + # to xyxy + img_dets[:, 0:4] = img_dets[:, [1, 0, 3, 2]] + # to xywh + img_dets[:, 2] -= img_dets[:, 0] + img_dets[:, 3] -= img_dets[:, 1] + for det in img_dets: + score = float(det[4]) + if score < .001: # stop when below this threshold, scores in descending order + break + coco_det = dict( + image_id=int(img_id), + bbox=det[0:4].tolist(), + score=score, + category_id=int(det[5])) + coco_predictions.append(coco_det) + return coco_predictions, coco_ids + + @abc.abstractmethod + def evaluate(self): + pass + + def save(self, result_file): + # save results in coco style, override to save in a alternate form + if not self.distributed or dist.get_rank() == 0: + assert len(self.predictions) + coco_predictions, coco_ids = self._coco_predictions() + json.dump(coco_predictions, open(result_file, 'w'), indent=4) + + +class CocoEvaluator(Evaluator): + + def __init__(self, dataset, neptune=None, distributed=False, pred_yxyx=False): + super().__init__(distributed=distributed, pred_yxyx=pred_yxyx) + self._dataset = dataset.parser + self.coco_api = dataset.parser.coco + 
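+        # optional Neptune run handle; when provided, evaluate() also logs the COCO mAP
+        # metrics (0.5-0.95 IoU and 0.5 IoU) to the experiment tracker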
self.neptune = neptune + + def reset(self): + self.img_indices = [] + self.predictions = [] + + def evaluate(self): + if not self.distributed or dist.get_rank() == 0: + assert len(self.predictions) + coco_predictions, coco_ids = self._coco_predictions() + json.dump(coco_predictions, open('./temp.json', 'w'), indent=4) + results = self.coco_api.loadRes('./temp.json') + coco_eval = COCOeval(self.coco_api, results, 'bbox') + coco_eval.params.imgIds = coco_ids # score only ids we've used + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + metric = coco_eval.stats[0] # mAP 0.5-0.95 + if self.neptune: + self.neptune.log_metric('valid/mAP/0.5-0.95IOU', metric) + self.neptune.log_metric('valid/mAP/0.5IOU', coco_eval.stats[1]) + if self.distributed: + dist.broadcast(torch.tensor(metric, device=self.distributed_device), 0) + else: + metric = torch.tensor(0, device=self.distributed_device) + dist.broadcast(metric, 0) + metric = metric.item() + self.reset() + return metric + + +class TfmEvaluator(Evaluator): + """ Tensorflow Models Evaluator Wrapper """ + def __init__( + self, dataset, neptune=None, distributed=False, pred_yxyx=False, + evaluator_cls=tfm_eval.ObjectDetectionEvaluator): + super().__init__(distributed=distributed, pred_yxyx=pred_yxyx) + self._evaluator = evaluator_cls(categories=dataset.parser.cat_dicts) + self._eval_metric_name = self._evaluator._metric_names[0] + self._dataset = dataset.parser + self.neptune = neptune + + def reset(self): + self._evaluator.clear() + self.img_indices = [] + self.predictions = [] + + def evaluate(self): + if not self.distributed or dist.get_rank() == 0: + for img_idx, img_dets in zip(self.img_indices, self.predictions): + gt = self._dataset.get_ann_info(img_idx) + self._evaluator.add_single_ground_truth_image_info(img_idx, gt) + + bbox = img_dets[:, 0:4] if self.pred_yxyx else img_dets[:, [1, 0, 3, 2]] + det = dict(bbox=bbox, score=img_dets[:, 4], cls=img_dets[:, 5]) + self._evaluator.add_single_detected_image_info(img_idx, det) + + metrics = self._evaluator.evaluate() + _logger.info('Metrics:') + for k, v in metrics.items(): + _logger.info(f'{k}: {v}') + if self.neptune: + key = 'valid/mAP/' + str(k).split('/')[-1] + self.neptune.log_metric(key, v) + + map_metric = metrics[self._eval_metric_name] + if self.distributed: + dist.broadcast(torch.tensor(map_metric, device=self.distributed_device), 0) + else: + map_metric = torch.tensor(0, device=self.distributed_device) + wait = dist.broadcast(map_metric, 0, async_op=True) + while not wait.is_completed(): + # wait without spinning the cpu @ 100%, no need for low latency here + time.sleep(0.5) + map_metric = map_metric.item() + self.reset() + return map_metric + + +class PascalEvaluator(TfmEvaluator): + + def __init__(self, dataset, neptune=None, distributed=False, pred_yxyx=False): + super().__init__( + dataset, neptune, distributed=distributed, pred_yxyx=pred_yxyx, evaluator_cls=tfm_eval.PascalDetectionEvaluator) + + +class OpenImagesEvaluator(TfmEvaluator): + + def __init__(self, dataset, distributed=False, pred_yxyx=False): + super().__init__( + dataset, distributed=distributed, pred_yxyx=pred_yxyx, evaluator_cls=tfm_eval.OpenImagesDetectionEvaluator) + + +def create_evaluator(name, dataset, neptune=None, distributed=False, pred_yxyx=False): + # FIXME support OpenImages Challenge2019 metric w/ image level label consideration + if 'coco' in name: + return CocoEvaluator(dataset, neptune, distributed=distributed, pred_yxyx=pred_yxyx) + elif 'openimages' in name: + return 
OpenImagesEvaluator(dataset, distributed=distributed, pred_yxyx=pred_yxyx) + else: + return CocoEvaluator(dataset, neptune, distributed=distributed, pred_yxyx=pred_yxyx) + #return PascalEvaluator(dataset, neptune, distributed=distributed, pred_yxyx=pred_yxyx) diff --git a/efficientdet/effdet/factory.py b/efficientdet/effdet/factory.py new file mode 100644 index 0000000000000000000000000000000000000000..dbf8abf4f229cfe3cf503805183ff2a8bf0e2973 --- /dev/null +++ b/efficientdet/effdet/factory.py @@ -0,0 +1,54 @@ +from .efficientdet import EfficientDet, HeadNet +from .bench import DetBenchTrain, DetBenchPredict +from .config import get_efficientdet_config +from .helpers import load_pretrained, load_checkpoint + + +def create_model( + model_name, bench_task='', num_classes=None, pretrained=False, + checkpoint_path='', checkpoint_ema=False, **kwargs): + + config = get_efficientdet_config(model_name) + return create_model_from_config( + config, bench_task=bench_task, num_classes=num_classes, pretrained=pretrained, + checkpoint_path=checkpoint_path, checkpoint_ema=checkpoint_ema, **kwargs) + + +def create_model_from_config( + config, bench_task='', num_classes=None, pretrained=False, + checkpoint_path='', checkpoint_ema=False, **kwargs): + + pretrained_backbone = kwargs.pop('pretrained_backbone', True) + if pretrained or checkpoint_path: + pretrained_backbone = False # no point in loading backbone weights + + # Config overrides, override some config values via kwargs. + overrides = ('redundant_bias', 'label_smoothing', 'new_focal', 'jit_loss') + for ov in overrides: + value = kwargs.pop(ov, None) + if value is not None: + setattr(config, ov, value) + + labeler = kwargs.pop('bench_labeler', False) + + # create the base model + model = EfficientDet(config, pretrained_backbone=pretrained_backbone, **kwargs) + + # pretrained weights are always spec'd for original config, load them before we change the model + if pretrained: + load_pretrained(model, config.url) + + # reset model head if num_classes doesn't match configs + if num_classes is not None and num_classes != config.num_classes: + model.reset_head(num_classes=num_classes) + + # load an argument specified training checkpoint + if checkpoint_path: + load_checkpoint(model, checkpoint_path, use_ema=checkpoint_ema) + + # wrap model in task specific training/prediction bench if set + if bench_task == 'train': + model = DetBenchTrain(model, create_labeler=labeler) + elif bench_task == 'predict': + model = DetBenchPredict(model) + return model diff --git a/efficientdet/effdet/helpers.py b/efficientdet/effdet/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..597e65df2eb60424ff783156fad90e78be7cb9b9 --- /dev/null +++ b/efficientdet/effdet/helpers.py @@ -0,0 +1,22 @@ +import torch +import os +import logging +from collections import OrderedDict + +from timm.models import load_checkpoint + +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + + +def load_pretrained(model, url, filter_fn=None, strict=True): + if not url: + logging.warning("Pretrained model URL is empty, using random initialization. 
" + "Did you intend to use a `tf_` variant of the model?") + return + state_dict = load_state_dict_from_url(url, progress=False, map_location='cpu') + if filter_fn is not None: + state_dict = filter_fn(state_dict) + model.load_state_dict(state_dict, strict=strict) diff --git a/efficientdet/effdet/loss.py b/efficientdet/effdet/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..44ddca31f03b27ba86d6a1c955cc85cd722e08ae --- /dev/null +++ b/efficientdet/effdet/loss.py @@ -0,0 +1,259 @@ +""" EfficientDet Focal, Huber/Smooth L1 loss fns w/ jit support + +Based on loss fn in Google's automl EfficientDet repository (Apache 2.0 license). +https://github.com/google/automl/tree/master/efficientdet + +Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional, List, Tuple + + +def focal_loss_legacy(logits, targets, alpha: float, gamma: float, normalizer): + """Compute the focal loss between `logits` and the golden `target` values. + + 'Legacy focal loss matches the loss used in the official Tensorflow impl for initial + model releases and some time after that. It eventually transitioned to the 'New' loss + defined below. + + Focal loss = -(1-pt)^gamma * log(pt) + where pt is the probability of being classified to the true class. + + Args: + logits: A float32 tensor of size [batch, height_in, width_in, num_predictions]. + + targets: A float32 tensor of size [batch, height_in, width_in, num_predictions]. + + alpha: A float32 scalar multiplying alpha to the loss from positive examples + and (1-alpha) to the loss from negative examples. + + gamma: A float32 scalar modulating loss from hard and easy examples. + + normalizer: A float32 scalar normalizes the total loss from all examples. + + Returns: + loss: A float32 scalar representing normalized total loss. + """ + positive_label_mask = targets == 1.0 + cross_entropy = F.binary_cross_entropy_with_logits(logits, targets.to(logits.dtype), reduction='none') + neg_logits = -1.0 * logits + modulator = torch.exp(gamma * targets * neg_logits - gamma * torch.log1p(torch.exp(neg_logits))) + + loss = modulator * cross_entropy + weighted_loss = torch.where(positive_label_mask, alpha * loss, (1.0 - alpha) * loss) + return weighted_loss / normalizer + + +def new_focal_loss(logits, targets, alpha: float, gamma: float, normalizer, label_smoothing: float = 0.01): + """Compute the focal loss between `logits` and the golden `target` values. + + 'New' is not the best descriptor, but this focal loss impl matches recent versions of + the official Tensorflow impl of EfficientDet. It has support for label smoothing, however + it is a bit slower, doesn't jit optimize well, and uses more memory. + + Focal loss = -(1-pt)^gamma * log(pt) + where pt is the probability of being classified to the true class. + Args: + logits: A float32 tensor of size [batch, height_in, width_in, num_predictions]. + targets: A float32 tensor of size [batch, height_in, width_in, num_predictions]. + alpha: A float32 scalar multiplying alpha to the loss from positive examples + and (1-alpha) to the loss from negative examples. + gamma: A float32 scalar modulating loss from hard and easy examples. + normalizer: Divide loss by this value. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. + Returns: + loss: A float32 scalar representing normalized total loss. + """ + # compute focal loss multipliers before label smoothing, such that it will not blow up the loss. 
+ pred_prob = logits.sigmoid() + targets = targets.to(logits.dtype) + onem_targets = 1. - targets + p_t = (targets * pred_prob) + (onem_targets * (1. - pred_prob)) + alpha_factor = targets * alpha + onem_targets * (1. - alpha) + modulating_factor = (1. - p_t) ** gamma + + # apply label smoothing for cross_entropy for each entry. + if label_smoothing > 0.: + targets = targets * (1. - label_smoothing) + .5 * label_smoothing + ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none') + + # compute the final loss and return + return (1 / normalizer) * alpha_factor * modulating_factor * ce + + +def huber_loss( + input, target, delta: float = 1., weights: Optional[torch.Tensor] = None, size_average: bool = True): + """ + """ + err = input - target + abs_err = err.abs() + quadratic = torch.clamp(abs_err, max=delta) + linear = abs_err - quadratic + loss = 0.5 * quadratic.pow(2) + delta * linear + if weights is not None: + loss *= weights + if size_average: + return loss.mean() + else: + return loss.sum() + + +def smooth_l1_loss( + input, target, beta: float = 1. / 9, weights: Optional[torch.Tensor] = None, size_average: bool = True): + """ + very similar to the smooth_l1_loss from pytorch, but with the extra beta parameter + """ + if beta < 1e-5: + # if beta == 0, then torch.where will result in nan gradients when + # the chain rule is applied due to pytorch implementation details + # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of + # zeros, rather than "no gradient"). To avoid this issue, we define + # small values of beta to be exactly l1 loss. + loss = torch.abs(input - target) + else: + err = torch.abs(input - target) + loss = torch.where(err < beta, 0.5 * err.pow(2) / beta, err - 0.5 * beta) + if weights is not None: + loss *= weights + if size_average: + return loss.mean() + else: + return loss.sum() + + +def _box_loss(box_outputs, box_targets, num_positives, delta: float = 0.1): + """Computes box regression loss.""" + # delta is typically around the mean value of regression target. + # for instances, the regression targets of 512x512 input with 6 anchors on + # P3-P7 pyramid is about [0.1, 0.1, 0.2, 0.2]. + normalizer = num_positives * 4.0 + mask = box_targets != 0.0 + box_loss = huber_loss(box_outputs, box_targets, weights=mask, delta=delta, size_average=False) + return box_loss / normalizer + + +def one_hot(x, num_classes: int): + # NOTE: PyTorch one-hot does not handle -ve entries (no hot) like Tensorflow, so mask them out + x_non_neg = (x >= 0).unsqueeze(-1) + onehot = torch.zeros(x.shape + (num_classes,), device=x.device, dtype=torch.float32) + return onehot.scatter(-1, x.unsqueeze(-1) * x_non_neg, 1) * x_non_neg + + +def loss_fn( + cls_outputs: List[torch.Tensor], + box_outputs: List[torch.Tensor], + cls_targets: List[torch.Tensor], + box_targets: List[torch.Tensor], + num_positives: torch.Tensor, + num_classes: int, + alpha: float, + gamma: float, + delta: float, + box_loss_weight: float, + label_smoothing: float = 0., + new_focal: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Computes total detection loss. + Computes total detection loss including box and class loss from all levels. + Args: + cls_outputs: a List with values representing logits in [batch_size, height, width, num_anchors]. + at each feature level (index) + + box_outputs: a List with values representing box regression targets in + [batch_size, height, width, num_anchors * 4] at each feature level (index) + + cls_targets: groundtruth class targets. 
+ + box_targets: groundtrusth box targets. + + num_positives: num positive grountruth anchors + + Returns: + total_loss: an integer tensor representing total loss reducing from class and box losses from all levels. + + cls_loss: an integer tensor representing total class loss. + + box_loss: an integer tensor representing total box regression loss. + """ + # Sum all positives in a batch for normalization and avoid zero + # num_positives_sum, which would lead to inf loss during training + num_positives_sum = (num_positives.sum() + 1.0).float() + levels = len(cls_outputs) + + cls_losses = [] + box_losses = [] + for l in range(levels): + cls_targets_at_level = cls_targets[l] + box_targets_at_level = box_targets[l] + + # Onehot encoding for classification labels. + cls_targets_at_level_oh = one_hot(cls_targets_at_level, num_classes) + + bs, height, width, _, _ = cls_targets_at_level_oh.shape + cls_targets_at_level_oh = cls_targets_at_level_oh.view(bs, height, width, -1) + cls_outputs_at_level = cls_outputs[l].permute(0, 2, 3, 1).float() + if new_focal: + cls_loss = new_focal_loss( + cls_outputs_at_level, cls_targets_at_level_oh, + alpha=alpha, gamma=gamma, normalizer=num_positives_sum, label_smoothing=label_smoothing) + else: + cls_loss = focal_loss_legacy( + cls_outputs_at_level, cls_targets_at_level_oh, + alpha=alpha, gamma=gamma, normalizer=num_positives_sum) + cls_loss = cls_loss.view(bs, height, width, -1, num_classes) + cls_loss = cls_loss * (cls_targets_at_level != -2).unsqueeze(-1) + cls_losses.append(cls_loss.sum()) # FIXME reference code added a clamp here at some point ...clamp(0, 2)) + + box_losses.append(_box_loss( + box_outputs[l].permute(0, 2, 3, 1).float(), + box_targets_at_level, + num_positives_sum, + delta=delta)) + + # Sum per level losses to total loss. + cls_loss = torch.sum(torch.stack(cls_losses, dim=-1), dim=-1) + box_loss = torch.sum(torch.stack(box_losses, dim=-1), dim=-1) + total_loss = cls_loss + box_loss_weight * box_loss + return total_loss, cls_loss, box_loss + + +loss_jit = torch.jit.script(loss_fn) + + +class DetectionLoss(nn.Module): + + __constants__ = ['num_classes'] + + def __init__(self, config): + super(DetectionLoss, self).__init__() + self.config = config + self.num_classes = config.num_classes + self.alpha = config.alpha + self.gamma = config.gamma + self.delta = config.delta + self.box_loss_weight = config.box_loss_weight + self.label_smoothing = config.label_smoothing + self.new_focal = config.new_focal + self.use_jit = config.jit_loss + + def forward( + self, + cls_outputs: List[torch.Tensor], + box_outputs: List[torch.Tensor], + cls_targets: List[torch.Tensor], + box_targets: List[torch.Tensor], + num_positives: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + + l_fn = loss_fn + if not torch.jit.is_scripting() and self.use_jit: + # This branch only active if parent / bench itself isn't being scripted + # NOTE: I haven't figured out what to do here wrt to tracing, is it an issue? 
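+            # use the torch.jit.script-compiled loss; it computes the same result while
+            # avoiding Python overhead in the per-level loop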
+ l_fn = loss_jit + + return l_fn( + cls_outputs, box_outputs, cls_targets, box_targets, num_positives, + num_classes=self.num_classes, alpha=self.alpha, gamma=self.gamma, delta=self.delta, + box_loss_weight=self.box_loss_weight, label_smoothing=self.label_smoothing, new_focal=self.new_focal) diff --git a/efficientdet/effdet/object_detection/README.md b/efficientdet/effdet/object_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c2ed3902017e307fc5af61cdf3710ec02aa8c213 --- /dev/null +++ b/efficientdet/effdet/object_detection/README.md @@ -0,0 +1,3 @@ +# Tensorflow Object Detection + +All of this code is adapted/ported/copied from https://github.com/google/automl/tree/552d0facd14f4fe9205a67fb13ecb5690a4d1c94/efficientdet/object_detection \ No newline at end of file diff --git a/efficientdet/effdet/object_detection/__init__.py b/efficientdet/effdet/object_detection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5679660c5c4006a824117c84f49f7b2e0e1c2703 --- /dev/null +++ b/efficientdet/effdet/object_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Object detection data loaders and libraries are mostly based on RetinaNet: +# https://github.com/tensorflow/tpu/tree/master/models/official/retinanet +from .argmax_matcher import ArgMaxMatcher +from .box_coder import FasterRcnnBoxCoder +from .box_list import BoxList +from .matcher import Match +from .region_similarity_calculator import IouSimilarity +from .target_assigner import TargetAssigner diff --git a/efficientdet/effdet/object_detection/argmax_matcher.py b/efficientdet/effdet/object_detection/argmax_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..3b98b7a90f986184d2148c80eaec02f7f112016c --- /dev/null +++ b/efficientdet/effdet/object_detection/argmax_matcher.py @@ -0,0 +1,174 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Argmax matcher implementation. + +This class takes a similarity matrix and matches columns to rows based on the +maximum value per column. 
One can specify matched_thresholds and +to prevent columns from matching to rows (generally resulting in a negative +training example) and unmatched_theshold to ignore the match (generally +resulting in neither a positive or negative training example). + +This matcher is used in Fast(er)-RCNN. + +Note: matchers are used in TargetAssigners. There is a create_target_assigner +factory function for popular implementations. +""" +import torch +from .matcher import Match +from typing import Optional + + +def one_hot_bool(x, num_classes: int): + # for improved perf over PyTorch builtin one_hot, scatter to bool + onehot = torch.zeros(x.size(0), num_classes, device=x.device, dtype=torch.bool) + return onehot.scatter_(1, x.unsqueeze(1), 1) + + +@torch.jit.script +class ArgMaxMatcher(object): # cannot inherit with torchscript + """Matcher based on highest value. + + This class computes matches from a similarity matrix. Each column is matched + to a single row. + + To support object detection target assignment this class enables setting both + matched_threshold (upper threshold) and unmatched_threshold (lower thresholds) + defining three categories of similarity which define whether examples are + positive, negative, or ignored: + (1) similarity >= matched_threshold: Highest similarity. Matched/Positive! + (2) matched_threshold > similarity >= unmatched_threshold: Medium similarity. + Depending on negatives_lower_than_unmatched, this is either + Unmatched/Negative OR Ignore. + (3) unmatched_threshold > similarity: Lowest similarity. Depending on flag + negatives_lower_than_unmatched, either Unmatched/Negative OR Ignore. + For ignored matches this class sets the values in the Match object to -2. + """ + + def __init__(self, + matched_threshold: float, + unmatched_threshold: Optional[float] = None, + negatives_lower_than_unmatched: bool = True, + force_match_for_each_row: bool = False): + """Construct ArgMaxMatcher. + + Args: + matched_threshold: Threshold for positive matches. Positive if + sim >= matched_threshold, where sim is the maximum value of the + similarity matrix for a given column. Set to None for no threshold. + unmatched_threshold: Threshold for negative matches. Negative if + sim < unmatched_threshold. Defaults to matched_threshold + when set to None. + negatives_lower_than_unmatched: Boolean which defaults to True. If True + then negative matches are the ones below the unmatched_threshold, + whereas ignored matches are in between the matched and unmatched + threshold. If False, then negative matches are in between the matched + and unmatched threshold, and everything lower than unmatched is ignored. + force_match_for_each_row: If True, ensures that each row is matched to + at least one column (which is not guaranteed otherwise if the + matched_threshold is high). Defaults to False. See + argmax_matcher_test.testMatcherForceMatch() for an example. + + Raises: + ValueError: if unmatched_threshold is set but matched_threshold is not set + or if unmatched_threshold > matched_threshold. + """ + if (matched_threshold is None) and (unmatched_threshold is not None): + raise ValueError('Need to also define matched_threshold when unmatched_threshold is defined') + self._matched_threshold = matched_threshold + self._unmatched_threshold: float = 0. 
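+        # when no unmatched_threshold is given, fall back to matched_threshold so the
+        # "ignore" band between the two thresholds collapses to nothing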
+ if unmatched_threshold is None: + self._unmatched_threshold = matched_threshold + else: + if unmatched_threshold > matched_threshold: + raise ValueError('unmatched_threshold needs to be smaller or equal to matched_threshold') + self._unmatched_threshold = unmatched_threshold + if not negatives_lower_than_unmatched: + if self._unmatched_threshold == self._matched_threshold: + raise ValueError('When negatives are in between matched and unmatched thresholds, these ' + 'cannot be of equal value. matched: %s, unmatched: %s', + self._matched_threshold, self._unmatched_threshold) + self._force_match_for_each_row = force_match_for_each_row + self._negatives_lower_than_unmatched = negatives_lower_than_unmatched + + def _match_when_rows_are_empty(self, similarity_matrix): + """Performs matching when the rows of similarity matrix are empty. + + When the rows are empty, all detections are false positives. So we return + a tensor of -1's to indicate that the columns do not match to any rows. + + Returns: + matches: int32 tensor indicating the row each column matches to. + """ + return -1 * torch.ones(similarity_matrix.shape[1], dtype=torch.long, device=similarity_matrix.device) + + def _match_when_rows_are_non_empty(self, similarity_matrix): + """Performs matching when the rows of similarity matrix are non empty. + + Returns: + matches: int32 tensor indicating the row each column matches to. + """ + # Matches for each column + matched_vals, matches = torch.max(similarity_matrix, 0) + + # Deal with matched and unmatched threshold + if self._matched_threshold is not None: + # Get logical indices of ignored and unmatched columns as tf.int64 + below_unmatched_threshold = self._unmatched_threshold > matched_vals + between_thresholds = (matched_vals >= self._unmatched_threshold) & \ + (self._matched_threshold > matched_vals) + + if self._negatives_lower_than_unmatched: + matches = self._set_values_using_indicator(matches, below_unmatched_threshold, -1) + matches = self._set_values_using_indicator(matches, between_thresholds, -2) + else: + matches = self._set_values_using_indicator(matches, below_unmatched_threshold, -2) + matches = self._set_values_using_indicator(matches, between_thresholds, -1) + + if self._force_match_for_each_row: + force_match_column_ids = torch.argmax(similarity_matrix, 1) + force_match_column_indicators = one_hot_bool(force_match_column_ids, similarity_matrix.shape[1]) + force_match_column_mask, force_match_row_ids = torch.max(force_match_column_indicators, 0) + final_matches = torch.where(force_match_column_mask, force_match_row_ids, matches) + return final_matches + else: + return matches + + def match(self, similarity_matrix): + """Tries to match each column of the similarity matrix to a row. + + Args: + similarity_matrix: tensor of shape [N, M] representing any similarity metric. + + Returns: + Match object with corresponding matches for each of M columns. + """ + if similarity_matrix.shape[0] == 0: + return Match(self._match_when_rows_are_empty(similarity_matrix)) + else: + return Match(self._match_when_rows_are_non_empty(similarity_matrix)) + + def _set_values_using_indicator(self, x, indicator, val: int): + """Set the indicated fields of x to val. + + Args: + x: tensor. + indicator: boolean with same shape as x. + val: scalar with value to set. + + Returns: + modified tensor. 
+ """ + indicator = indicator.to(dtype=x.dtype) + return x * (1 - indicator) + val * indicator diff --git a/efficientdet/effdet/object_detection/box_coder.py b/efficientdet/effdet/object_detection/box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..cfdccd76f4232804d77bf94c9fb82de5c66d0b48 --- /dev/null +++ b/efficientdet/effdet/object_detection/box_coder.py @@ -0,0 +1,172 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base box coder. + +Box coders convert between coordinate frames, namely image-centric +(with (0,0) on the top left of image) and anchor-centric (with (0,0) being +defined by a specific anchor). + +Users of a BoxCoder can call two methods: + encode: which encodes a box with respect to a given anchor + (or rather, a tensor of boxes wrt a corresponding tensor of anchors) and + decode: which inverts this encoding with a decode operation. +In both cases, the arguments are assumed to be in 1-1 correspondence already; +it is not the job of a BoxCoder to perform matching. +""" +import torch +from typing import List, Optional +from .box_list import BoxList + +# Box coder types. +FASTER_RCNN = 'faster_rcnn' +KEYPOINT = 'keypoint' +MEAN_STDDEV = 'mean_stddev' +SQUARE = 'square' + + +"""Faster RCNN box coder. + +Faster RCNN box coder follows the coding schema described below: + ty = (y - ya) / ha + tx = (x - xa) / wa + th = log(h / ha) + tw = log(w / wa) + where x, y, w, h denote the box's center coordinates, width and height + respectively. Similarly, xa, ya, wa, ha denote the anchor's center + coordinates, width and height. tx, ty, tw and th denote the anchor-encoded + center, width and height respectively. + + See http://arxiv.org/abs/1506.01497 for details. +""" + + +EPS = 1e-8 + + +#@torch.jit.script +class FasterRcnnBoxCoder(object): + """Faster RCNN box coder.""" + + def __init__(self, scale_factors: Optional[List[float]] = None, eps: float = EPS): + """Constructor for FasterRcnnBoxCoder. + + Args: + scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. + If set to None, does not perform scaling. For Faster RCNN, + the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. + """ + self._scale_factors = scale_factors + if scale_factors is not None: + assert len(scale_factors) == 4 + for scalar in scale_factors: + assert scalar > 0 + self.eps = eps + + #@property + def code_size(self): + return 4 + + def encode(self, boxes: BoxList, anchors: BoxList): + """Encode a box collection with respect to anchor collection. + + Args: + boxes: BoxList holding N boxes to be encoded. + anchors: BoxList of anchors. + + Returns: + a tensor representing N anchor-encoded boxes of the format [ty, tx, th, tw]. + """ + # Convert anchors to the center coordinate representation. 
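+        # targets follow the Faster R-CNN parameterisation from the module docstring:
+        #   ty = (y - ya) / ha,  tx = (x - xa) / wa,  th = log(h / ha),  tw = log(w / wa)
+        # eps keeps widths and heights strictly positive so the divisions and logs stay finite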
+ ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() + # Avoid NaN in division and log below. + ha += self.eps + wa += self.eps + h += self.eps + w += self.eps + + tx = (xcenter - xcenter_a) / wa + ty = (ycenter - ycenter_a) / ha + tw = torch.log(w / wa) + th = torch.log(h / ha) + # Scales location targets as used in paper for joint training. + if self._scale_factors is not None: + ty *= self._scale_factors[0] + tx *= self._scale_factors[1] + th *= self._scale_factors[2] + tw *= self._scale_factors[3] + return torch.stack([ty, tx, th, tw]).t() + + def decode(self, rel_codes, anchors: BoxList): + """Decode relative codes to boxes. + + Args: + rel_codes: a tensor representing N anchor-encoded boxes. + anchors: BoxList of anchors. + + Returns: + boxes: BoxList holding N bounding boxes. + """ + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + + ty, tx, th, tw = rel_codes.t().unbind() + if self._scale_factors is not None: + ty /= self._scale_factors[0] + tx /= self._scale_factors[1] + th /= self._scale_factors[2] + tw /= self._scale_factors[3] + w = torch.exp(tw) * wa + h = torch.exp(th) * ha + ycenter = ty * ha + ycenter_a + xcenter = tx * wa + xcenter_a + ymin = ycenter - h / 2. + xmin = xcenter - w / 2. + ymax = ycenter + h / 2. + xmax = xcenter + w / 2. + return BoxList(torch.stack([ymin, xmin, ymax, xmax]).t()) + + +def batch_decode(encoded_boxes, box_coder: FasterRcnnBoxCoder, anchors: BoxList): + """Decode a batch of encoded boxes. + + This op takes a batch of encoded bounding boxes and transforms + them to a batch of bounding boxes specified by their corners in + the order of [y_min, x_min, y_max, x_max]. + + Args: + encoded_boxes: a float32 tensor of shape [batch_size, num_anchors, + code_size] representing the location of the objects. + box_coder: a BoxCoder object. + anchors: a BoxList of anchors used to encode `encoded_boxes`. + + Returns: + decoded_boxes: a float32 tensor of shape [batch_size, num_anchors, coder_size] + representing the corners of the objects in the order of [y_min, x_min, y_max, x_max]. + + Raises: + ValueError: if batch sizes of the inputs are inconsistent, or if + the number of anchors inferred from encoded_boxes and anchors are inconsistent. + """ + assert len(encoded_boxes.shape) == 3 + if encoded_boxes.shape[1] != anchors.num_boxes(): + raise ValueError('The number of anchors inferred from encoded_boxes' + ' and anchors are inconsistent: shape[1] of encoded_boxes' + ' %s should be equal to the number of anchors: %s.' % + (encoded_boxes.shape[1], anchors.num_boxes())) + + decoded_boxes = torch.stack([ + box_coder.decode(boxes, anchors).boxes for boxes in encoded_boxes.unbind() + ]) + return decoded_boxes diff --git a/efficientdet/effdet/object_detection/box_list.py b/efficientdet/effdet/object_detection/box_list.py new file mode 100644 index 0000000000000000000000000000000000000000..09b77f3d7aa6a8a97728e13b0bd6d108acec0603 --- /dev/null +++ b/efficientdet/effdet/object_detection/box_list.py @@ -0,0 +1,197 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Bounding Box List definition. + +BoxList represents a list of bounding boxes as tensorflow +tensors, where each bounding box is represented as a row of 4 numbers, +[y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes +within a given list correspond to a single image. See also +box_list.py for common box related operations (such as area, iou, etc). + +Optionally, users can add additional related fields (such as weights). +We assume the following things to be true about fields: +* they correspond to boxes in the box_list along the 0th dimension +* they have inferable rank at graph construction time +* all dimensions except for possibly the 0th can be inferred + (i.e., not None) at graph construction time. + +Some other notes: + * Following tensorflow conventions, we use height, width ordering, + and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering + * Tensors are always provided as (flat) [N, 4] tensors. +""" +import torch +from typing import Optional, List, Dict + + +@torch.jit.script +class BoxList(object): + """Box collection.""" + data: Dict[str, torch.Tensor] + + def __init__(self, boxes): + """Constructs box collection. + + Args: + boxes: a tensor of shape [N, 4] representing box corners + + Raises: + ValueError: if invalid dimensions for bbox data or if bbox data is not in float32 format. + """ + if len(boxes.shape) != 2 or boxes.shape[-1] != 4: + raise ValueError('Invalid dimensions for box data.') + if boxes.dtype != torch.float32: + raise ValueError('Invalid tensor type: should be tf.float32') + self.data = {'boxes': boxes} + + def num_boxes(self): + """Returns number of boxes held in collection. + + Returns: + a tensor representing the number of boxes held in the collection. + """ + return self.data['boxes'].shape[0] + + def get_all_fields(self): + """Returns all fields.""" + return self.data.keys() + + def get_extra_fields(self): + """Returns all non-box fields (i.e., everything not named 'boxes').""" + # return [k for k in self.data.keys() if k != 'boxes'] # FIXME torscript doesn't support comprehensions yet + extra: List[str] = [] + for k in self.data.keys(): + if k != 'boxes': + extra.append(k) + return extra + + def add_field(self, field: str, field_data: torch.Tensor): + """Add field to box list. + + This method can be used to add related box data such as weights/labels, etc. + + Args: + field: a string key to access the data via `get` + field_data: a tensor containing the data to store in the BoxList + """ + self.data[field] = field_data + + def has_field(self, field: str): + return field in self.data + + #@property # FIXME for torchscript compat + def boxes(self): + """Convenience function for accessing box coordinates. + + Returns: + a tensor with shape [N, 4] representing box coordinates. + """ + return self.get_field('boxes') + + #@boxes.setter # FIXME for torchscript compat + def set_boxes(self, boxes): + """Convenience function for setting box coordinates. 
+ + Args: + boxes: a tensor of shape [N, 4] representing box corners + + Raises: + ValueError: if invalid dimensions for bbox data + """ + if len(boxes.shape) != 2 or boxes.shape[-1] != 4: + raise ValueError('Invalid dimensions for box data.') + self.data['boxes'] = boxes + + def get_field(self, field: str): + """Accesses a box collection and associated fields. + + This function returns specified field with object; if no field is specified, + it returns the box coordinates. + + Args: + field: this optional string parameter can be used to specify a related field to be accessed. + + Returns: + a tensor representing the box collection or an associated field. + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def set_field(self, field: str, value: torch.Tensor): + """Sets the value of a field. + + Updates the field of a box_list with a given value. + + Args: + field: (string) name of the field to set value. + value: the value to assign to the field. + + Raises: + ValueError: if the box_list does not have specified field. + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + self.data[field] = value + + def get_center_coordinates_and_sizes(self): + """Computes the center coordinates, height and width of the boxes. + + Returns: + a list of 4 1-D tensors [ycenter, xcenter, height, width]. + """ + box_corners = self.boxes() + ymin, xmin, ymax, xmax = box_corners.t().unbind() + width = xmax - xmin + height = ymax - ymin + ycenter = ymin + height / 2. + xcenter = xmin + width / 2. + return [ycenter, xcenter, height, width] + + def transpose_coordinates(self): + """Transpose the coordinate representation in a boxlist. + + """ + y_min, x_min, y_max, x_max = self.boxes().chunk(4, dim=1) + self.set_boxes(torch.cat([x_min, y_min, x_max, y_max], 1)) + + def as_tensor_dict(self, fields: Optional[List[str]] = None): + """Retrieves specified fields as a dictionary of tensors. + + Args: + fields: (optional) list of fields to return in the dictionary. + If None (default), all fields are returned. + + Returns: + tensor_dict: A dictionary of tensors specified by fields. + + Raises: + ValueError: if specified field is not contained in boxlist. + """ + tensor_dict = {} + if fields is None: + fields = self.get_all_fields() + for field in fields: + if not self.has_field(field): + raise ValueError('boxlist must contain all specified fields') + tensor_dict[field] = self.get_field(field) + return tensor_dict + + #@property + def device(self): + return self.data['boxes'].device diff --git a/efficientdet/effdet/object_detection/matcher.py b/efficientdet/effdet/object_detection/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..22aaab118d2bf7d65f4bb403c0cfd657ec74741c --- /dev/null +++ b/efficientdet/effdet/object_detection/matcher.py @@ -0,0 +1,179 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Matcher interface and Match class. + +This module defines the Matcher interface and the Match object. The job of the +matcher is to match row and column indices based on the similarity matrix and +other optional parameters. Each column is matched to at most one row. There +are three possibilities for the matching: + +1) match: A column matches a row. +2) no_match: A column does not match any row. +3) ignore: A column that is neither 'match' nor no_match. + +The ignore case is regularly encountered in object detection: when an anchor has +a relatively small overlap with a ground-truth box, one neither wants to +consider this box a positive example (match) nor a negative example (no match). + +The Match class is used to store the match results and it provides simple apis +to query the results. +""" +import torch + + +@torch.jit.script +class Match(object): + """Class to store results from the matcher. + + This class is used to store the results from the matcher. It provides + convenient methods to query the matching results. + """ + + def __init__(self, match_results: torch.Tensor): + """Constructs a Match object. + + Args: + match_results: Integer tensor of shape [N] with (1) match_results[i]>=0, + meaning that column i is matched with row match_results[i]. + (2) match_results[i]=-1, meaning that column i is not matched. + (3) match_results[i]=-2, meaning that column i is ignored. + + Raises: + ValueError: if match_results does not have rank 1 or is not an integer int32 scalar tensor + """ + if len(match_results.shape) != 1: + raise ValueError('match_results should have rank 1') + if match_results.dtype not in (torch.int32, torch.int64): + raise ValueError('match_results should be an int32 or int64 scalar tensor') + self.match_results = match_results + + def matched_column_indices(self): + """Returns column indices that match to some row. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return torch.nonzero(self.match_results > -1).flatten().long() + + def matched_column_indicator(self): + """Returns column indices that are matched. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self.match_results >= 0 + + def num_matched_columns(self): + """Returns number (int32 scalar tensor) of matched columns.""" + return self.matched_column_indices().numel() + + def unmatched_column_indices(self): + """Returns column indices that do not match any row. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return torch.nonzero(self.match_results == -1).flatten().long() + + def unmatched_column_indicator(self): + """Returns column indices that are unmatched. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self.match_results == -1 + + def num_unmatched_columns(self): + """Returns number (int32 scalar tensor) of unmatched columns.""" + return self.unmatched_column_indices().numel() + + def ignored_column_indices(self): + """Returns column indices that are ignored (neither Matched nor Unmatched). + + The indices returned by this op are always sorted in increasing order. 
+ + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return torch.nonzero(self.ignored_column_indicator()).flatten().long() + + def ignored_column_indicator(self): + """Returns boolean column indicator where True means the column is ignored. + + Returns: + column_indicator: boolean vector which is True for all ignored column indices. + """ + return self.match_results == -2 + + def num_ignored_columns(self): + """Returns number (int32 scalar tensor) of matched columns.""" + return self.ignored_column_indices().numel() + + def unmatched_or_ignored_column_indices(self): + """Returns column indices that are unmatched or ignored. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return torch.nonzero(0 > self.match_results).flatten().long() + + def matched_row_indices(self): + """Returns row indices that match some column. + + The indices returned by this op are ordered so as to be in correspondence with the output of + matched_column_indicator(). For example if self.matched_column_indicator() is [0,2], + and self.matched_row_indices() is [7, 3], then we know that column 0 was matched to row 7 and + column 2 was matched to row 3. + + Returns: + row_indices: int32 tensor of shape [K] with row indices. + """ + return torch.gather(self.match_results, 0, self.matched_column_indices()).flatten().long() + + def gather_based_on_match(self, input_tensor, unmatched_value, ignored_value): + """Gathers elements from `input_tensor` based on match results. + + For columns that are matched to a row, gathered_tensor[col] is set to input_tensor[match_results[col]]. + For columns that are unmatched, gathered_tensor[col] is set to unmatched_value. Finally, for columns that + are ignored gathered_tensor[col] is set to ignored_value. + + Note that the input_tensor.shape[1:] must match with unmatched_value.shape + and ignored_value.shape + + Args: + input_tensor: Tensor to gather values from. + unmatched_value: Constant tensor or python scalar value for unmatched columns. + ignored_value: Constant tensor or python scalar for ignored columns. + + Returns: + gathered_tensor: A tensor containing values gathered from input_tensor. + The shape of the gathered tensor is [match_results.shape[0]] + input_tensor.shape[1:]. + """ + if isinstance(ignored_value, torch.Tensor): + input_tensor = torch.cat([ignored_value, unmatched_value, input_tensor], dim=0) + else: + # scalars + input_tensor = torch.cat([ + torch.tensor([ignored_value, unmatched_value], dtype=input_tensor.dtype, device=input_tensor.device), + input_tensor], dim=0) + gather_indices = torch.clamp(self.match_results + 2, min=0) + gathered_tensor = torch.index_select(input_tensor, 0, gather_indices) + return gathered_tensor diff --git a/efficientdet/effdet/object_detection/region_similarity_calculator.py b/efficientdet/effdet/object_detection/region_similarity_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..f6945bc7576f68ee0c5b716b52feef7340d2804c --- /dev/null +++ b/efficientdet/effdet/object_detection/region_similarity_calculator.py @@ -0,0 +1,101 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Region Similarity Calculators for BoxLists. + +Region Similarity Calculators compare a pairwise measure of similarity +between the boxes in two BoxLists. +""" +import torch +from .box_list import BoxList + + +def area(boxlist: BoxList): + """Computes area of boxes. + + Args: + boxlist: BoxList holding N boxes + + Returns: + a tensor with shape [N] representing box areas. + """ + y_min, x_min, y_max, x_max = boxlist.boxes().chunk(4, dim=1) + out = (y_max - y_min).squeeze(1) * (x_max - x_min).squeeze(1) + return out + + +def intersection(boxlist1: BoxList, boxlist2: BoxList): + """Compute pairwise intersection areas between boxes. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + + Returns: + a tensor with shape [N, M] representing pairwise intersections + """ + y_min1, x_min1, y_max1, x_max1 = boxlist1.boxes().chunk(4, dim=1) + y_min2, x_min2, y_max2, x_max2 = boxlist2.boxes().chunk(4, dim=1) + all_pairs_min_ymax = torch.min(y_max1, y_max2.t()) + all_pairs_max_ymin = torch.max(y_min1, y_min2.t()) + intersect_heights = torch.clamp(all_pairs_min_ymax - all_pairs_max_ymin, min=0) + all_pairs_min_xmax = torch.min(x_max1, x_max2.t()) + all_pairs_max_xmin = torch.max(x_min1, x_min2.t()) + intersect_widths = torch.clamp(all_pairs_min_xmax - all_pairs_max_xmin, min=0) + return intersect_heights * intersect_widths + + +def iou(boxlist1: BoxList, boxlist2: BoxList): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + + Returns: + a tensor with shape [N, M] representing pairwise iou scores. + """ + intersections = intersection(boxlist1, boxlist2) + areas1 = area(boxlist1) + areas2 = area(boxlist2) + unions = areas1.unsqueeze(1) + areas2.unsqueeze(0) - intersections + return torch.where(intersections == 0.0, torch.zeros_like(intersections), intersections / unions) + + +@torch.jit.script +class IouSimilarity(object): + """Class to compute similarity based on Intersection over Union (IOU) metric. + + This class computes pairwise similarity between two BoxLists based on IOU. + """ + def __init__(self): + pass + + def compare(self, boxlist1: BoxList, boxlist2: BoxList): + """Computes matrix of pairwise similarity between BoxLists. + + This op (to be overridden) computes a measure of pairwise similarity between + the boxes in the given BoxLists. Higher values indicate more similarity. + + Note that this method simply measures similarity and does not explicitly + perform a matching. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + + Returns: + a (float32) tensor of shape [N, M] with pairwise similarity score. 
+ """ + return iou(boxlist1, boxlist2) diff --git a/efficientdet/effdet/object_detection/target_assigner.py b/efficientdet/effdet/object_detection/target_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..6b97a4e72827d5295ad4e349601c43c1385031ca --- /dev/null +++ b/efficientdet/effdet/object_detection/target_assigner.py @@ -0,0 +1,266 @@ +# Copyright 2020 Google Research. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base target assigner module. + +The job of a TargetAssigner is, for a given set of anchors (bounding boxes) and +groundtruth detections (bounding boxes), to assign classification and regression +targets to each anchor as well as weights to each anchor (specifying, e.g., +which anchors should not contribute to training loss). + +It assigns classification/regression targets by performing the following steps: +1) Computing pairwise similarity between anchors and groundtruth boxes using a + provided RegionSimilarity Calculator +2) Computing a matching based on the similarity matrix using a provided Matcher +3) Assigning regression targets based on the matching and a provided BoxCoder +4) Assigning classification targets based on the matching and groundtruth labels + +Note that TargetAssigners only operate on detections from a single +image at a time, so any logic for applying a TargetAssigner to multiple +images must be handled externally. +""" +import torch +from typing import Optional + +from . import box_list +from .region_similarity_calculator import IouSimilarity +from .argmax_matcher import ArgMaxMatcher +from .matcher import Match +from .box_list import BoxList +from .box_coder import FasterRcnnBoxCoder + +KEYPOINTS_FIELD_NAME = 'keypoints' + + +#@torch.jit.script +class TargetAssigner(object): + """Target assigner to compute classification and regression targets.""" + + def __init__(self, similarity_calc: IouSimilarity, matcher: ArgMaxMatcher, box_coder: FasterRcnnBoxCoder, + negative_class_weight: float = 1.0, unmatched_cls_target: Optional[float] = None, + keypoints_field_name: str = KEYPOINTS_FIELD_NAME): + """Construct Object Detection Target Assigner. + + Args: + similarity_calc: a RegionSimilarityCalculator + + matcher: Matcher used to match groundtruth to anchors. + + box_coder: BoxCoder used to encode matching groundtruth boxes with respect to anchors. + + negative_class_weight: classification weight to be associated to negative + anchors (default: 1.0). The weight must be in [0., 1.]. + + unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k] + which is consistent with the classification target for each + anchor (and can be empty for scalar targets). This shape must thus be + compatible with the groundtruth labels that are passed to the "assign" + function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]). + If set to None, unmatched_cls_target is set to be [0] for each anchor. 
+ + Raises: + ValueError: if similarity_calc is not a RegionSimilarityCalculator or + if matcher is not a Matcher or if box_coder is not a BoxCoder + """ + self._similarity_calc = similarity_calc + self._matcher = matcher + self._box_coder = box_coder + self._negative_class_weight = negative_class_weight + if unmatched_cls_target is not None: + self._unmatched_cls_target = unmatched_cls_target + else: + self._unmatched_cls_target = 0. + self._keypoints_field_name = keypoints_field_name + + def assign(self, anchors: BoxList, groundtruth_boxes: BoxList, groundtruth_labels=None, groundtruth_weights=None): + """Assign classification and regression targets to each anchor. + + For a given set of anchors and groundtruth detections, match anchors + to groundtruth_boxes and assign classification and regression targets to + each anchor as well as weights based on the resulting match (specifying, + e.g., which anchors should not contribute to training loss). + + Anchors that are not matched to anything are given a classification target + of self._unmatched_cls_target which can be specified via the constructor. + + Args: + anchors: a BoxList representing N anchors + + groundtruth_boxes: a BoxList representing M groundtruth boxes + + groundtruth_labels: a tensor of shape [M, d_1, ... d_k] + with labels for each of the ground_truth boxes. The subshape + [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set + to None, groundtruth_labels assumes a binary problem where all + ground_truth boxes get a positive label (of 1). + + groundtruth_weights: a float tensor of shape [M] indicating the weight to + assign to all anchors match to a particular groundtruth box. The weights + must be in [0., 1.]. If None, all weights are set to 1. + + **params: Additional keyword arguments for specific implementations of the Matcher. + + Returns: + cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], + where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels + which has shape [num_gt_boxes, d_1, d_2, ... d_k]. + + cls_weights: a float32 tensor with shape [num_anchors] + + reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension] + + reg_weights: a float32 tensor with shape [num_anchors] + + match: a matcher.Match object encoding the match between anchors and groundtruth boxes, + with rows corresponding to groundtruth boxes and columns corresponding to anchors. 
+ + Raises: + ValueError: if anchors or groundtruth_boxes are not of type box_list.BoxList + """ + if not isinstance(anchors, box_list.BoxList): + raise ValueError('anchors must be an BoxList') + if not isinstance(groundtruth_boxes, box_list.BoxList): + raise ValueError('groundtruth_boxes must be an BoxList') + + # device = anchors.device() + # if groundtruth_labels is None: + # groundtruth_labels = torch.ones(groundtruth_boxes.num_boxes(), device=device).unsqueeze(0) + # groundtruth_labels = groundtruth_labels.unsqueeze(-1) + # if groundtruth_weights is None: + # num_gt_boxes = groundtruth_boxes.num_boxes() + # if not num_gt_boxes: + # num_gt_boxes = groundtruth_boxes.num_boxes() + # groundtruth_weights = torch.ones([num_gt_boxes], device=device) + + match_quality_matrix = self._similarity_calc.compare(groundtruth_boxes, anchors) + match = self._matcher.match(match_quality_matrix) + reg_targets = self._create_regression_targets(anchors, groundtruth_boxes, match) + cls_targets = self._create_classification_targets(groundtruth_labels, match) + #reg_weights = self._create_regression_weights(match, groundtruth_weights) + #cls_weights = self._create_classification_weights(match, groundtruth_weights) + + return cls_targets, reg_targets, match + + def _create_regression_targets(self, anchors: BoxList, groundtruth_boxes: BoxList, match: Match): + """Returns a regression target for each anchor. + + Args: + anchors: a BoxList representing N anchors + + groundtruth_boxes: a BoxList representing M groundtruth_boxes + + match: a matcher.Match object + + Returns: + reg_targets: a float32 tensor with shape [N, box_code_dimension] + """ + device = anchors.device() + zero_box = torch.zeros((1, 4), device=device) + matched_gt_boxes = match.gather_based_on_match( + groundtruth_boxes.boxes(), unmatched_value=zero_box, ignored_value=zero_box) + matched_gt_boxlist = box_list.BoxList(matched_gt_boxes) + if groundtruth_boxes.has_field(self._keypoints_field_name): + groundtruth_keypoints = groundtruth_boxes.get_field(self._keypoints_field_name) + zero_kp = torch.zeros((1,) + groundtruth_keypoints.shape[1:], device=device) + matched_keypoints = match.gather_based_on_match( + groundtruth_keypoints, unmatched_value=zero_kp, ignored_value=zero_kp) + matched_gt_boxlist.add_field(self._keypoints_field_name, matched_keypoints) + matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors) + + unmatched_ignored_reg_targets = self._default_regression_target(device).repeat(match.match_results.shape[0], 1) + + matched_anchors_mask = match.matched_column_indicator() + reg_targets = torch.where(matched_anchors_mask.unsqueeze(1), matched_reg_targets, unmatched_ignored_reg_targets) + return reg_targets + + def _default_regression_target(self, device: torch.device): + """Returns the default target for anchors to regress to. + + Default regression targets are set to zero (though in this implementation what + these targets are set to should not matter as the regression weight of any box + set to regress to the default target is zero). + + Returns: + default_target: a float32 tensor with shape [1, box_code_dimension] + """ + return torch.zeros(1, self._box_coder.code_size(), device=device) + + def _create_classification_targets(self, groundtruth_labels, match: Match): + """Create classification targets for each anchor. + + Assign a classification target of for each anchor to the matching + groundtruth label that is provided by match. 
Anchors that are not matched + to anything are given the target self._unmatched_cls_target + + Args: + groundtruth_labels: a tensor of shape [num_gt_boxes, d_1, ... d_k] + with labels for each of the ground_truth boxes. The subshape + [d_1, ... d_k] can be empty (corresponding to scalar labels). + match: a matcher.Match object that provides a matching between anchors + and groundtruth boxes. + + Returns: + a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], where the + subshape [d_1, ..., d_k] is compatible with groundtruth_labels which has + shape [num_gt_boxes, d_1, d_2, ... d_k]. + """ + return match.gather_based_on_match( + groundtruth_labels, + unmatched_value=self._unmatched_cls_target, ignored_value=self._unmatched_cls_target) + + def _create_regression_weights(self, match: Match, groundtruth_weights): + """Set regression weight for each anchor. + + Only positive anchors are set to contribute to the regression loss, so this + method returns a weight of 1 for every positive anchor and 0 for every + negative anchor. + + Args: + match: a matcher.Match object that provides a matching between anchors and groundtruth boxes. + groundtruth_weights: a float tensor of shape [M] indicating the weight to + assign to all anchors match to a particular groundtruth box. + + Returns: + a float32 tensor with shape [num_anchors] representing regression weights. + """ + return match.gather_based_on_match(groundtruth_weights, ignored_value=0., unmatched_value=0.) + + def _create_classification_weights(self, match: Match, groundtruth_weights): + """Create classification weights for each anchor. + + Positive (matched) anchors are associated with a weight of + positive_class_weight and negative (unmatched) anchors are associated with + a weight of negative_class_weight. When anchors are ignored, weights are set + to zero. By default, both positive/negative weights are set to 1.0, + but they can be adjusted to handle class imbalance (which is almost always + the case in object detection). + + Args: + match: a matcher.Match object that provides a matching between anchors and groundtruth boxes. + groundtruth_weights: a float tensor of shape [M] indicating the weight to + assign to all anchors match to a particular groundtruth box. + + Returns: + a float32 tensor with shape [num_anchors] representing classification weights. + """ + return match.gather_based_on_match( + groundtruth_weights, ignored_value=0., unmatched_value=self._negative_class_weight) + + def box_coder(self): + """Get BoxCoder of this TargetAssigner. + + Returns: + BoxCoder object. + """ + return self._box_coder diff --git a/efficientdet/effdet/soft_nms.py b/efficientdet/effdet/soft_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..fff0158e3e17b499053c5b1bd9c27581835bbc80 --- /dev/null +++ b/efficientdet/effdet/soft_nms.py @@ -0,0 +1,170 @@ +""" PyTorch Soft-NMS + +This code was adapted from a PR for detectron2 submitted by https://github.com/alekseynp +https://github.com/facebookresearch/detectron2/pull/1183/files + +Detectron2 is licensed Apache 2.0, Copyright Facebook Inc. +""" +import torch +from typing import List + + +def pairwise_iou(boxes1, boxes2) -> torch.Tensor: + """ + Given two lists of boxes of size N and M, + compute the IoU (intersection over union) + between __all__ N x M pairs of boxes. + The box order must be (xmin, ymin, xmax, ymax). + Args: + boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. + Returns: + Tensor: IoU, sized [N,M]. 
+ """ + area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1]) # [N,] + area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1]) # [M,] + + width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max( + boxes1[:, None, :2], boxes2[:, :2] + ) # [N,M,2] + + width_height.clamp_(min=0) # [N,M,2] + inter = width_height.prod(dim=2) # [N,M] + + # handle empty boxes + iou = torch.where( + inter > 0, + inter / (area1[:, None] + area2 - inter), + torch.zeros(1, dtype=inter.dtype, device=inter.device), + ) + return iou + + +def soft_nms( + boxes, + scores, + method_gaussian: bool = True, + sigma: float = 0.5, + iou_threshold: float = .5, + score_threshold: float = 0.005 +): + """ + Soft non-max suppression algorithm. + + Implementation of [Soft-NMS -- Improving Object Detection With One Line of Codec] + (https://arxiv.org/abs/1704.04503) + + Args: + boxes_remain (Tensor[N, ?]): + boxes where NMS will be performed + if Boxes, in (x1, y1, x2, y2) format + if RotatedBoxes, in (x_ctr, y_ctr, width, height, angle_degrees) format + scores_remain (Tensor[N]): + scores for each one of the boxes + method_gaussian (bool): use gaussian method if True, otherwise linear + sigma (float): + parameter for Gaussian penalty function + iou_threshold (float): + iou threshold for applying linear decay. Nt from the paper + re-used as threshold for standard "hard" nms + score_threshold (float): + boxes with scores below this threshold are pruned at each iteration. + Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] + + Returns: + tuple(Tensor, Tensor): + [0]: int64 tensor with the indices of the elements that have been kept + by Soft NMS, sorted in decreasing order of scores + [1]: float tensor with the re-scored scores of the elements that were kept + """ + device = boxes.device + boxes_remain = boxes.clone() + scores_remain = scores.clone() + num_elem = scores_remain.size()[0] + idxs = torch.arange(num_elem) + idxs_out = torch.zeros(num_elem, dtype=torch.int64, device=device) + scores_out = torch.zeros(num_elem, dtype=torch.float32, device=device) + count: int = 0 + + while scores_remain.numel() > 0: + top_idx = torch.argmax(scores_remain) + idxs_out[count] = idxs[top_idx] + scores_out[count] = scores_remain[top_idx] + count += 1 + + top_box = boxes_remain[top_idx] + ious = pairwise_iou(top_box.unsqueeze(0), boxes_remain)[0] + + if method_gaussian: + decay = torch.exp(-torch.pow(ious, 2) / sigma) + else: + decay = torch.ones_like(ious) + decay_mask = ious > iou_threshold + decay[decay_mask] = 1 - ious[decay_mask] + + scores_remain *= decay + keep = scores_remain > score_threshold + keep[top_idx] = torch.tensor(False, device=device) + + boxes_remain = boxes_remain[keep] + scores_remain = scores_remain[keep] + idxs = idxs[keep] + + return idxs_out[:count], scores_out[:count] + + +def batched_soft_nms( + boxes, scores, idxs, + method_gaussian: bool = True, + sigma: float = 0.5, + iou_threshold: float = .5, + score_threshold: float = 0.001): + + """ + Performs soft non-maximum suppression in a batched fashion. + + Each index value correspond to a category, and NMS + will not be applied between elements of different categories. + + Args: + boxes (Tensor[N, 4]): + boxes where NMS will be performed. They + are expected to be in (x1, y1, x2, y2) format + scores (Tensor[N]): + scores for each one of the boxes + idxs (Tensor[N]): + indices of the categories for each one of the boxes. + method (str): + one of ['gaussian', 'linear', 'hard'] + see paper for details. 
users encouraged not to use "hard", as this is the + same nms available elsewhere in detectron2 + sigma (float): + parameter for Gaussian penalty function + iou_threshold (float): + iou threshold for applying linear decay. Nt from the paper + re-used as threshold for standard "hard" nms + score_threshold (float): + boxes with scores below this threshold are pruned at each iteration. + Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] + Returns: + tuple(Tensor, Tensor): + [0]: int64 tensor with the indices of the elements that have been kept + by Soft NMS, sorted in decreasing order of scores + [1]: float tensor with the re-scored scores of the elements that were kept + """ + if boxes.numel() == 0: + return ( + torch.empty((0,), dtype=torch.int64, device=boxes.device), + torch.empty((0,), dtype=torch.float32, device=scores.device), + ) + # strategy: in order to perform NMS independently per class. + # we add an offset to all the boxes. The offset is dependent + # only on the class idx, and is large enough so that boxes + # from different classes do not overlap + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + 1) + boxes_for_nms = boxes + offsets[:, None] + return soft_nms( + boxes_for_nms, scores, method_gaussian=method_gaussian, sigma=sigma, + iou_threshold=iou_threshold, score_threshold=score_threshold + ) + diff --git a/efficientdet/effdet/version.py b/efficientdet/effdet/version.py new file mode 100644 index 0000000000000000000000000000000000000000..020ed73d7a09b032ea1b3291090cbbdeee5a181a --- /dev/null +++ b/efficientdet/effdet/version.py @@ -0,0 +1 @@ +__version__ = '0.2.2' diff --git a/efficientdet/efficientdet.py b/efficientdet/efficientdet.py new file mode 100755 index 0000000000000000000000000000000000000000..4d61c3db9747e2cfbf8c9b8831427f610c56c9e9 --- /dev/null +++ b/efficientdet/efficientdet.py @@ -0,0 +1,268 @@ +''' +Efficientdet demo +''' +import argparse +import cv2 +import os +import time + +from PIL import Image +import PIL.ImageColor as ImageColor +import requests +import matplotlib.pyplot as plt + +import torch +import torchvision.transforms as T +from tqdm import tqdm + +from effdet import create_model + + +def get_args_parser(): + parser = argparse.ArgumentParser( + 'Test detr on one image') + parser.add_argument( + '--img', metavar='IMG', + help='path to image, could be url', + default='https://www.fyidenmark.com/images/denmark-litter.jpg') + parser.add_argument( + '--save', metavar='OUTPUT', + help='path to save image with predictions (if None show image)', + default=None) + parser.add_argument('--classes', nargs='+', default=['Litter']) + parser.add_argument( + '--checkpoint', type=str, + help='path to checkpoint') + parser.add_argument( + '--device', type=str, default='cpu', + help='device to evaluate model (default: cpu)') + parser.add_argument( + '--prob_threshold', type=float, default=0.3, + help='probability threshold to show results (default: 0.5)') + parser.add_argument( + '--video', action='store_true', default=False, + help="If true, we treat impute as video (default: False)") + parser.set_defaults(redundant_bias=None) + return parser + + +# standard PyTorch mean-std input image normalization +def get_transforms(im, size=768): + transform = T.Compose([ + T.Resize((size, size)), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + return transform(im).unsqueeze(0) + + +def rescale_bboxes(out_bbox, size, resize): + img_w, img_h = size + out_w, out_h = resize + b = out_bbox * 
torch.tensor([img_w/out_w, img_h/out_h, + img_w/out_w, img_h/out_h], + dtype=torch.float32).to( + out_bbox.device) + return b + + +# from https://deepdrive.pl/ +def get_output(img, prob, boxes, classes=['Litter'], stat_text=None): + # colors for visualization + STANDARD_COLORS = [ + 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', + 'Bisque', 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', + 'AntiqueWhite', 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', + 'Crimson', 'Cyan', 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', + 'DarkKhaki', 'DarkOrange', 'DarkOrchid', 'DarkSalmon', + 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', + 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', + 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', + 'GoldenRod', 'Salmon', 'Tan', 'HoneyDew', 'HotPink', + 'IndianRed', 'Ivory', 'Khaki', 'Lavender', 'LavenderBlush', + 'LawnGreen', 'LemonChiffon', 'LightBlue', + 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', + 'LightGrey', 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', + 'LightSkyBlue', 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', + 'LightYellow', 'Lime', 'LimeGreen', 'Linen', 'Magenta', + 'MediumAquaMarine', 'MediumOrchid', 'MediumPurple', + 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', + 'MediumTurquoise', 'MediumVioletRed', 'MintCream', + 'MistyRose', 'Moccasin', 'NavajoWhite', 'OldLace', 'Olive', + 'OliveDrab', 'Orange', 'OrangeRed', 'Orchid', 'PaleGoldenRod', + 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 'PapayaWhip', + 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', + 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', + 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', + 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', + 'GreenYellow', 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', + 'Wheat', 'White', 'WhiteSmoke', 'Yellow', 'YellowGreen' + ] + palette = [ImageColor.getrgb(_) for _ in STANDARD_COLORS] + for p, (x0, y0, x1, y1) in zip(prob, boxes.tolist()): + cl = int(p[1] - 1) + color = palette[cl] + start_p, end_p = (int(x0), int(y0)), (int(x1), int(y1)) + cv2.rectangle(img, start_p, end_p, color, 2) + text = "%s %.1f%%" % (classes[cl], p[0]*100) + cv2.putText(img, text, start_p, cv2.FONT_HERSHEY_SIMPLEX, 1, + (0, 0, 0), 10) + cv2.putText(img, text, start_p, cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2) + if stat_text is not None: + cv2.putText(img, stat_text, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, + (0, 0, 0), 10) + cv2.putText(img, stat_text, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, + (255, 255, 255), 3) + return img + + +# from https://deepdrive.pl/ +def save_frames(args, num_iter=45913): + if not os.path.exists(args.save): + os.makedirs(args.save) + + cap = cv2.VideoCapture(args.img) + counter = 0 + pbar = tqdm(total=num_iter+1) + num_classes = len(args.classes) + model_name = args.checkpoint.split('-')[-1].split('/')[0] + model = set_model(model_name, num_classes, args.checkpoint, args.device) + model.eval() + + model.to(args.device) + + while(cap.isOpened()): + ret, img = cap.read() + if img is None: + print("END") + break + + # scale + BGR to RGB + inference_size = (768, 768) + scaled_img = cv2.resize(img[:, :, ::-1], inference_size) + + transform = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + # mean-std normalize the input image (batch-size: 1) + img_tens = transform(scaled_img).unsqueeze(0).to(args.device) + + # Inference + t0 = 
time.time() + with torch.no_grad(): + # propagate through the model + output = model(img_tens) + t1 = time.time() + + # keep only predictions above set confidence + bboxes_keep = output[0, output[0, :, 4] > args.prob_threshold] + probas = bboxes_keep[:, 4:] + + # convert boxes to image scales + bboxes_scaled = rescale_bboxes(bboxes_keep[:, :4], + (img.shape[1], img.shape[0]), + inference_size) + + txt = "Detect-waste %s Threshold=%.2f " \ + "Inference %dx%d GPU: %s Inference time %.3fs" % \ + (model_name, args.prob_threshold, inference_size[0], + inference_size[1], torch.cuda.get_device_name(0), + t1 - t0) + result = get_output(img, probas, bboxes_scaled, + args.classes, txt) + cv2.imwrite(os.path.join(args.save, 'img%08d.jpg' % counter), result) + counter += 1 + pbar.update(1) + del img + del img_tens + del result + + cap.release() + + +def plot_results(pil_img, prob, boxes, classes=['Litter'], + save_path=None, colors=None): + plt.figure(figsize=(16, 10)) + plt.imshow(pil_img) + ax = plt.gca() + if colors is None: + # colors for visualization + colors = 100 * [ + [0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125], + [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]] + for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes, colors): + ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + fill=False, color=c, linewidth=3)) + cl = int(p[1]) + text = f'{classes[cl]}: {p[0]:0.2f}' + ax.text(xmin, ymin, text, fontsize=15, + bbox=dict(facecolor='yellow', alpha=0.5)) + plt.axis('off') + if save_path is not None: + plt.savefig(save_path, bbox_inches='tight', + transparent=True, pad_inches=0) + plt.close() + print(f'Image saved at {save_path}') + else: + plt.show() + + +def set_model(model_type, num_classes, checkpoint_path, device): + + # create model + model = create_model( + model_type, + bench_task='predict', + num_classes=num_classes, + pretrained=False, + redundant_bias=True, + checkpoint_path=checkpoint_path + ) + + param_count = sum([m.numel() for m in model.parameters()]) + print('Model %s created, param count: %d' % (model_type, param_count)) + model = model.to(device) + return model + + +def main(args): + # prepare model for evaluation + torch.set_grad_enabled(False) + num_classes = len(args.classes) + model_name = args.checkpoint.split('-')[-1].split('/')[0] + model = set_model(model_name, num_classes, args.checkpoint, args.device) + + model.eval() + # get image + if args.img.startswith('https'): + im = Image.open(requests.get(args.img, stream=True).raw).convert('RGB') + else: + im = Image.open(args.img).convert('RGB') + + # mean-std normalize the input image (batch-size: 1) + img = get_transforms(im) + + # propagate through the model + outputs = model(img.to(args.device)) + + # keep only predictions above set confidence + bboxes_keep = outputs[0, outputs[0, :, 4] > args.prob_threshold] + probas = bboxes_keep[:, 4:] + + # convert boxes to image scales + bboxes_scaled = rescale_bboxes(bboxes_keep[:, :4], im.size, + tuple(img.size()[2:])) + + # plot and save demo image + plot_results(im, probas, bboxes_scaled.tolist(), args.classes, args.save) + + +if __name__ == '__main__': + parser = get_args_parser() + args = parser.parse_args() + if args.video: + save_frames(args) + else: + main(args) diff --git a/models/efficientdet-d2-detector.pth.tar b/models/efficientdet-d2-detector.pth.tar new file mode 100644 index 0000000000000000000000000000000000000000..92bc303935575092990a6c8359a19c759d8cda78 --- /dev/null +++ 
b/models/efficientdet-d2-detector.pth.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499a3f0c75e13669d69be25854e980812e2f6b50e618ba2b2e90b25f193e7fd9 +size 97791163 diff --git a/models/resnet50-classifier.pkl b/models/resnet50-classifier.pkl new file mode 100644 index 0000000000000000000000000000000000000000..305f3c9740b729229613ce8bd71f9444d38e8071 --- /dev/null +++ b/models/resnet50-classifier.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d2c0667090f996cbe4bab8585300528b8896071e70b1edfdbe671015a074e85 +size 102980821 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acd41946d2b46521375aba485cfcfe48ab65519e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +albumentations>=0.5.2 +efficientnet_pytorch +fastai==2.7.13 +funcy==1.15 +iterative-stratification==0.1.6 +matplotlib==3.8.2 +numpy==1.26.2 +omegaconf>=2.0 +opencv-python==4.8.1.78 +opencv-python-headless==4.8.1.78 +pycocotools>=2.0.0 +pytorch_lightning +pyyaml +rembg==2.0.53 +scikit-learn==1.3.2 +scikit-plot +scipy==1.11.4 +streamlit +timm +torch +torchvision +tqdm==4.66.1 \ No newline at end of file diff --git a/trash_detector.py b/trash_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..1f959323b292a6289f80b2299003f4aad01906bc --- /dev/null +++ b/trash_detector.py @@ -0,0 +1,57 @@ +import numpy as np +import torch +from fastai.vision.all import load_learner + +from efficientdet.efficientdet import get_transforms, rescale_bboxes, set_model + + +def localize_trash(im, det_name, det_checkpoint, device, prob_threshold): + # detector + detector = set_model(det_name, 1, det_checkpoint, device) + detector.eval() + # mean-std normalize the input image (batch-size: 1) + img = get_transforms(im) + # propagate through the model + outputs = detector(img.to(device)) + # keep only predictions above set confidence + bboxes_keep = outputs[0, outputs[0, :, 4] > prob_threshold] + probas = bboxes_keep[:, 4:] + # convert boxes to image scales + bboxes_scaled = rescale_bboxes(bboxes_keep[:, :4], im.size, tuple(img.size()[2:])) + return probas, bboxes_scaled + + +def classify_trash(im, clas_checkpoint, cls_th, probas, bboxes_scaled): + # classifier + classifier = load_learner(clas_checkpoint) + + bboxes_final = [] + cls_prob = [] + for p, (xmin, ymin, xmax, ymax) in zip(probas, bboxes_scaled.tolist()): + img = im.crop((xmin, ymin, xmax, ymax)) + outputs = classifier.predict(img) + p[1] = torch.topk(outputs[2], k=1).indices.squeeze(0).item() + p[0] = torch.max(np.trunc(outputs[2] * 100)) + if p[0] >= cls_th * 100: + bboxes_final.append((xmin, ymin, xmax, ymax)) + cls_prob.append(p) + return cls_prob, bboxes_final + + +def detect_trash( + im, det_name, det_checkpoint, clas_checkpoint, device, prob_threshold, cls_th +): + # prepare models for evaluation + torch.set_grad_enabled(False) + + # 1) Localize + probas, bboxes_scaled = localize_trash( + im, det_name, det_checkpoint, device, prob_threshold + ) + + # 2) Classify + cls_prob, bboxes_final = classify_trash( + im, clas_checkpoint, cls_th, probas, bboxes_scaled + ) + + return cls_prob, bboxes_final