# Copyright (c) Meta Platforms, Inc. and affiliates
import logging
from typing import Dict, List, Optional, Tuple
from detectron2.layers import move_device_like
from detectron2.structures.image_list import ImageList
import torch
import numpy as np
from detectron2.layers import ShapeSpec, batched_nms
from detectron2.utils.visualizer import Visualizer
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.structures import Instances
from detectron2.utils.events import get_event_storage
from detectron2.data import MetadataCatalog
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.utils.logger import _log_api_usage
from detectron2.modeling.meta_arch import (
META_ARCH_REGISTRY, GeneralizedRCNN
)
from cubercnn.data.generate_depth_maps import setup_depth_model
from cubercnn.modeling.roi_heads import build_roi_heads
from cubercnn import util, vis
import torch.nn.functional as F
from detectron2.config import configurable
import torch.nn as nn
logger = logging.getLogger(__name__)
@META_ARCH_REGISTRY.register()
class RCNN3D(GeneralizedRCNN):
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
}
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
if not self.training:
return self.inference(batched_inputs)
images = self.preprocess_image(batched_inputs)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
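        # e.g., an image annotated at height 1080 that was resized to 540 px tall
        # yields a ratio of 1080 / 540 = 2.0 (numbers illustrative)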
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
        # the backbone is actually an FPN, with the DLA model as the bottom-up structure.
        # FPN: https://arxiv.org/abs/1612.03144v2
        # the backbone and proposal generator only operate on 2D images and annotations.
features = self.backbone(images.tensor)
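        # features maps FPN level names to feature tensors, typically
        # {'p2': (N, C, H/4, W/4), 'p3': (N, C, H/8, W/8), ...}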
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
instances, detector_losses = self.roi_heads(
images, features, proposals,
Ks, im_scales_ratio,
gt_instances
)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0 and storage.iter > 0:
self.visualize_training(batched_inputs, proposals, instances)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
assert not self.training
images = self.preprocess_image(batched_inputs)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = self.backbone(images.tensor)
# Pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
oracles = [b['oracle2D'] for b in batched_inputs]
results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)
# normal inference
else:
proposals, _ = self.proposal_generator(images, features, None)
results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list): a list that contains predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
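            # left-multiplying by diag(1/scale, 1/scale, 1) divides the first two rows
            # of K, i.e. fx, fy and the principal point (sx, sy), by `scale`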
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
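            # standard pinhole relation: u = fx * X / Z + sx  =>  X = Z * (u - sx) / fx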
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break # only visualize one image in a batch
@META_ARCH_REGISTRY.register()
class RCNN3D_combined_features(nn.Module):
@configurable
def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model):
super().__init__()
self.backbone = backbone
self.proposal_generator = proposal_generator
self.roi_heads = roi_heads
self.input_format = input_format
self.vis_period = vis_period
self.depth_model = depth_model
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
if cfg.MODEL.DEPTH_ON:
depth_model = 'zoedepth'
pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt'
            # NOTE: the depth model could be made learnable as well
            d_model = setup_depth_model(depth_model, pretrained_resource)
            shape_modified = {key: ShapeSpec(i.channels * 2, stride=i.stride) for key, i in backbone.output_shape().items()}
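            # channels are doubled because cat_depth_features concatenates depth
            # features with the image features along the channel dimension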
else:
d_model = None
shape_modified = backbone.output_shape()
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, shape_modified, priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
"depth_model": d_model,
}
@property
def device(self):
return self.pixel_mean.device
def _move_to_current_device(self, x):
return move_device_like(x, self.pixel_mean)
def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
"""
Normalize, pad and batch the input images.
"""
images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
if normalise:
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
if convert:
# convert from BGR to RGB
images = [x[[2,1,0],:,:] for x in images]
if to_float:
images = [x.float()/255.0 for x in images]
if NoOp:
images = ImageList.from_tensors(images)
return images
images = ImageList.from_tensors(
images,
self.backbone.size_divisibility,
padding_constraints=self.backbone.padding_constraints,
)
return images
def _standardize(self, x:torch.Tensor, y:torch.Tensor):
'''standardise x to match the mean and std of y'''
ym = y.mean()
ys = y.std()
xm = x.mean()
xs = x.std()
return (x - xm) * (ys / xs) + ym
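    # e.g. (illustrative numbers): for x with mean 5 and std 2, and y with mean 0
    # and std 1, _standardize(x, y) computes (x - 5) * (1 / 2) + 0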
def cat_depth_features(self, features, images_raw):
pred_o = self.depth_model(images_raw.tensor.float()/255.0)
# depth features corresponding to p2, p3, p4, p5
d_features = pred_o['depth_features']
# img_features = features['p5']
# we must scale the depth map to the same size as the conv feature, otherwise the scale will not correspond correctly in the roi pooling
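        # the reversed() below assumes d_features is ordered opposite to the
        # features dict, so each depth feature meets its matching FPN level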
for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True)
d_feature = self._standardize(d_feature, img_feature)
features[layer] = torch.cat((img_feature, d_feature), dim=1)
return features
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
if not self.training:
            return self.inference(batched_inputs)  # the segmentor is None at inference time since the loss is not needed
images = self.preprocess_image(batched_inputs)
        # NOTE: images_raw are padded to the size of the largest image in the batch;
        # the images have different sizes, so they must match before they can be batched.
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
        # depth maps are available here if needed
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
        # Note: if even a single ground map in the batch is missing, the ground maps are skipped for the entire batch
ground_maps_fail = [i['ground_map'] is None for i in batched_inputs]
ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x]
for idx in ground_maps_fail_idx:
batched_inputs[idx]['ground_map'] = torch.tensor([[1]]) # make a dummy to indicate a fail
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
features = self.backbone(images.tensor)
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
if self.depth_model is not None:
features = self.cat_depth_features(features, images_raw)
instances, detector_losses = self.roi_heads(
images, images_raw, ground_maps, depth_maps, features, proposals,
Ks, im_scales_ratio,
gt_instances
)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0 and storage.iter > 0:
self.visualize_training(batched_inputs, proposals, instances)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
assert not self.training
images = self.preprocess_image(batched_inputs)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
# do we assume no access to ground maps in inference?
ground_maps = None
depth_maps = None
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = self.backbone(images.tensor)
# Pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
oracles = [b['oracle2D'] for b in batched_inputs]
results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None)
# normal inference
else:
proposals, _ = self.proposal_generator(images, features, None)
if self.depth_model is not None:
features = self.cat_depth_features(features, images_raw)
# pred boxes are proposals
results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list or None): a list that contains predicted RoI head instances,
                or None when only 2D outputs are available. batched_inputs and instances
                should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
only2d = instances is None
if only2d:
instances = [None]*len(batched_inputs)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
if only2d:
break
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break # only visualize one image in a batch
@META_ARCH_REGISTRY.register()
class BoxNet(nn.Module):
@configurable
def __init__(
self,
*,
backbone: Backbone,
proposal_generator: nn.Module,
roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
input_format: Optional[str] = None,
vis_period: int = 0,
):
"""
Args:
backbone: a backbone module, must follow detectron2's backbone interface
proposal_generator: a module that generates proposals using backbone features
roi_heads: a ROI head that performs per-region computation
pixel_mean, pixel_std: list or tuple with #channels element, representing
the per-channel mean and std to be used to normalize the input image
input_format: describe the meaning of channels of input. Needed by visualization
vis_period: the period to run visualization. Set to 0 to disable.
"""
super().__init__()
self.backbone = backbone
self.proposal_generator = proposal_generator
self.roi_heads = roi_heads
self.input_format = input_format
self.vis_period = vis_period
if vis_period > 0:
assert input_format is not None, "input_format is required for visualization!"
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
}
@property
def device(self):
return self.pixel_mean.device
def _move_to_current_device(self, x):
return move_device_like(x, self.pixel_mean)
def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
"""
Normalize, pad and batch the input images.
"""
images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
if normalise:
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
else:
if convert:
# convert from BGR to RGB
images = [x[[2,1,0],:,:] for x in images]
if to_float:
images = [x.float()/255.0 for x in images]
if NoOp:
images = ImageList.from_tensors(images,0,)
return images
images = ImageList.from_tensors(
images,
self.backbone.size_divisibility,
padding_constraints=self.backbone.padding_constraints,
)
return images
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]], experiment_type={'use_pred_boxes':True}, proposal_function='propose'):
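        # experiment_type is a dict of flags; 'use_pred_boxes' selects between
        # evaluating with predicted 2D boxes (AP) or GT boxes as proposals (MABO)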
if not self.training:
            if not experiment_type['use_pred_boxes']:  # MABO: GT boxes serve as proposals
                return self.inference(batched_inputs, do_postprocess=False, experiment_type=experiment_type, proposal_function=proposal_function)
            else:  # AP: standard inference with predicted boxes
                return self.inference(batched_inputs, do_postprocess=True, experiment_type=experiment_type, proposal_function=proposal_function)
if self.training:
images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
if batched_inputs[0]['ground_map'] is not None:
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
                if not torch.count_nonzero(ground_maps.tensor):  # an all-zero ground map occasionally occurs; treat it as missing
                    logger.info('no ground map for %s', batched_inputs[0]['image_id'])
                    ground_maps = None
else:
ground_maps = None
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = None
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
# def forward(self, images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, segmentor, experiment_type, proposal_function, targets=None):
results = self.roi_heads(images, images_raw, None, depth_maps, ground_maps, features, gt_instances, Ks, im_scales_ratio, experiment_type, proposal_function)
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
def inference(self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None, do_postprocess: bool = True, experiment_type={}, proposal_function='propose'):
assert not self.training
# must apply the same preprocessing to both the image, the depth map, and the mask
# except don't normalise the input for the segmentation method
images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
if batched_inputs[0]['ground_map'] is not None:
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
else:
#logger.info("ground map file not found, setting to None")
ground_maps = None
# TODO: make logic to predict ground map on the fly
# logger.info("ground map file not found, computing...")
# raise NotImplementedError("Implement ground on the fly, see generate_ground_segmentations.py for reference")
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
        # do_postprocess implies using predicted boxes (AP); otherwise GT boxes serve as proposals (MABO)
if do_postprocess:
# gt_instances should be None in inference mode
features = self.backbone(images.tensor)
# normal inference
proposals, _ = self.proposal_generator(images, features, None)
else:
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
features, proposals = None, gt_instances
# combined_features = self.scorenet_base.forward_features(images, images_raw)
combined_features = None
# is it necessary to resize images back???
# use the mask and the 2D box to predict the 3D box
# proposals are ground truth for MABO plots and predictions for AP plots
results = self.roi_heads(images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, experiment_type, proposal_function)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results #[{'instances':results}]
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list): a list that contains predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break
def build_model(cfg, priors=None):
"""
Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
Note that it does not load any weights from ``cfg``.
"""
meta_arch = cfg.MODEL.META_ARCHITECTURE
model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors)
model.to(torch.device(cfg.MODEL.DEVICE))
_log_api_usage("modeling.meta_arch." + meta_arch)
return model
def build_backbone(cfg, input_shape=None, priors=None):
"""
Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
Returns:
an instance of :class:`Backbone`
"""
if input_shape is None:
input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
backbone_name = cfg.MODEL.BACKBONE.NAME
backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors)
assert isinstance(backbone, Backbone)
return backbone
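# Usage sketch (illustrative; assumes a cfg with the cubercnn defaults merged in):
#   cfg.MODEL.META_ARCHITECTURE = 'RCNN3D'  # or 'RCNN3D_combined_features' / 'BoxNet'
#   model = build_model(cfg, priors=None)   # priors may be None or precomputed category statistics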