# Copyright (c) Meta Platforms, Inc. and affiliates
import logging
from typing import Dict, List, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.layers import ShapeSpec, batched_nms, move_device_like
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
from detectron2.modeling.meta_arch import (
    META_ARCH_REGISTRY, GeneralizedRCNN
)
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.structures import Instances
from detectron2.structures.image_list import ImageList
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import _log_api_usage
from detectron2.utils.visualizer import Visualizer

from cubercnn import util, vis
from cubercnn.data.generate_depth_maps import setup_depth_model
from cubercnn.modeling.roi_heads import build_roi_heads

logger = logging.getLogger(__name__)
# register so build_model() below can look this class up via cfg.MODEL.META_ARCHITECTURE
@META_ARCH_REGISTRY.register()
class RCNN3D(GeneralizedRCNN):

    @classmethod
    def from_config(cls, cfg, priors=None):
        backbone = build_backbone(cfg, priors=priors)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }
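    # Note on how from_config is consumed: detectron2's @configurable machinery (inherited
    # here from GeneralizedRCNN.__init__) routes a config-based call through from_config,
    # roughly as sketched below. This is an illustrative note, not code run by the pipeline.
    #
    #   model = RCNN3D(cfg, priors=None)
    #   # ~ equivalent to: RCNN3D(**RCNN3D.from_config(cfg, priors=None))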
    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]

        # the unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        # the backbone is an FPN in which the DLA model is the bottom-up structure.
        # FPN: https://arxiv.org/abs/1612.03144v2
        # the backbone and proposal generator only operate on 2D images and annotations.
        features = self.backbone(images.tensor)
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)

        instances, detector_losses = self.roi_heads(
            images, features, proposals,
            Ks, im_scales_ratio,
            gt_instances
        )

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0 and storage.iter > 0:
                self.visualize_training(batched_inputs, proposals, instances)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses
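    # A minimal sketch of the per-image dict this forward pass expects. The keys follow
    # detectron2 conventions plus the 'K' intrinsics read above; the concrete shapes and
    # values below are illustrative assumptions, not fixed requirements.
    #
    #   batched_inputs = [{
    #       "image": torch.zeros(3, 512, 680, dtype=torch.uint8),   # CHW, in self.input_format
    #       "height": 1024, "width": 1360,                          # original resolution
    #       "K": [[fx, 0, cx], [0, fy, cy], [0, 0, 1]],             # 3x3 camera intrinsics
    #       "instances": Instances(...),                            # ground truth, training only
    #   }]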
    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        assert not self.training

        images = self.preprocess_image(batched_inputs)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]

        # the unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        features = self.backbone(images.tensor)

        # pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
            oracles = [b['oracle2D'] for b in batched_inputs]
            results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)
        # normal inference
        else:
            proposals, _ = self.proposal_generator(images, features, None)
            results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        else:
            return results
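    # Sketch of the oracle path above: attaching an 'oracle2D' entry per image makes the RoI
    # heads consume those boxes instead of RPN proposals. The exact structure the heads expect
    # (assumed here to be an Instances carrying proposal_boxes) is an assumption for illustration.
    #
    #   for b in batched_inputs:
    #       oracle = Instances(b["image"].shape[-2:])
    #       oracle.proposal_boxes = b["instances"].gt_boxes.clone()
    #       b["oracle2D"] = oracle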
    def visualize_training(self, batched_inputs, proposals, instances):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image, up to 20 top-scoring predicted object
        proposals on the original image, and the ground-truth and predicted 3D cuboids.
        Users can implement different visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list of predicted proposals. Both batched_inputs
                and proposals should have the same length.
            instances (list): a list of predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
        """
        storage = get_event_storage()

        # maximum number of boxes to visualize per image
        max_vis_prop = 20

        if not hasattr(self, 'thing_classes'):
            self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
            self.num_classes = len(self.thing_classes)

        for input, prop, instances_i in zip(batched_inputs, proposals, instances):

            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR

            '''
            Visualize the 2D GT and proposal predictions
            '''
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
            vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
            storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)

            '''
            Visualize the 3D GT and predictions
            '''
            K = torch.tensor(input['K'], device=self.device)
            scale = input['height'] / img.shape[0]
            fx, sx = (val.item() / scale for val in K[0, [0, 2]])
            fy, sy = (val.item() / scale for val in K[1, [1, 2]])

            K_scaled = torch.tensor(
                [[1 / scale, 0, 0], [0, 1 / scale, 0], [0, 0, 1.0]],
                dtype=torch.float32, device=self.device
            ) @ K

            gts_per_image = input["instances"]
            gt_classes = gts_per_image.gt_classes

            # filter out irrelevant ground truth
            fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)

            gt_classes = gt_classes[fg_selection_mask]
            gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
            gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask]  # 2D boxes
            gt_poses = gts_per_image.gt_poses[fg_selection_mask]  # GT poses

            # projected 2D center, depth, w, h, l, 3D center
            gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]

            # this box may have been mirrored and scaled, so
            # we need to recompute XYZ in 3D by backprojecting.
            gt_z = gt_boxes3D[:, 2]
            gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx) / fx
            gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy) / fy

            # put together the GT boxes
            gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
            gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)

            gt_colors = torch.tensor(
                [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
                device=self.device
            ) / 255.0

            gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)

            # perform a simple NMS, which is not class dependent.
            keep = batched_nms(
                instances_i.pred_boxes.tensor,
                instances_i.scores,
                torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
                self.roi_heads.box_predictor.test_nms_thresh
            )
            keep = keep[:max_vis_prop]
            num_to_visualize = len(keep)

            pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
            pred_pose = instances_i.pred_pose[keep]

            pred_colors = torch.tensor(
                [util.get_color(i) for i in range(num_to_visualize)],
                device=self.device
            ) / 255.0

            pred_boxes = instances_i.pred_boxes[keep]
            pred_scores = instances_i.scores[keep]
            pred_classes = instances_i.pred_classes[keep]
            pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
            pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)

            # convert to lists
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]

            img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
            img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)

            # horizontally stack the 3D GT and predictions left/right
            vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # RGB
            vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)

            storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)

            break  # only visualize one image in a batch
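    # Backprojection used above, as a small worked example with illustrative numbers:
    # with fx = fy = 500, principal point (sx, sy) = (320, 240), a projected box center
    # at pixel (420, 300) and depth z = 2.0 m, the camera-space center is
    #
    #   X = z * (u - sx) / fx = 2.0 * (420 - 320) / 500 = 0.4
    #   Y = z * (v - sy) / fy = 2.0 * (300 - 240) / 500 = 0.24
    #
    # which is exactly what gt_x3D / gt_y3D compute per ground-truth box.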
@META_ARCH_REGISTRY.register()
class RCNN3D_combined_features(nn.Module):

    @configurable
    def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model):
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads
        self.input_format = input_format
        self.vis_period = vis_period
        self.depth_model = depth_model

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
    @classmethod
    def from_config(cls, cfg, priors=None):
        backbone = build_backbone(cfg, priors=priors)
        if cfg.MODEL.DEPTH_ON:
            depth_model = 'zoedepth'
            pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt'
            # NOTE: the depth model could be made learnable as well
            d_model = setup_depth_model(depth_model, pretrained_resource)
            # the RoI heads see image and depth features concatenated channel-wise,
            # so the expected channel count is doubled relative to the backbone output
            shape_modified = {key: ShapeSpec(i.channels * 2, stride=i.stride) for key, i in backbone.output_shape().items()}
        else:
            d_model = None
            shape_modified = backbone.output_shape()
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, shape_modified, priors=priors),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            "depth_model": d_model,
        }
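    # Channel-doubling sketch: if the backbone reports, say, 'p3': ShapeSpec(channels=256, stride=8),
    # then cat_depth_features() produces a 512-channel 'p3' tensor, which is why the RoI heads are
    # built from shape_modified rather than backbone.output_shape() when cfg.MODEL.DEPTH_ON is set.
    # (256 is an assumed channel count, used here for illustration only.)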
    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)
    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
        """
        Normalize, pad and batch the input images.

        Args:
            normalise: subtract pixel_mean and divide by pixel_std.
            img_type: which key of the input dict to read (e.g. "image", "depth_map", "ground_map").
            convert: swap the channel order from BGR to RGB.
            NoOp: batch the tensors without size-divisibility padding and return early.
            to_float: rescale uint8 images to floats in [0, 1].
        """
        images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
        if normalise:
            images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        if convert:
            # convert from BGR to RGB
            images = [x[[2, 1, 0], :, :] for x in images]
        if to_float:
            images = [x.float() / 255.0 for x in images]
        if NoOp:
            images = ImageList.from_tensors(images)
            return images
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images
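    # Typical flag combinations, matching how forward()/inference() below call this method;
    # the tensors involved are whatever the dataloader provides under the given keys:
    #
    #   images = self.preprocess_image(batched_inputs)                       # normalized backbone input
    #   images_raw = self.preprocess_image(batched_inputs, img_type='image',
    #                                      convert=True, normalise=False, NoOp=True)  # raw RGB for the depth model
    #   depth_maps = self.preprocess_image(batched_inputs, img_type='depth_map',
    #                                      normalise=False, NoOp=True)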
    def _standardize(self, x: torch.Tensor, y: torch.Tensor):
        '''standardise x to match the mean and std of y'''
        ym = y.mean()
        ys = y.std()
        xm = x.mean()
        xs = x.std()
        return (x - xm) * (ys / xs) + ym
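    # Small sanity-check sketch for _standardize: x is mapped onto y's statistics.
    #
    #   x = torch.tensor([0.0, 1.0, 2.0, 3.0])       # mean 1.5
    #   y = torch.tensor([10.0, 20.0, 30.0, 40.0])   # mean 25.0
    #   z = (x - x.mean()) * (y.std() / x.std()) + y.mean()
    #   # z == tensor([10., 20., 30., 40.]); z.mean() == 25.0 and z.std() == y.std()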
    def cat_depth_features(self, features, images_raw):
        pred_o = self.depth_model(images_raw.tensor.float() / 255.0)
        # depth features corresponding to p2, p3, p4, p5
        d_features = pred_o['depth_features']
        # each depth feature must be resized to the matching conv feature's spatial size,
        # otherwise the scales will not correspond correctly in the RoI pooling
        for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
            d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True)
            d_feature = self._standardize(d_feature, img_feature)
            features[layer] = torch.cat((img_feature, d_feature), dim=1)
        return features
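    # Shape-level sketch of the concatenation above (the channel counts are assumptions):
    #
    #   img_feature = torch.randn(1, 256, 64, 80)    # e.g. one FPN level
    #   d_feature = torch.randn(1, 256, 120, 160)    # depth feature at a different resolution
    #   d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:],
    #                             mode='bilinear', align_corners=True)   # -> (1, 256, 64, 80)
    #   fused = torch.cat((img_feature, self._standardize(d_feature, img_feature)), dim=1)
    #   # fused.shape == (1, 512, 64, 80), matching the doubled ShapeSpec in from_config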
    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        if not self.training:
            # the segmentor is simply None at inference time because the loss is not needed
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        # NOTE: images_raw are padded to the size of the largest image in the batch.
        # This is necessary because the images have different sizes, so to batch them
        # they must each be the same size.
        images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
        # depth maps are batched in the same way, if present
        depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)

        # NOTE: if a single ground map in the batch is missing, the ground map is skipped
        # for the entire batch; a dummy tensor is inserted here to indicate the failure
        ground_maps_fail = [i['ground_map'] is None for i in batched_inputs]
        ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x]
        for idx in ground_maps_fail_idx:
            batched_inputs[idx]['ground_map'] = torch.tensor([[1]])  # dummy to indicate a fail
        ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]

        # the unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)

        if self.depth_model is not None:
            features = self.cat_depth_features(features, images_raw)

        instances, detector_losses = self.roi_heads(
            images, images_raw, ground_maps, depth_maps, features, proposals,
            Ks, im_scales_ratio,
            gt_instances
        )

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0 and storage.iter > 0:
                self.visualize_training(batched_inputs, proposals, instances)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses
    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
        # ground maps and depth maps are assumed unavailable at inference time
        ground_maps = None
        depth_maps = None

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
        # the unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        features = self.backbone(images.tensor)

        # pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
            oracles = [b['oracle2D'] for b in batched_inputs]
            results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None)
        # normal inference
        else:
            proposals, _ = self.proposal_generator(images, features, None)
            if self.depth_model is not None:
                features = self.cat_depth_features(features, images_raw)
            # pred boxes are proposals
            results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        else:
            return results
    def visualize_training(self, batched_inputs, proposals, instances):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image, up to 20 top-scoring predicted object
        proposals on the original image, and the ground-truth and predicted 3D cuboids.
        Users can implement different visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list of predicted proposals. Both batched_inputs
                and proposals should have the same length.
            instances (list): a list of predicted RoI head instances, or None to
                visualize only the 2D boxes. If given, it should have the same
                length as batched_inputs.
        """
        storage = get_event_storage()

        # maximum number of boxes to visualize per image
        max_vis_prop = 20

        if not hasattr(self, 'thing_classes'):
            self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
            self.num_classes = len(self.thing_classes)

        only2d = instances is None
        if only2d:
            instances = [None] * len(batched_inputs)

        for input, prop, instances_i in zip(batched_inputs, proposals, instances):

            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR

            '''
            Visualize the 2D GT and proposal predictions
            '''
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
            vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
            storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)

            if only2d:
                break

            '''
            Visualize the 3D GT and predictions
            '''
            K = torch.tensor(input['K'], device=self.device)
            scale = input['height'] / img.shape[0]
            fx, sx = (val.item() / scale for val in K[0, [0, 2]])
            fy, sy = (val.item() / scale for val in K[1, [1, 2]])

            K_scaled = torch.tensor(
                [[1 / scale, 0, 0], [0, 1 / scale, 0], [0, 0, 1.0]],
                dtype=torch.float32, device=self.device
            ) @ K

            gts_per_image = input["instances"]
            gt_classes = gts_per_image.gt_classes

            # filter out irrelevant ground truth
            fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)

            gt_classes = gt_classes[fg_selection_mask]
            gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
            gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask]  # 2D boxes
            gt_poses = gts_per_image.gt_poses[fg_selection_mask]  # GT poses

            # projected 2D center, depth, w, h, l, 3D center
            gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]

            # this box may have been mirrored and scaled, so
            # we need to recompute XYZ in 3D by backprojecting.
            gt_z = gt_boxes3D[:, 2]
            gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx) / fx
            gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy) / fy

            # put together the GT boxes
            gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
            gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)

            gt_colors = torch.tensor(
                [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
                device=self.device
            ) / 255.0

            gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)

            # perform a simple NMS, which is not class dependent.
            keep = batched_nms(
                instances_i.pred_boxes.tensor,
                instances_i.scores,
                torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
                self.roi_heads.box_predictor.test_nms_thresh
            )
            keep = keep[:max_vis_prop]
            num_to_visualize = len(keep)

            pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
            pred_pose = instances_i.pred_pose[keep]

            pred_colors = torch.tensor(
                [util.get_color(i) for i in range(num_to_visualize)],
                device=self.device
            ) / 255.0

            pred_boxes = instances_i.pred_boxes[keep]
            pred_scores = instances_i.scores[keep]
            pred_classes = instances_i.pred_classes[keep]
            pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
            pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)

            # convert to lists
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]

            img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
            img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)

            # horizontally stack the 3D GT and predictions left/right
            vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # RGB
            vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)

            storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)

            break  # only visualize one image in a batch
@META_ARCH_REGISTRY.register()
class BoxNet(nn.Module):

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        roi_heads: nn.Module,
        pixel_mean: tuple[float],
        pixel_std: tuple[float],
        input_format: Optional[str] = None,
        vis_period: int = 0,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            roi_heads: a ROI head that performs per-region computation
            pixel_mean, pixel_std: list or tuple with #channels elements, representing
                the per-channel mean and std used to normalize the input image
            input_format: describes the meaning of the channels of the input. Needed by visualization
            vis_period: the period to run visualization. Set to 0 to disable.
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads

        self.input_format = input_format
        self.vis_period = vis_period
        if vis_period > 0:
            assert input_format is not None, "input_format is required for visualization!"

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
    @classmethod
    def from_config(cls, cfg, priors=None):
        backbone = build_backbone(cfg, priors=priors)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }
    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)
    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
        """
        Normalize, pad and batch the input images.
        """
        images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
        if normalise:
            images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        else:
            if convert:
                # convert from BGR to RGB
                images = [x[[2, 1, 0], :, :] for x in images]
            if to_float:
                images = [x.float() / 255.0 for x in images]
        if NoOp:
            images = ImageList.from_tensors(images, 0)
            return images
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images
    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]], experiment_type={'use_pred_boxes': True}, proposal_function='propose'):
        if not self.training:
            if not experiment_type['use_pred_boxes']:  # MABO
                return self.inference(batched_inputs, do_postprocess=False, experiment_type=experiment_type, proposal_function=proposal_function)
            else:  # AP
                return self.inference(batched_inputs, do_postprocess=True, experiment_type=experiment_type, proposal_function=proposal_function)

        if self.training:
            images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
            images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
            depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
            if batched_inputs[0]['ground_map'] is not None:
                ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
                if not torch.count_nonzero(ground_maps.tensor):  # for some reason there is a single ground map causing problems
                    print('no_ground for', batched_inputs[0]['image_id'])
                    ground_maps = None
            else:
                ground_maps = None

            # scaling factor for the sample relative to its original scale
            # e.g., how much has the image been upsampled or downsampled?
            im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
            # the unmodified intrinsics for the image
            Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
            features = None
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]

            # roi_heads forward signature, for reference:
            # def forward(self, images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, segmentor, experiment_type, proposal_function, targets=None):
            results = self.roi_heads(images, images_raw, None, depth_maps, ground_maps, features, gt_instances, Ks, im_scales_ratio, experiment_type, proposal_function)
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
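    # Sketch of how the two evaluation modes above are selected (names as used in this file;
    # the RoI heads may expect additional experiment_type keys beyond 'use_pred_boxes'):
    #
    #   model.eval()
    #   # AP-style evaluation: backbone/RPN proposals, post-processed predictions
    #   preds = model(batched_inputs, experiment_type={'use_pred_boxes': True})
    #   # MABO-style evaluation: ground-truth boxes passed through as proposals, raw outputs
    #   raw = model(batched_inputs, experiment_type={'use_pred_boxes': False})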
    def inference(self,
                  batched_inputs: List[Dict[str, torch.Tensor]],
                  detected_instances: Optional[List[Instances]] = None, do_postprocess: bool = True, experiment_type={}, proposal_function='propose'):
        assert not self.training

        # the same preprocessing must be applied to the image, the depth map, and the mask,
        # except that the input for the segmentation method is not normalised
        images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
        images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
        depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
        if batched_inputs[0]['ground_map'] is not None:
            ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
        else:
            # ground map file not found, setting to None
            ground_maps = None
            # TODO: add logic to predict the ground map on the fly
            # (see generate_ground_segmentations.py for reference)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
        # the unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        # do_postprocess is equivalent to using predicted boxes
        if do_postprocess:
            # gt_instances should be None in inference mode
            features = self.backbone(images.tensor)
            # normal inference
            proposals, _ = self.proposal_generator(images, features, None)
        else:
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            else:
                gt_instances = None
            features, proposals = None, gt_instances

        # combined_features = self.scorenet_base.forward_features(images, images_raw)
        combined_features = None

        # open question: is it necessary to resize the images back?
        # use the mask and the 2D box to predict the 3D box;
        # proposals are ground truth for MABO plots and predictions for AP plots
        results = self.roi_heads(images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, experiment_type, proposal_function)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        else:
            return results
    def visualize_training(self, batched_inputs, proposals, instances):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image, up to 20 top-scoring predicted object
        proposals on the original image, and the ground-truth and predicted 3D cuboids.
        Users can implement different visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list of predicted proposals. Both batched_inputs
                and proposals should have the same length.
            instances (list): a list of predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
        """
        storage = get_event_storage()

        # maximum number of boxes to visualize per image
        max_vis_prop = 20

        if not hasattr(self, 'thing_classes'):
            self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
            self.num_classes = len(self.thing_classes)

        for input, prop, instances_i in zip(batched_inputs, proposals, instances):

            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # BGR

            '''
            Visualize the 2D GT and proposal predictions
            '''
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
            vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
            storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)

            '''
            Visualize the 3D GT and predictions
            '''
            K = torch.tensor(input['K'], device=self.device)
            scale = input['height'] / img.shape[0]
            fx, sx = (val.item() / scale for val in K[0, [0, 2]])
            fy, sy = (val.item() / scale for val in K[1, [1, 2]])

            K_scaled = torch.tensor(
                [[1 / scale, 0, 0], [0, 1 / scale, 0], [0, 0, 1.0]],
                dtype=torch.float32, device=self.device
            ) @ K

            gts_per_image = input["instances"]
            gt_classes = gts_per_image.gt_classes

            # filter out irrelevant ground truth
            fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)

            gt_classes = gt_classes[fg_selection_mask]
            gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
            gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask]  # 2D boxes
            gt_poses = gts_per_image.gt_poses[fg_selection_mask]  # GT poses

            # projected 2D center, depth, w, h, l, 3D center
            gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]

            # this box may have been mirrored and scaled, so
            # we need to recompute XYZ in 3D by backprojecting.
            gt_z = gt_boxes3D[:, 2]
            gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx) / fx
            gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy) / fy

            # put together the GT boxes
            gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
            gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)

            gt_colors = torch.tensor(
                [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
                device=self.device
            ) / 255.0

            gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)

            # perform a simple NMS, which is not class dependent.
            keep = batched_nms(
                instances_i.pred_boxes.tensor,
                instances_i.scores,
                torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
                self.roi_heads.box_predictor.test_nms_thresh
            )
            keep = keep[:max_vis_prop]
            num_to_visualize = len(keep)

            pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
            pred_pose = instances_i.pred_pose[keep]

            pred_colors = torch.tensor(
                [util.get_color(i) for i in range(num_to_visualize)],
                device=self.device
            ) / 255.0

            pred_boxes = instances_i.pred_boxes[keep]
            pred_scores = instances_i.scores[keep]
            pred_classes = instances_i.pred_classes[keep]
            pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
            pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)

            # convert to lists
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]

            img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
            img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)

            # horizontally stack the 3D GT and predictions left/right
            vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # RGB
            vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)

            storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)

            break  # only visualize one image in a batch
def build_model(cfg, priors=None):
    """
    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
    Note that it does not load any weights from ``cfg``.
    """
    meta_arch = cfg.MODEL.META_ARCHITECTURE
    model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors)
    model.to(torch.device(cfg.MODEL.DEVICE))
    _log_api_usage("modeling.meta_arch." + meta_arch)
    return model
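# Usage sketch: given a config naming one of the meta-architectures defined above,
# build_model instantiates it and moves it to the configured device. The specific
# config values shown are illustrative assumptions, not defaults of this project.
#
#   cfg.MODEL.META_ARCHITECTURE = "RCNN3D"
#   cfg.MODEL.DEVICE = "cuda"
#   model = build_model(cfg, priors=None)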
def build_backbone(cfg, input_shape=None, priors=None):
    """
    Build a backbone from ``cfg.MODEL.BACKBONE.NAME``.

    Returns:
        an instance of :class:`Backbone`
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))

    backbone_name = cfg.MODEL.BACKBONE.NAME
    backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors)
    assert isinstance(backbone, Backbone)
    return backbone