# Copyright (c) Meta Platforms, Inc. and affiliates
import logging
from typing import Dict, List, Optional, Tuple
from detectron2.layers import move_device_like
from detectron2.structures.image_list import ImageList
import torch
import numpy as np
from detectron2.layers import ShapeSpec, batched_nms
from detectron2.utils.visualizer import Visualizer
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.structures import Instances
from detectron2.utils.events import get_event_storage
from detectron2.data import MetadataCatalog
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.utils.logger import _log_api_usage
from detectron2.modeling.meta_arch import (
META_ARCH_REGISTRY, GeneralizedRCNN
)
from cubercnn.data.generate_depth_maps import setup_depth_model
from cubercnn.modeling.roi_heads import build_roi_heads
from cubercnn import util, vis
import torch.nn.functional as F
from detectron2.config import configurable
import torch.nn as nn
logger = logging.getLogger(__name__)
@META_ARCH_REGISTRY.register()
class RCNN3D(GeneralizedRCNN):
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
}
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
if not self.training:
return self.inference(batched_inputs)
images = self.preprocess_image(batched_inputs)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
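        # e.g., an image annotated at height 1080 that was resized to 540 px tall
        # yields a ratio of 1080 / 540 = 2.0 (numbers illustrative)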
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
        # the backbone is actually an FPN, with the DLA model as the bottom-up structure.
        # FPN: https://arxiv.org/abs/1612.03144v2
        # the backbone and proposal generator only operate on 2D images and annotations.
features = self.backbone(images.tensor)
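        # features maps FPN level names to feature tensors, typically
        # {'p2': (N, C, H/4, W/4), 'p3': (N, C, H/8, W/8), ...}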
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
instances, detector_losses = self.roi_heads(
images, features, proposals,
Ks, im_scales_ratio,
gt_instances
)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0 and storage.iter > 0:
self.visualize_training(batched_inputs, proposals, instances)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
assert not self.training
images = self.preprocess_image(batched_inputs)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = self.backbone(images.tensor)
# Pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
oracles = [b['oracle2D'] for b in batched_inputs]
results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)
# normal inference
else:
proposals, _ = self.proposal_generator(images, features, None)
results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list): a list that contains predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
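            # left-multiplying by diag(1/scale, 1/scale, 1) divides the first two rows
            # of K, i.e. fx, fy and the principal point (sx, sy), by `scale`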
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
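            # standard pinhole relation: u = fx * X / Z + sx  =>  X = Z * (u - sx) / fx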
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break # only visualize one image in a batch
@META_ARCH_REGISTRY.register()
class RCNN3D_combined_features(nn.Module):
@configurable
def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model):
super().__init__()
self.backbone = backbone
self.proposal_generator = proposal_generator
self.roi_heads = roi_heads
self.input_format = input_format
self.vis_period = vis_period
self.depth_model = depth_model
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
if cfg.MODEL.DEPTH_ON:
depth_model = 'zoedepth'
pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt'
            # NOTE: the depth model could be made learnable as well
            d_model = setup_depth_model(depth_model, pretrained_resource)
            shape_modified = {key: ShapeSpec(i.channels * 2, stride=i.stride) for key, i in backbone.output_shape().items()}
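            # channels are doubled because cat_depth_features concatenates depth
            # features with the image features along the channel dimension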
else:
d_model = None
shape_modified = backbone.output_shape()
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, shape_modified, priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
"depth_model": d_model,
}
@property
def device(self):
return self.pixel_mean.device
def _move_to_current_device(self, x):
return move_device_like(x, self.pixel_mean)
def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
"""
Normalize, pad and batch the input images.
"""
images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
if normalise:
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
if convert:
# convert from BGR to RGB
images = [x[[2,1,0],:,:] for x in images]
if to_float:
images = [x.float()/255.0 for x in images]
if NoOp:
images = ImageList.from_tensors(images)
return images
images = ImageList.from_tensors(
images,
self.backbone.size_divisibility,
padding_constraints=self.backbone.padding_constraints,
)
return images
def _standardize(self, x:torch.Tensor, y:torch.Tensor):
'''standardise x to match the mean and std of y'''
ym = y.mean()
ys = y.std()
xm = x.mean()
xs = x.std()
return (x - xm) * (ys / xs) + ym
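    # e.g. (illustrative numbers): for x with mean 5 and std 2, and y with mean 0
    # and std 1, _standardize(x, y) computes (x - 5) * (1 / 2) + 0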
def cat_depth_features(self, features, images_raw):
pred_o = self.depth_model(images_raw.tensor.float()/255.0)
# depth features corresponding to p2, p3, p4, p5
d_features = pred_o['depth_features']
# img_features = features['p5']
# we must scale the depth map to the same size as the conv feature, otherwise the scale will not correspond correctly in the roi pooling
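        # the reversed() below assumes d_features is ordered opposite to the
        # features dict, so each depth feature meets its matching FPN level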
for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True)
d_feature = self._standardize(d_feature, img_feature)
features[layer] = torch.cat((img_feature, d_feature), dim=1)
return features
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
if not self.training:
            return self.inference(batched_inputs)  # the segmentor is None at inference time since the loss is not needed
images = self.preprocess_image(batched_inputs)
        # NOTE: images_raw are padded to the size of the largest image in the batch;
        # the images have different sizes, so they must match before they can be batched.
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
        # depth maps are available here if needed
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
        # Note: if even a single ground map in the batch is missing, the ground maps are skipped for the entire batch
ground_maps_fail = [i['ground_map'] is None for i in batched_inputs]
ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x]
for idx in ground_maps_fail_idx:
batched_inputs[idx]['ground_map'] = torch.tensor([[1]]) # make a dummy to indicate a fail
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
features = self.backbone(images.tensor)
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
if self.depth_model is not None:
features = self.cat_depth_features(features, images_raw)
instances, detector_losses = self.roi_heads(
images, images_raw, ground_maps, depth_maps, features, proposals,
Ks, im_scales_ratio,
gt_instances
)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0 and storage.iter > 0:
self.visualize_training(batched_inputs, proposals, instances)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
assert not self.training
images = self.preprocess_image(batched_inputs)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
# do we assume no access to ground maps in inference?
ground_maps = None
depth_maps = None
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = self.backbone(images.tensor)
# Pass oracle 2D boxes into the RoI heads
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
oracles = [b['oracle2D'] for b in batched_inputs]
results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None)
# normal inference
else:
proposals, _ = self.proposal_generator(images, features, None)
if self.depth_model is not None:
features = self.cat_depth_features(features, images_raw)
# pred boxes are proposals
results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list or None): a list that contains predicted RoI head instances,
                or None when only 2D outputs are available. batched_inputs and instances
                should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
only2d = instances is None
if only2d:
instances = [None]*len(batched_inputs)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
if only2d:
break
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break # only visualize one image in a batch
@META_ARCH_REGISTRY.register()
class BoxNet(nn.Module):
@configurable
def __init__(
self,
*,
backbone: Backbone,
proposal_generator: nn.Module,
roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
input_format: Optional[str] = None,
vis_period: int = 0,
):
"""
Args:
backbone: a backbone module, must follow detectron2's backbone interface
proposal_generator: a module that generates proposals using backbone features
roi_heads: a ROI head that performs per-region computation
pixel_mean, pixel_std: list or tuple with #channels element, representing
the per-channel mean and std to be used to normalize the input image
input_format: describe the meaning of channels of input. Needed by visualization
vis_period: the period to run visualization. Set to 0 to disable.
"""
super().__init__()
self.backbone = backbone
self.proposal_generator = proposal_generator
self.roi_heads = roi_heads
self.input_format = input_format
self.vis_period = vis_period
if vis_period > 0:
assert input_format is not None, "input_format is required for visualization!"
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
@classmethod
def from_config(cls, cfg, priors=None):
backbone = build_backbone(cfg, priors=priors)
return {
"backbone": backbone,
"proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
"roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
"input_format": cfg.INPUT.FORMAT,
"vis_period": cfg.VIS_PERIOD,
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
"pixel_std": cfg.MODEL.PIXEL_STD,
}
@property
def device(self):
return self.pixel_mean.device
def _move_to_current_device(self, x):
return move_device_like(x, self.pixel_mean)
def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
"""
Normalize, pad and batch the input images.
"""
images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
if normalise:
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
else:
if convert:
# convert from BGR to RGB
images = [x[[2,1,0],:,:] for x in images]
if to_float:
images = [x.float()/255.0 for x in images]
if NoOp:
images = ImageList.from_tensors(images,0,)
return images
images = ImageList.from_tensors(
images,
self.backbone.size_divisibility,
padding_constraints=self.backbone.padding_constraints,
)
return images
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]], experiment_type={'use_pred_boxes':True}, proposal_function='propose'):
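        # experiment_type is a dict of flags; 'use_pred_boxes' selects between
        # evaluating with predicted 2D boxes (AP) or GT boxes as proposals (MABO)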
if not self.training:
            if not experiment_type['use_pred_boxes']:  # MABO: GT boxes serve as proposals
                return self.inference(batched_inputs, do_postprocess=False, experiment_type=experiment_type, proposal_function=proposal_function)
            else:  # AP: standard inference with predicted boxes
                return self.inference(batched_inputs, do_postprocess=True, experiment_type=experiment_type, proposal_function=proposal_function)
if self.training:
images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
if batched_inputs[0]['ground_map'] is not None:
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
                if not torch.count_nonzero(ground_maps.tensor):  # an all-zero ground map occasionally occurs; treat it as missing
                    logger.info('no ground map for %s', batched_inputs[0]['image_id'])
                    ground_maps = None
else:
ground_maps = None
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
features = None
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
# def forward(self, images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, segmentor, experiment_type, proposal_function, targets=None):
results = self.roi_heads(images, images_raw, None, depth_maps, ground_maps, features, gt_instances, Ks, im_scales_ratio, experiment_type, proposal_function)
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
def inference(self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None, do_postprocess: bool = True, experiment_type={}, proposal_function='propose'):
assert not self.training
# must apply the same preprocessing to both the image, the depth map, and the mask
# except don't normalise the input for the segmentation method
images = self.preprocess_image(batched_inputs, img_type='image', convert=False)
images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
if batched_inputs[0]['ground_map'] is not None:
ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
else:
#logger.info("ground map file not found, setting to None")
ground_maps = None
# TODO: make logic to predict ground map on the fly
# logger.info("ground map file not found, computing...")
# raise NotImplementedError("Implement ground on the fly, see generate_ground_segmentations.py for reference")
# scaling factor for the sample relative to its original scale
# e.g., how much has the image been upsampled by? or downsampled?
im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
# The unmodified intrinsics for the image
Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
        # do_postprocess implies using predicted boxes (AP); otherwise GT boxes serve as proposals (MABO)
if do_postprocess:
# gt_instances should be None in inference mode
features = self.backbone(images.tensor)
# normal inference
proposals, _ = self.proposal_generator(images, features, None)
else:
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
features, proposals = None, gt_instances
# combined_features = self.scorenet_base.forward_features(images, images_raw)
combined_features = None
# is it necessary to resize images back???
# use the mask and the 2D box to predict the 3D box
# proposals are ground truth for MABO plots and predictions for AP plots
results = self.roi_heads(images, images_raw, combined_features, depth_maps, ground_maps, features, proposals, Ks, im_scales_ratio, experiment_type, proposal_function)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results #[{'instances':results}]
def visualize_training(self, batched_inputs, proposals, instances):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 top-scoring predicted
object proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
            instances (list): a list that contains predicted RoI head instances. Both
                batched_inputs and instances should have the same length.
"""
storage = get_event_storage()
        # maximum number of boxes to visualize per image
max_vis_prop = 20
if not hasattr(self, 'thing_classes'):
self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
self.num_classes = len(self.thing_classes)
for input, prop, instances_i in zip(batched_inputs, proposals, instances):
img = input["image"]
img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 0]])  # RGB -> BGR
'''
Visualize the 2D GT and proposal predictions
'''
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
'''
Visualize the 3D GT and predictions
'''
K = torch.tensor(input['K'], device=self.device)
scale = input['height']/img.shape[0]
fx, sx = (val.item()/scale for val in K[0, [0, 2]])
fy, sy = (val.item()/scale for val in K[1, [1, 2]])
K_scaled = torch.tensor(
[[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
dtype=torch.float32, device=self.device
) @ K
gts_per_image = input["instances"]
gt_classes = gts_per_image.gt_classes
# Filter out irrelevant groundtruth
fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
gt_classes = gt_classes[fg_selection_mask]
gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
# projected 2D center, depth, w, h, l, 3D center
gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_z = gt_boxes3D[:, 2]
gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
# put together the GT boxes
gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
gt_colors = torch.tensor(
[util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
device=self.device
)/255.0
gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
            # perform a simple class-agnostic NMS: the all-zero class indices make
            # batched_nms treat every box as belonging to the same class
keep = batched_nms(
instances_i.pred_boxes.tensor,
instances_i.scores,
torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
self.roi_heads.box_predictor.test_nms_thresh
)
keep = keep[:max_vis_prop]
num_to_visualize = len(keep)
pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
pred_pose = instances_i.pred_pose[keep]
pred_colors = torch.tensor(
[util.get_color(i) for i in range(num_to_visualize)],
device=self.device
)/255.0
pred_boxes = instances_i.pred_boxes[keep]
pred_scores = instances_i.scores[keep]
pred_classes = instances_i.pred_classes[keep]
pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
            # convert the batched meshes to lists of individual meshes
            pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
# horizontal stack 3D GT and pred left/right
vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # BGR -> RGB
vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
break
def build_model(cfg, priors=None):
"""
Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
Note that it does not load any weights from ``cfg``.
"""
meta_arch = cfg.MODEL.META_ARCHITECTURE
model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors)
model.to(torch.device(cfg.MODEL.DEVICE))
_log_api_usage("modeling.meta_arch." + meta_arch)
return model
def build_backbone(cfg, input_shape=None, priors=None):
"""
Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
Returns:
an instance of :class:`Backbone`
"""
if input_shape is None:
input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
backbone_name = cfg.MODEL.BACKBONE.NAME
backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors)
assert isinstance(backbone, Backbone)
return backbone
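# Usage sketch (illustrative; assumes a cfg with the cubercnn defaults merged in):
#   cfg.MODEL.META_ARCHITECTURE = 'RCNN3D'  # or 'RCNN3D_combined_features' / 'BoxNet'
#   model = build_model(cfg, priors=None)   # priors may be None or precomputed category statistics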