Spaces:
Sleeping
Sleeping
File size: 10,308 Bytes
56bd2b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
# Copyright (c) Meta Platforms, Inc. and affiliates
import copy
import logging
from detectron2.config.config import configurable
from detectron2.data.transforms.augmentation import AugmentationList
import torch
import numpy as np
from detectron2.structures import BoxMode, Keypoints
from detectron2.data import detection_utils
from detectron2.data import transforms as T
from detectron2.data import (
DatasetMapper
)
from detectron2.structures import (
Boxes,
BoxMode,
Instances,
)
from typing import List, Optional, Union
from PIL import Image
class DatasetMapper3D(DatasetMapper):
    """
    Dataset mapper that reads an image, runs the configured augmentations on it,
    and attaches a depth map (and a ground map when a path is provided) to the
    returned dict. When ``mode == 'cube_rcnn'`` and ``is_train`` is True, the
    annotations are additionally transformed and packed into ``Instances``.

    NOTE: the ``'cube_rcnn'`` training path reads ``self.dataset_id_to_unknown_cats``
    (indexed by ``dataset_dict['dataset_id']``). This class never sets that
    attribute itself, so it has to be attached to the mapper before it is called.
    """

    @configurable
    def __init__(
        self,
        is_train: bool,
        *,
        augmentations: List[Union[T.Augmentation, T.Transform]],
        image_format: str,
        mode: Optional[str] = None,
        use_instance_mask: bool = False,
        use_keypoint: bool = False,
        instance_mask_format: str = "polygon",
        keypoint_hflip_indices: Optional[np.ndarray] = None,
        precomputed_proposal_topk: Optional[int] = None,
        recompute_boxes: bool = False,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: whether it's used in training or inference
            mode: ``'cube_rcnn'`` enables annotation processing in ``__call__``;
                any other value (``from_config`` passes ``'get_depth_maps'`` by
                default, direct construction defaults to ``None``) only produces
                the image, depth map and ground map.
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            use_instance_mask: whether to process instance segmentation annotations, if available
            use_keypoint: whether to process keypoint annotations if available
            instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
                masks into this format.
            keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
            precomputed_proposal_topk: if given, will load pre-computed
                proposals from dataset_dict and keep the top k proposals for each image.
            recompute_boxes: whether to overwrite bounding box annotations
                by computing tight bounding boxes from instance mask annotations.
        """
        if recompute_boxes:
            assert use_instance_mask, "recompute_boxes requires instance masks"
        # fmt: off
        self.is_train               = is_train
        self.augmentations          = T.AugmentationList(augmentations)
        self.image_format           = image_format
        self.use_instance_mask      = use_instance_mask
        self.instance_mask_format   = instance_mask_format
        self.use_keypoint           = use_keypoint
        self.keypoint_hflip_indices = keypoint_hflip_indices
        self.proposal_topk          = precomputed_proposal_topk
        self.recompute_boxes        = recompute_boxes
        # fmt: on
        logger = logging.getLogger(__name__)
        mode_out = "training" if is_train else "inference"
        logger.info(f"[DatasetMapper] Augmentations used in {mode_out}: {augmentations}")
        self.mode = mode

    @classmethod
    def from_config(cls, cfg, is_train: bool = True, mode='get_depth_maps'):
        """
        Translate a config node into the keyword arguments of ``__init__``.

        Args:
            cfg: config node; the ``INPUT``, ``MODEL`` and ``DATASETS`` entries
                referenced below are read from it.
            is_train: forwarded to ``__init__`` and used to pick train/test settings.
            mode: forwarded to ``__init__`` unchanged.

        Returns:
            dict: keyword arguments for ``__init__``.
        """
        augs = detection_utils.build_augmentation(cfg, is_train)
        if cfg.INPUT.CROP.ENABLED and is_train:
            # cropping goes first so the remaining augmentations see the crop
            augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
            recompute_boxes = cfg.MODEL.MASK_ON
        else:
            recompute_boxes = False

        ret = {
            "is_train": is_train,
            "mode": mode,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "use_instance_mask": cfg.MODEL.MASK_ON,
            "instance_mask_format": cfg.INPUT.MASK_FORMAT,
            "use_keypoint": cfg.MODEL.KEYPOINT_ON,
            "recompute_boxes": recompute_boxes,
        }

        if cfg.MODEL.KEYPOINT_ON:
            ret["keypoint_hflip_indices"] = detection_utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)

        if cfg.MODEL.LOAD_PROPOSALS:
            ret["precomputed_proposal_topk"] = (
                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
                if is_train
                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
            )
        return ret

    @staticmethod
    def _load_resized_map(path, key, height_width):
        """Load entry ``key`` from the ``np.load``-able file at ``path`` and
        nearest-neighbour resize it to ``height_width`` (given as ``(h, w)``)."""
        stored = np.load(path)[key]
        # PIL's resize expects (width, height), hence the reversal
        resized = np.array(Image.fromarray(stored).resize(height_width[::-1], Image.NEAREST))
        # Wrapped in AugInput exactly as before; no augmentation is run on it here.
        return T.AugInput(resized).image

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): metadata of one image. Reads ``file_name`` and
                ``depth_image_path``; optionally ``ground_image_path``; and, in
                ``'cube_rcnn'`` training mode, ``annotations``, ``dataset_id`` and ``K``.

        Returns:
            dict: a deep copy of the input with ``image``, ``depth_map`` and
            ``ground_map`` (``None`` when no ground path is given) added, plus
            ``instances`` when annotations were processed.

        Raises:
            AttributeError: in ``'cube_rcnn'`` training mode with annotations present
                when ``dataset_id_to_unknown_cats`` has not been attached to the mapper.
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format)
        detection_utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        transforms = self.augmentations(aug_input)
        image = aug_input.image
        image_shape = image.shape[:2]  # h, w

        # NOTE(review): the depth and ground maps are only resized to the augmented
        # image size; the transforms returned above (e.g. a flip) are not applied to
        # them. Confirm this is intended if this mapper is used with random flips.
        depth_map = self._load_resized_map(dataset_dict["depth_image_path"], 'depth', image_shape)
        dataset_dict["depth_map"] = torch.as_tensor(np.ascontiguousarray(depth_map))

        # ground image
        if 'ground_image_path' in dataset_dict:
            ground_map = self._load_resized_map(dataset_dict["ground_image_path"], 'mask', image_shape)
            dataset_dict["ground_map"] = torch.as_tensor(np.ascontiguousarray(ground_map))
        else:
            dataset_dict["ground_map"] = None

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))

        if self.mode == 'cube_rcnn':
            # no need for additional processing at inference
            if not self.is_train:
                return dataset_dict

            if "annotations" in dataset_dict:
                dataset_id = dataset_dict['dataset_id']
                K = np.array(dataset_dict['K'])

                # Fail with an actionable message instead of a bare attribute lookup error.
                if not hasattr(self, "dataset_id_to_unknown_cats"):
                    raise AttributeError(
                        "DatasetMapper3D.dataset_id_to_unknown_cats is not set; assign a "
                        "mapping from dataset_id to unknown category ids on the mapper "
                        "before using mode='cube_rcnn' for training."
                    )
                unknown_categories = self.dataset_id_to_unknown_cats[dataset_id]

                # transform and pop off annotations
                annos = [
                    transform_instance_annotations(obj, transforms, K=K)
                    for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0
                ]

                # convert to instance format
                instances = annotations_to_instances(annos, image_shape, unknown_categories)
                dataset_dict["instances"] = detection_utils.filter_empty_instances(instances)

        return dataset_dict
# Constant matrices cached at import time; they are used to mirror the
# annotated pose as ``_M1 @ pose @ _M2``.

# Integer diagonal matrix: keeps the first axis, negates the second and third.
_M1 = np.diag([1, -1, -1])

# Float diagonal matrix: negates the first and second axes, keeps the third.
_M2 = np.diag([-1., -1., 1.])
def transform_instance_annotations(annotation, transforms, *, K):
    """
    Apply image transforms to the 2D and projected-3D parts of one annotation.

    The annotation dict is modified in place and also returned.

    Args:
        annotation (dict): one object annotation. Reads ``bbox``, ``bbox_mode``,
            ``center_cam``, ``bbox3D_cam``, ``ignore`` and ``pose``.
        transforms: a ``TransformList``, or a tuple/list of transforms which is
            wrapped into one.
        K: camera intrinsics used to project camera-space points with ``K @ point``
            (presumably 3x3 -- confirm against the caller).

    Returns:
        dict: the same ``annotation`` object with ``bbox`` / ``bbox_mode`` updated
        and, when ``center_cam[2] != 0``, ``center_cam_proj`` and ``keypoints``
        added and ``pose`` / ``R_cam`` mirrored once per horizontal flip.
    """
    if isinstance(transforms, (tuple, list)):
        transforms = T.TransformList(transforms)

    # bbox is 1d (per-instance bounding box)
    bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
    bbox = transforms.apply_box(np.array([bbox]))[0]
    annotation["bbox"] = bbox
    annotation["bbox_mode"] = BoxMode.XYXY_ABS

    # Projection divides by the depth component, so skip objects at zero depth.
    if annotation['center_cam'][2] != 0:
        # project the 3D box center to the screen
        point2D = K @ np.array(annotation['center_cam'])
        point2D[:2] = point2D[:2] / point2D[-1]
        annotation["center_cam_proj"] = point2D.tolist()

        # apply coords transforms to the projected center (x, y only)
        annotation["center_cam_proj"][0:2] = transforms.apply_coords(
            point2D[np.newaxis][:, :2]
        )[0].tolist()

        # project the 3D box points; after the divisions, columns 0/1 are pixel coords
        keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T
        keypoints[:, 0] /= keypoints[:, -1]
        keypoints[:, 1] /= keypoints[:, -1]

        # visibility flag: 0 - unknown, 1 - not visible, 2 - visible
        # NOTE(review): the previous code computed a ``keypoints[:, 2] > 0`` mask in
        # the non-ignored case but then wrote 2 to every row regardless; that
        # behaviour is kept as-is. Confirm whether non-positive depth should be 1.
        keypoints[:, 2] = 1 if annotation['ignore'] else 2

        # Assign the result back rather than relying on every transform mutating
        # its input in place; a transform that returns a new array would
        # otherwise be silently dropped.
        keypoints[:, :2] = transforms.apply_coords(keypoints[:, :2])
        annotation["keypoints"] = keypoints.tolist()

        # manually apply mirror for pose
        for transform in transforms:
            # horizontal flip?
            if isinstance(transform, T.HFlipTransform):
                pose = _M1 @ np.array(annotation["pose"]) @ _M2
                annotation["pose"] = pose.tolist()
                annotation["R_cam"] = pose.tolist()

    return annotation
def annotations_to_instances(annos, image_size, unknown_categories):
    """
    Pack a list of transformed annotation dicts into a single ``Instances``.

    Args:
        annos (list[dict]): annotations; each one is read for ``category_id``,
            ``bbox``, ``bbox_mode``, ``center_cam_proj``, ``dimensions``,
            ``center_cam``, ``pose`` and ``keypoints``.
        image_size: passed straight to ``Instances``.
        unknown_categories: iterable of integer category indices to flag as unknown.

    Returns:
        Instances: with fields ``gt_classes``, ``gt_boxes``, ``gt_boxes3D``,
        ``gt_poses``, ``gt_keypoints`` and ``gt_unknown_category_mask``.

    NOTE(review): ``max(unknown_categories)`` raises on an empty collection, and
    the mask width depends on the largest unknown index rather than on a total
    category count -- confirm callers never pass an empty set.
    """
    target = Instances(image_size)

    # classes and 2D boxes
    class_ids = [int(entry["category_id"]) for entry in annos]
    target.gt_classes = torch.tensor(class_ids, dtype=torch.int64)

    boxes_xyxy = [
        BoxMode.convert(entry["bbox"], entry["bbox_mode"], BoxMode.XYXY_ABS)
        for entry in annos
    ]
    target.gt_boxes = Boxes(boxes_xyxy)

    # 3D box row = projected center, then dimensions, then camera-space center
    rows_3d = [
        entry['center_cam_proj'] + entry['dimensions'] + entry['center_cam']
        for entry in annos
    ]
    target.gt_boxes3D = torch.FloatTensor(rows_3d)

    pose_rows = [entry['pose'] for entry in annos]
    target.gt_poses = torch.FloatTensor(pose_rows)

    num_instances = target.gt_classes.shape[0]

    # keypoints
    keypoint_rows = [entry['keypoints'] for entry in annos]
    target.gt_keypoints = Keypoints(torch.FloatTensor(keypoint_rows))

    # one boolean per category index up to the largest unknown one
    unknown_mask = torch.zeros(max(unknown_categories) + 1, dtype=torch.bool)
    unknown_index = torch.tensor(list(unknown_categories))
    unknown_mask[unknown_index] = True

    # every instance carries the same per-category mask alongside its GTs
    target.gt_unknown_category_mask = unknown_mask[None, :].repeat(num_instances, 1)
    return target
|