import os
import numpy as np
import cv2
import torch
from .base_data import BaseDataset
from .behave_paths import DataPaths
from .img_utils import compute_translation, masks2bbox, crop


def padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio=0.75):
    """
    pad images to the given aspect ratio (default 0.75, i.e. 4:3 width:height)
    :param rgb: (H, W, 3)
    :param person_mask: (H, W) or None
    :param obj_mask: (H, W) or None
    :param aspect_ratio: target height/width ratio
    :return: rgb_pad, obj_mask, person_mask, all padded to the given aspect ratio
    """
h, w = rgb.shape[:2]
if w > h * 1/aspect_ratio:
# pad top
h_4x3 = int(w * aspect_ratio)
pad_top = h_4x3 - h
rgb_pad = np.pad(rgb, ((pad_top, 0), (0, 0), (0, 0)))
person_mask = np.pad(person_mask, ((pad_top, 0), (0, 0))) if person_mask is not None else None
obj_mask = np.pad(obj_mask, ((pad_top, 0), (0, 0))) if obj_mask is not None else None
else:
        # pad top and both sides
        w_new = np.lcm.reduce([h * 2, 16])  # least common multiple, ensures the new width is a multiple of 16
h_4x3 = int(w_new * aspect_ratio)
pad_top = h_4x3 - h
pad_left = (w_new - w) // 2
pad_right = w_new - w - pad_left
rgb_pad = np.pad(rgb, ((pad_top, 0), (pad_left, pad_right), (0, 0)))
obj_mask = np.pad(obj_mask, ((pad_top, 0), (pad_left, pad_right))) if obj_mask is not None else None
person_mask = np.pad(person_mask, ((pad_top, 0), (pad_left, pad_right))) if person_mask is not None else None
return rgb_pad, obj_mask, person_mask
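
# A quick worked example (hypothetical shapes, for illustration only): a 1000x1500
# image is wider than the 0.75 target ratio, so 125 rows are padded at the top to
# reach 1125x1500 (1125 / 1500 = 0.75):
#
#   rgb_pad, obj_pad, ps_pad = padTo_4x3(np.zeros((1000, 1500, 3)), None, None)
#   assert rgb_pad.shape == (1125, 1500, 3)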


def recrop_input(rgb, person_mask, obj_mask, dataset_name='behave'):
    "recrop the input image and masks so they match the BEHAVE/ICAP image layout"
exp_ratio = 1.42
if dataset_name == 'behave':
mean_center = np.array([1008, 995]) # mean RGB image crop center
behave_size = (2048, 1536)
new_size = (int(750 * exp_ratio), int(exp_ratio * 750))
else:
mean_center = np.array([904, 668]) # mean RGB image crop center for bottle sequences of ICAP
behave_size = (1920, 1080)
new_size = (int(593.925 * exp_ratio), int(exp_ratio * 593.925)) # mean width of bottle sequences
aspect_ratio = behave_size[1] / behave_size[0]
pad_top = mean_center[1] - new_size[0] // 2
pad_bottom = behave_size[1] - (mean_center[1] + new_size[0] // 2)
pad_left = mean_center[0] - new_size[0] // 2
pad_right = behave_size[0] - (mean_center[0] + new_size[0] // 2)
    # First pad to the same aspect ratio
if rgb.shape[0] / rgb.shape[1] != aspect_ratio:
rgb, obj_mask, person_mask = padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio)
    # Resize to the same size as the BEHAVE images, to have a comparable pixel size
rgb = cv2.resize(rgb, behave_size)
mask_ps = cv2.resize(person_mask, behave_size)
mask_obj = cv2.resize(obj_mask, behave_size)
# Crop and resize the human + object patch
bmin, bmax = masks2bbox([mask_ps, mask_obj])
center = (bmin + bmax) // 2
crop_size = int(np.max(bmax - bmin) * exp_ratio) # larger crop to have background
img_crop = cv2.resize(crop(rgb, center, crop_size), new_size)
mask_ps = cv2.resize(crop(mask_ps, center, crop_size), new_size)
mask_obj = cv2.resize(crop(mask_obj, center, crop_size), new_size)
# Pad back to have same shape as behave image
img_full = np.pad(img_crop, [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]])
mask_ps_full = np.pad(mask_ps, [[pad_top, pad_bottom], [pad_left, pad_right]])
mask_obj_full = np.pad(mask_obj, [[pad_top, pad_bottom], [pad_left, pad_right]])
# Make sure the image shape is the same
if img_full.shape[:2] != behave_size[::-1]:
img_full = cv2.resize(img_full, behave_size)
mask_ps_full = cv2.resize(mask_ps_full, behave_size)
mask_obj_full = cv2.resize(mask_obj_full, behave_size)
return img_full, mask_ps_full, mask_obj_full
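
# A minimal usage sketch, assuming a 720p input whose masks share the RGB resolution
# (file names and shapes below are placeholders, not from this repo):
#
#   rgb = cv2.imread('image.jpg')[:, :, ::-1]      # (720, 1280, 3)
#   mask_ps = cv2.imread('person_mask.png', 0)     # (720, 1280)
#   mask_obj = cv2.imread('object_mask.png', 0)    # (720, 1280)
#   rgb, mask_ps, mask_obj = recrop_input(rgb, mask_ps, mask_obj)
#   # all outputs now have the 2048x1536 BEHAVE resolution, i.e. shape (1536, 2048, ...)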


class DemoDataset(BaseDataset):
def __init__(self, data_paths, input_size=(224, 224),
std_coverage=3.5, # used to estimate camera translation
):
super().__init__(data_paths, input_size)
self.std_coverage = std_coverage

    def __len__(self):
return len(self.data_paths)

    def __getitem__(self, idx):
        rgb_file = self.data_paths[idx]
        mask_hum, mask_obj = self.load_masks(rgb_file)
        rgb_full = cv2.imread(rgb_file)[:, :, ::-1]  # cv2 loads BGR, convert to RGB
return self.image2dict(mask_hum, mask_obj, rgb_full, rgb_file)

    def image2dict(self, mask_hum, mask_obj, rgb_full, rgb_file=None):
        "do all the necessary preprocessing for one image + masks and return a dict of network inputs"
if rgb_full.shape[:2] != mask_obj.shape[:2]:
raise ValueError(f"The given object mask shape {mask_obj.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
if rgb_full.shape[:2] != mask_hum.shape[:2]:
raise ValueError(f"The given human mask shape {mask_hum.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
if rgb_full.shape[:2] not in [(1080, 1920), (1536, 2048)]:
# crop and resize the image to behave image size
print(f"Recropping the input image and masks for {rgb_file}")
rgb_full, mask_hum, mask_obj = recrop_input(rgb_full, mask_hum, mask_obj)
color_h, color_w = rgb_full.shape[:2]
# Input to the first stage model: human + object crop
Kroi, objmask_fullcrop, psmask_fullcrop, rgb_fullcrop = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_hum, mask_obj],
1.00)
# Input to the second stage model: human and object crops
Kroi_h, masko_hum, maskh_hum, rgb_hum = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_hum, mask_hum], 1.05)
Kroi_o, masko_obj, maskh_obj, rgb_obj = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_obj, mask_obj], 1.5)
        # Estimate camera translation
        cent_transform = np.eye(4)  # the transform that moves the mesh back to the Kinect camera frame
bmin_ho, bmax_ho = masks2bbox([mask_hum, mask_obj])
crop_size_ho = int(np.max(bmax_ho - bmin_ho) * 1.0)
if crop_size_ho % 2 == 1:
crop_size_ho += 1 # make sure it is an even number
is_behave = self.is_behave_dataset(rgb_full.shape[1])
if rgb_full.shape[1] not in [2048, 1920]:
raise ValueError('the image is not normalized to BEHAVE or ICAP size!')
indices = np.indices(rgb_full.shape[:2])
        if np.sum(mask_obj > 127) < 5:
            raise ValueError(f'not enough object mask pixels found for {rgb_file}')
pts_h = np.stack([indices[1][mask_hum > 127], indices[0][mask_hum > 127]], -1)
pts_o = np.stack([indices[1][mask_obj > 127], indices[0][mask_obj > 127]], -1)
proj_cent_est = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2. # heuristic to obtain 2d projection center
transl_estimate = compute_translation(proj_cent_est, crop_size_ho, is_behave, self.std_coverage)
cent_transform[:3, 3] = transl_estimate / 7.0
radius = 0.5 # don't do normalization anymore
cent = transl_estimate / 7.0
comb = np.matmul(self.opencv2py3d, cent_transform)
R = torch.from_numpy(comb[:3, :3]).float()
T = torch.from_numpy(comb[:3, 3]).float() / (radius * 2)
data_dict = {
"R": R,
"T": T,
"K": torch.from_numpy(Kroi).float(),
"T_ho": torch.from_numpy(cent).float(), # translation for H+O
"image_path": rgb_file,
"image_size_hw": torch.tensor(self.input_size),
"images": torch.from_numpy(rgb_fullcrop).float().permute(2, 0, 1),
"masks": torch.from_numpy(np.stack([psmask_fullcrop, objmask_fullcrop], 0)).float(),
'orig_image_size': torch.tensor([color_h, color_w]),
# Human input to stage 2
"images_hum": torch.from_numpy(rgb_hum).float().permute(2, 0, 1),
"masks_hum": torch.from_numpy(np.stack([maskh_hum, masko_hum], 0)).float(),
"K_hum": torch.from_numpy(Kroi_h).float(),
# Object input to stage 2
"images_obj": torch.from_numpy(rgb_obj).float().permute(2, 0, 1),
"masks_obj": torch.from_numpy(np.stack([maskh_obj, masko_obj], 0)).float(),
"K_obj": torch.from_numpy(Kroi_o).float(),
# some normalization parameters
"gt_trans": cent,
'radius': radius,
"estimated_trans": transl_estimate,
}
return data_dict

    def image2batch(self, rgb, mask_hum, mask_obj):
        """
        given an input image and masks, convert them into a batch dict ready for model inference
        :param rgb: (h, w, 3), np array
        :param mask_hum: (h, w, 3), np array
        :param mask_obj: (h, w, 3), np array
        :return: a dict with the same keys as image2dict, each value wrapped in a single-element list (batch size 1)
        """
mask_hum = np.mean(mask_hum, -1)
mask_obj = np.mean(mask_obj, -1)
data_dict = self.image2dict(mask_hum, mask_obj, rgb, 'input image')
        # wrap each value in a single-element list to form a batch of size one
new_dict = {k:[v] for k, v in data_dict.items()}
return new_dict
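

# A minimal usage sketch (the path below is a placeholder, not part of this repo):
# DemoDataset takes a list of RGB file paths, loads the person/object masks via
# BaseDataset.load_masks, and returns a dict of network-ready tensors per image.
#
#   dataset = DemoDataset(['examples/k1.color.jpg'])  # placeholder path
#   sample = dataset[0]
#   images, masks, K = sample['images'], sample['masks'], sample['K']
#
# Alternatively, image2batch can be called with an in-memory RGB image and
# 3-channel masks to build a batch of size one without reading from disk.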