import os
import numpy as np
import cv2
import torch

from .base_data import BaseDataset
from .behave_paths import DataPaths
from .img_utils import compute_translation, masks2bbox, crop
def padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio=0.75):
    """
    pad image and masks to the given aspect ratio (default 4:3, i.e. height/width = 0.75)
    :param rgb: (H, W, 3)
    :param person_mask: (H, W) or None
    :param obj_mask: (H, W) or None
    :param aspect_ratio: target height/width ratio
    :return: rgb, object mask and person mask, all padded to the given aspect ratio
    """
    h, w = rgb.shape[:2]
    if w > h * 1 / aspect_ratio:
        # image is too wide: pad the top
        h_4x3 = int(w * aspect_ratio)
        pad_top = h_4x3 - h
        rgb_pad = np.pad(rgb, ((pad_top, 0), (0, 0), (0, 0)))
        person_mask = np.pad(person_mask, ((pad_top, 0), (0, 0))) if person_mask is not None else None
        obj_mask = np.pad(obj_mask, ((pad_top, 0), (0, 0))) if obj_mask is not None else None
    else:
        # image is too tall: pad both sides (and the top)
        w_new = np.lcm.reduce([h * 2, 16])  # least common multiple, keeps the new width divisible by 16
        h_4x3 = int(w_new * aspect_ratio)
        pad_top = h_4x3 - h
        pad_left = (w_new - w) // 2
        pad_right = w_new - w - pad_left
        rgb_pad = np.pad(rgb, ((pad_top, 0), (pad_left, pad_right), (0, 0)))
        obj_mask = np.pad(obj_mask, ((pad_top, 0), (pad_left, pad_right))) if obj_mask is not None else None
        person_mask = np.pad(person_mask, ((pad_top, 0), (pad_left, pad_right))) if person_mask is not None else None
    return rgb_pad, obj_mask, person_mask
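
# The function below is an illustrative sanity check added for clarity (not part
# of the original pipeline and never called): it shows the padding arithmetic on
# a 900x1600 landscape image, which falls into the "pad top" branch because
# 1600 > 900 / 0.75.
def _example_padTo_4x3():
    rgb = np.zeros((900, 1600, 3), dtype=np.uint8)
    mask = np.zeros((900, 1600), dtype=np.uint8)
    rgb_pad, obj_pad, person_pad = padTo_4x3(rgb, mask, mask)
    # 1600 * 0.75 = 1200, so 300 rows of zeros are added above the image
    assert rgb_pad.shape == (1200, 1600, 3)
    assert obj_pad.shape == (1200, 1600) and person_pad.shape == (1200, 1600)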
def recrop_input(rgb, person_mask, obj_mask, dataset_name='behave'):
    "recrop the input image and masks to the BEHAVE/InterCap camera format"
    exp_ratio = 1.42
    if dataset_name == 'behave':
        mean_center = np.array([1008, 995])  # mean RGB image crop center
        behave_size = (2048, 1536)  # target image size (width, height)
        new_size = (int(750 * exp_ratio), int(exp_ratio * 750))
    else:
        mean_center = np.array([904, 668])  # mean RGB image crop center for bottle sequences of ICAP
        behave_size = (1920, 1080)
        new_size = (int(593.925 * exp_ratio), int(exp_ratio * 593.925))  # mean width of bottle sequences
    aspect_ratio = behave_size[1] / behave_size[0]
    pad_top = mean_center[1] - new_size[0] // 2
    pad_bottom = behave_size[1] - (mean_center[1] + new_size[0] // 2)
    pad_left = mean_center[0] - new_size[0] // 2
    pad_right = behave_size[0] - (mean_center[0] + new_size[0] // 2)

    # First pad to the same aspect ratio
    if rgb.shape[0] / rgb.shape[1] != aspect_ratio:
        rgb, obj_mask, person_mask = padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio)

    # Resize to the same size as the BEHAVE images, to have a comparable pixel size
    rgb = cv2.resize(rgb, behave_size)
    mask_ps = cv2.resize(person_mask, behave_size)
    mask_obj = cv2.resize(obj_mask, behave_size)

    # Crop and resize the human + object patch
    bmin, bmax = masks2bbox([mask_ps, mask_obj])
    center = (bmin + bmax) // 2
    crop_size = int(np.max(bmax - bmin) * exp_ratio)  # larger crop to keep some background
    img_crop = cv2.resize(crop(rgb, center, crop_size), new_size)
    mask_ps = cv2.resize(crop(mask_ps, center, crop_size), new_size)
    mask_obj = cv2.resize(crop(mask_obj, center, crop_size), new_size)

    # Pad back to the same shape as the BEHAVE images
    img_full = np.pad(img_crop, [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]])
    mask_ps_full = np.pad(mask_ps, [[pad_top, pad_bottom], [pad_left, pad_right]])
    mask_obj_full = np.pad(mask_obj, [[pad_top, pad_bottom], [pad_left, pad_right]])

    # Make sure the output shape matches exactly
    if img_full.shape[:2] != behave_size[::-1]:
        img_full = cv2.resize(img_full, behave_size)
        mask_ps_full = cv2.resize(mask_ps_full, behave_size)
        mask_obj_full = cv2.resize(mask_obj_full, behave_size)
    return img_full, mask_ps_full, mask_obj_full
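
# Usage sketch added for illustration (not part of the original pipeline and
# never called; the file paths are placeholders): bringing an arbitrary-resolution
# image and its person/object masks into the BEHAVE camera format.
def _example_recrop_input():
    rgb = cv2.imread("example.color.jpg")[:, :, ::-1]  # placeholder path, BGR -> RGB
    mask_hum = cv2.imread("example.person_mask.png", cv2.IMREAD_GRAYSCALE)  # placeholder path
    mask_obj = cv2.imread("example.obj_mask.png", cv2.IMREAD_GRAYSCALE)  # placeholder path
    img_full, mask_ps_full, mask_obj_full = recrop_input(rgb, mask_hum, mask_obj, dataset_name="behave")
    # all outputs now have the BEHAVE resolution of 2048x1536 (width x height)
    assert img_full.shape[:2] == (1536, 2048)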
class DemoDataset(BaseDataset):
    def __init__(self, data_paths, input_size=(224, 224),
                 std_coverage=3.5,  # used to estimate camera translation
                 ):
        super().__init__(data_paths, input_size)
        self.std_coverage = std_coverage

    def __len__(self):
        return len(self.data_paths)

    def __getitem__(self, idx):
        rgb_file = self.data_paths[idx]
        mask_hum, mask_obj = self.load_masks(rgb_file)
        rgb_full = cv2.imread(rgb_file)[:, :, ::-1]  # BGR -> RGB
        return self.image2dict(mask_hum, mask_obj, rgb_full, rgb_file)
    def image2dict(self, mask_hum, mask_obj, rgb_full, rgb_file=None):
        "do all the necessary preprocessing for images"
        if rgb_full.shape[:2] != mask_obj.shape[:2]:
            raise ValueError(f"The given object mask shape {mask_obj.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
        if rgb_full.shape[:2] != mask_hum.shape[:2]:
            raise ValueError(f"The given human mask shape {mask_hum.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
        if rgb_full.shape[:2] not in [(1080, 1920), (1536, 2048)]:
            # crop and resize the image to the BEHAVE image size
            print(f"Recropping the input image and masks for {rgb_file}")
            rgb_full, mask_hum, mask_obj = recrop_input(rgb_full, mask_hum, mask_obj)
        color_h, color_w = rgb_full.shape[:2]

        # Input to the first stage model: joint human + object crop
        Kroi, objmask_fullcrop, psmask_fullcrop, rgb_fullcrop = self.crop_full_image(mask_hum.copy(),
                                                                                     mask_obj.copy(),
                                                                                     rgb_full.copy(),
                                                                                     [mask_hum, mask_obj],
                                                                                     1.00)
        # Input to the second stage model: separate human and object crops
        Kroi_h, masko_hum, maskh_hum, rgb_hum = self.crop_full_image(mask_hum.copy(),
                                                                     mask_obj.copy(),
                                                                     rgb_full.copy(),
                                                                     [mask_hum, mask_hum], 1.05)
        Kroi_o, masko_obj, maskh_obj, rgb_obj = self.crop_full_image(mask_hum.copy(),
                                                                     mask_obj.copy(),
                                                                     rgb_full.copy(),
                                                                     [mask_obj, mask_obj], 1.5)

        # Estimate camera translation
        cent_transform = np.eye(4)  # the transform applied to the mesh that moves it back to the kinect camera frame
        bmin_ho, bmax_ho = masks2bbox([mask_hum, mask_obj])
        crop_size_ho = int(np.max(bmax_ho - bmin_ho) * 1.0)
        if crop_size_ho % 2 == 1:
            crop_size_ho += 1  # make sure it is an even number
        is_behave = self.is_behave_dataset(rgb_full.shape[1])
        if rgb_full.shape[1] not in [2048, 1920]:
            raise ValueError('the image is not normalized to BEHAVE or ICAP size!')
        indices = np.indices(rgb_full.shape[:2])
        if np.sum(mask_obj > 127) < 5:
            raise ValueError(f'not enough object mask found for {rgb_file}')
        pts_h = np.stack([indices[1][mask_hum > 127], indices[0][mask_hum > 127]], -1)
        pts_o = np.stack([indices[1][mask_obj > 127], indices[0][mask_obj > 127]], -1)
        proj_cent_est = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2.  # heuristic to obtain the 2D projection center
        transl_estimate = compute_translation(proj_cent_est, crop_size_ho, is_behave, self.std_coverage)
        cent_transform[:3, 3] = transl_estimate / 7.0
        radius = 0.5  # don't do normalization anymore
        cent = transl_estimate / 7.0

        comb = np.matmul(self.opencv2py3d, cent_transform)
        R = torch.from_numpy(comb[:3, :3]).float()
        T = torch.from_numpy(comb[:3, 3]).float() / (radius * 2)

        data_dict = {
            "R": R,
            "T": T,
            "K": torch.from_numpy(Kroi).float(),
            "T_ho": torch.from_numpy(cent).float(),  # translation for H+O
            "image_path": rgb_file,
            "image_size_hw": torch.tensor(self.input_size),
            "images": torch.from_numpy(rgb_fullcrop).float().permute(2, 0, 1),
            "masks": torch.from_numpy(np.stack([psmask_fullcrop, objmask_fullcrop], 0)).float(),
            "orig_image_size": torch.tensor([color_h, color_w]),
            # Human input to stage 2
            "images_hum": torch.from_numpy(rgb_hum).float().permute(2, 0, 1),
            "masks_hum": torch.from_numpy(np.stack([maskh_hum, masko_hum], 0)).float(),
            "K_hum": torch.from_numpy(Kroi_h).float(),
            # Object input to stage 2
            "images_obj": torch.from_numpy(rgb_obj).float().permute(2, 0, 1),
            "masks_obj": torch.from_numpy(np.stack([maskh_obj, masko_obj], 0)).float(),
            "K_obj": torch.from_numpy(Kroi_o).float(),
            # some normalization parameters
            "gt_trans": cent,
            "radius": radius,
            "estimated_trans": transl_estimate,
        }
        return data_dict
    def image2batch(self, rgb, mask_hum, mask_obj):
        """
        given an input image and masks, convert them into a batch ready for model inference
        :param rgb: (h, w, 3), np array
        :param mask_hum: (h, w, 3), np array
        :param mask_obj: (h, w, 3), np array
        :return: a dict with the same keys as image2dict, each value wrapped in a list (batch of size one)
        """
        mask_hum = np.mean(mask_hum, -1)  # convert 3-channel masks to single channel
        mask_obj = np.mean(mask_obj, -1)

        data_dict = self.image2dict(mask_hum, mask_obj, rgb, 'input image')
        # wrap each value in a list to form a batch of size one
        new_dict = {k: [v] for k, v in data_dict.items()}
        return new_dict
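
# Example usage (a minimal sketch added for illustration, not part of the original
# pipeline). It assumes `data_paths` can simply be a list of RGB image paths whose
# human/object masks are resolvable by the inherited load_masks; the demo path
# below is a placeholder.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = DemoDataset(["examples/demo.color.jpg"], input_size=(224, 224))
    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    for batch in loader:
        # each item carries the crops, masks and intrinsics for both reconstruction stages
        print(batch["image_path"], batch["images"].shape, batch["masks"].shape)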