#!/usr/bin/env python3
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Script to pre-process the CO3D dataset.
# Usage:
# python3 datasets_preprocess/preprocess_co3d.py --co3d_dir /path/to/co3d
# --------------------------------------------------------
import argparse
import gzip
import json
import os
import os.path as osp
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import torch
from tqdm.auto import tqdm

# path_to_root puts the repo root on sys.path, so it must be imported
# before anything from the dust3r package
import path_to_root  # noqa
import dust3r.datasets.utils.cropping as cropping  # noqa

CATEGORIES = [
    "apple",
    "backpack",
    "ball",
    "banana",
    "baseballbat",
    "baseballglove",
    "bench",
    "bicycle",
    "book",
    "bottle",
    "bowl",
    "broccoli",
    "cake",
    "car",
    "carrot",
    "cellphone",
    "chair",
    "couch",
    "cup",
    "donut",
    "frisbee",
    "hairdryer",
    "handbag",
    "hotdog",
    "hydrant",
    "keyboard",
    "kite",
    "laptop",
    "microwave",
    "motorcycle",
    "mouse",
    "orange",
    "parkingmeter",
    "pizza",
    "plant",
    "remote",
    "sandwich",
    "skateboard",
    "stopsign",
    "suitcase",
    "teddybear",
    "toaster",
    "toilet",
    "toybus",
    "toyplane",
    "toytrain",
    "toytruck",
    "tv",
    "umbrella",
    "vase",
    "wineglass",
]
CATEGORIES_IDX = {cat: i for i, cat in enumerate(CATEGORIES)}  # for per-category seeding
SINGLE_SEQUENCE_CATEGORIES = sorted(
    set(CATEGORIES) - set(["microwave", "stopsign", "tv"])
)

def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", type=str, default=None)
    parser.add_argument(
        "--single_sequence_subset",
        default=False,
        action="store_true",
        help="prepare the single_sequence_subset instead.",
    )
    parser.add_argument("--output_dir", type=str, default="data/co3d_processed")
    parser.add_argument("--co3d_dir", type=str, required=True)
    parser.add_argument("--num_sequences_per_object", type=int, default=50)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    parser.add_argument(
        "--img_size",
        type=int,
        default=512,
        help=(
            "lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
        ),
    )
    return parser
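

# Example invocation (hypothetical paths), combining the flags defined in
# get_parser above to process a single category:
#
#   python3 datasets_preprocess/preprocess_co3d.py \
#       --co3d_dir /path/to/co3d --category hydrant --img_size 512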


def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
    # Convert PyTorch3D NDC intrinsics into a pixel-space 3x3 pinhole matrix K.
    # image_size is (H, W); NDC units are rescaled by half the smaller side.
    focal_length = np.array(focal_length)
    principal_point = np.array(principal_point)
    image_size_wh = np.array([image_size[1], image_size[0]])
    half_image_size = image_size_wh / 2
    rescale = half_image_size.min()
    principal_point_px = half_image_size - principal_point * rescale
    focal_length_px = focal_length * rescale
    fx, fy = focal_length_px[0], focal_length_px[1]
    cx, cy = principal_point_px[0], principal_point_px[1]
    K = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32)
    return K
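

# A minimal sanity check of the NDC-to-pinhole conversion (values hypothetical):
# for a 600x800 (H, W) image the rescale factor is min(400, 300) = 300, so an
# NDC focal length of (2.0, 2.0) maps to 600 px and a principal point at the
# NDC origin lands at the image center (400, 300).
#
#   K = convert_ndc_to_pinhole([2.0, 2.0], [0.0, 0.0], (600, 800))
#   assert np.allclose(K, [[600.0, 0.0, 400.0], [0.0, 600.0, 300.0], [0.0, 0.0, 1.0]])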


def opencv_from_cameras_projection(R, T, focal, p0, image_size):
    # Convert a PyTorch3D camera (R, T, NDC focal / principal point) to the
    # OpenCV convention: returns world-to-camera R, tvec and pixel-space K.
    R = torch.from_numpy(R)[None, :, :]
    T = torch.from_numpy(T)[None, :]
    focal = torch.from_numpy(focal)[None, :]
    p0 = torch.from_numpy(p0)[None, :]
    image_size = torch.from_numpy(image_size)[None, :]

    # PyTorch3D and OpenCV use opposite x/y axis conventions: flip both axes,
    # then transpose R to go from row-vector to column-vector convention.
    R_pytorch3d = R.clone()
    T_pytorch3d = T.clone()
    focal_pytorch3d = focal
    p0_pytorch3d = p0
    T_pytorch3d[:, :2] *= -1
    R_pytorch3d[:, :, :2] *= -1
    tvec = T_pytorch3d
    R = R_pytorch3d.permute(0, 2, 1)

    # Retype the image_size correctly and flip to width, height.
    image_size_wh = image_size.to(R).flip(dims=(1,))

    # NDC to screen conversion.
    scale = image_size_wh.to(R).min(dim=1, keepdim=True)[0] / 2.0
    scale = scale.expand(-1, 2)
    c0 = image_size_wh / 2.0

    principal_point = -p0_pytorch3d * scale + c0
    focal_length = focal_pytorch3d * scale

    camera_matrix = torch.zeros_like(R)
    camera_matrix[:, :2, 2] = principal_point
    camera_matrix[:, 2, 2] = 1.0
    camera_matrix[:, 0, 0] = focal_length[:, 0]
    camera_matrix[:, 1, 1] = focal_length[:, 1]
    return R[0], tvec[0], camera_matrix[0]
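

# The function above appears to be a single-camera, numpy-input variant of
# pytorch3d.utils.opencv_from_cameras_projection. A sketch of the assumed
# equivalence (requires pytorch3d, which this script does not depend on):
#
#   from pytorch3d.renderer import PerspectiveCameras
#   from pytorch3d.utils import opencv_from_cameras_projection as p3d_opencv
#   cam = PerspectiveCameras(R=torch.from_numpy(R)[None], T=torch.from_numpy(T)[None],
#                            focal_length=torch.from_numpy(focal)[None],
#                            principal_point=torch.from_numpy(p0)[None])
#   R_cv, tvec_cv, K_cv = p3d_opencv(cam, torch.from_numpy(image_size)[None])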


def get_set_list(category_dir, split, is_single_sequence_subset=False):
    listfiles = os.listdir(osp.join(category_dir, "set_lists"))
    if is_single_sequence_subset:
        # not all objects have manyview_dev
        subset_list_files = [f for f in listfiles if "manyview_dev" in f]
    else:
        subset_list_files = [f for f in listfiles if "fewview_train" in f]

    sequences_all = []
    for subset_list_file in subset_list_files:
        with open(osp.join(category_dir, "set_lists", subset_list_file)) as f:
            subset_lists_data = json.load(f)
            sequences_all.extend(subset_lists_data[split])

    return sequences_all
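

# Each set-list entry is a (sequence_name, frame_number, image_path) triple,
# as unpacked in prepare_sequences below, e.g. (values hypothetical):
#
#   ["110_13051_23361", 42, "apple/110_13051_23361/images/frame000042.jpg"]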


def prepare_sequences(
    category,
    co3d_dir,
    output_dir,
    img_size,
    split,
    min_quality,
    max_num_sequences_per_object,
    seed,
    is_single_sequence_subset=False,
):
    random.seed(seed)
    category_dir = osp.join(co3d_dir, category)
    category_output_dir = osp.join(output_dir, category)
    sequences_all = get_set_list(category_dir, split, is_single_sequence_subset)
    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))

    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    # index the frame annotations by (sequence_name, frame_number)
    frame_data_processed = {}
    for f_data in frame_data:
        sequence_name = f_data["sequence_name"]
        frame_data_processed.setdefault(sequence_name, {})[
            f_data["frame_number"]
        ] = f_data

    # keep only sequences whose viewpoint quality exceeds the threshold
    good_quality_sequences = set()
    for seq_data in sequence_data:
        if seq_data["viewpoint_quality_score"] > min_quality:
            good_quality_sequences.add(seq_data["sequence_name"])

    sequences_numbers = [
        seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences
    ]
    if len(sequences_numbers) < max_num_sequences_per_object:
        selected_sequences_numbers = sequences_numbers
    else:
        selected_sequences_numbers = random.sample(
            sequences_numbers, max_num_sequences_per_object
        )

    selected_sequences_numbers_dict = {
        seq_name: [] for seq_name in selected_sequences_numbers
    }
    sequences_all = [
        (seq_name, frame_number, filepath)
        for seq_name, frame_number, filepath in sequences_all
        if seq_name in selected_sequences_numbers_dict
    ]

    for seq_name, frame_number, filepath in tqdm(sequences_all):
        frame_idx = int(filepath.split("/")[-1][5:-4])
        selected_sequences_numbers_dict[seq_name].append(frame_idx)
        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")
        frame_data = frame_data_processed[seq_name][frame_number]
        focal_length = frame_data["viewpoint"]["focal_length"]
        principal_point = frame_data["viewpoint"]["principal_point"]
        image_size = frame_data["image"]["size"]
        K = convert_ndc_to_pinhole(focal_length, principal_point, image_size)
        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
            np.array(frame_data["viewpoint"]["R"]),
            np.array(frame_data["viewpoint"]["T"]),
            np.array(focal_length),
            np.array(principal_point),
            np.array(image_size),
        )

        depth_path = os.path.join(co3d_dir, frame_data["depth"]["path"])
        assert frame_data["depth"]["scale_adjustment"] == 1.0
        image_path = os.path.join(co3d_dir, filepath)
        mask_path_full = os.path.join(co3d_dir, mask_path)

        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
        input_mask = plt.imread(mask_path_full)

        with PIL.Image.open(depth_path) as depth_pil:
            # the image is stored with 16-bit depth but PIL reads it as I (32 bit).
            # we cast it to uint16, then reinterpret as float16, then cast to float32
            input_depthmap = (
                np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16)
                .astype(np.float32)
                .reshape((depth_pil.size[1], depth_pil.size[0]))
            )
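            # bit-cast sanity check (hypothetical pixel): the uint16 value
            # 15360 is 0x3C00, which reinterprets as float16 1.0, so such a
            # pixel decodes to a depth of exactly 1.0 in the dataset's scale
            # (scale_adjustment is asserted to be 1.0 above)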

        depth_mask = np.stack((input_depthmap, input_mask), axis=-1)
        H, W = input_depthmap.shape

        camera_intrinsics = camera_intrinsics.numpy()
        cx, cy = camera_intrinsics[:2, 2].round().astype(int)

        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y)
        # centered on (cx, cy), so that the principal point ends up at the image center
        min_margin_x = min(cx, W - cx)
        min_margin_y = min(cy, H - cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        (
            input_rgb_image,
            depth_mask,
            input_camera_intrinsics,
        ) = cropping.crop_image_depthmap(
            input_rgb_image, depth_mask, camera_intrinsics, crop_bbox
        )

        # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384
        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
        if max(output_resolution) < img_size:
            # let's put the max dimension to img_size
            scale_final = (img_size / max(H, W)) + 1e-8
            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)

        (
            input_rgb_image,
            depth_mask,
            input_camera_intrinsics,
        ) = cropping.rescale_image_depthmap(
            input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution
        )
        input_depthmap = depth_mask[:, :, 0]
        input_mask = depth_mask[:, :, 1]

        # generate and adjust camera pose: invert the world-to-camera extrinsics
        # into a 4x4 camera-to-world pose matrix
        camera_pose = np.eye(4, dtype=np.float32)
        camera_pose[:3, :3] = R
        camera_pose[:3, 3] = tvec
        camera_pose = np.linalg.inv(camera_pose)

        # save cropped images, depth and metadata
        save_img_path = os.path.join(output_dir, filepath)
        save_depth_path = os.path.join(output_dir, frame_data["depth"]["path"])
        save_mask_path = os.path.join(output_dir, mask_path)
        os.makedirs(os.path.split(save_img_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True)

        input_rgb_image.save(save_img_path)
        # quantize the depth to uint16 over [0, max]; the scale is recorded in the
        # metadata as maximum_depth so the map can be de-quantized when loading
        scaled_depth_map = (input_depthmap / np.max(input_depthmap) * 65535).astype(
            np.uint16
        )
        cv2.imwrite(save_depth_path, scaled_depth_map)
        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))

        save_meta_path = save_img_path.replace("jpg", "npz")
        np.savez(
            save_meta_path,
            camera_intrinsics=input_camera_intrinsics,
            camera_pose=camera_pose,
            maximum_depth=np.max(input_depthmap),
        )

    return selected_sequences_numbers_dict
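

# A minimal sketch of reading one processed frame back (assumes the layout
# written above; save_*_path stand for the per-frame paths from the loop):
#
#   rgb = PIL.Image.open(save_img_path)
#   meta = np.load(save_img_path.replace("jpg", "npz"))
#   depth_u16 = cv2.imread(save_depth_path, cv2.IMREAD_UNCHANGED)
#   depthmap = depth_u16.astype(np.float32) / 65535.0 * meta["maximum_depth"]
#   pose_c2w = meta["camera_pose"]  # 4x4 camera-to-world
#   K = meta["camera_intrinsics"]   # 3x3 pinhole intrinsics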


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    assert args.co3d_dir != args.output_dir

    if args.category is None:
        if args.single_sequence_subset:
            categories = SINGLE_SEQUENCE_CATEGORIES
        else:
            categories = CATEGORIES
    else:
        categories = [args.category]
    os.makedirs(args.output_dir, exist_ok=True)

    for split in ["train", "test"]:
        selected_sequences_path = os.path.join(
            args.output_dir, f"selected_seqs_{split}.json"
        )
        if os.path.isfile(selected_sequences_path):
            continue

        all_selected_sequences = {}
        for category in categories:
            category_output_dir = osp.join(args.output_dir, category)
            os.makedirs(category_output_dir, exist_ok=True)
            category_selected_sequences_path = os.path.join(
                category_output_dir, f"selected_seqs_{split}.json"
            )
            if os.path.isfile(category_selected_sequences_path):
                with open(category_selected_sequences_path, "r") as fid:
                    category_selected_sequences = json.load(fid)
            else:
                print(f"Processing {split} - category = {category}")
                category_selected_sequences = prepare_sequences(
                    category=category,
                    co3d_dir=args.co3d_dir,
                    output_dir=args.output_dir,
                    img_size=args.img_size,
                    split=split,
                    min_quality=args.min_quality,
                    max_num_sequences_per_object=args.num_sequences_per_object,
                    seed=args.seed + CATEGORIES_IDX[category],
                    is_single_sequence_subset=args.single_sequence_subset,
                )
                with open(category_selected_sequences_path, "w") as file:
                    json.dump(category_selected_sequences, file)

            all_selected_sequences[category] = category_selected_sequences

        with open(selected_sequences_path, "w") as file:
            json.dump(all_selected_sequences, file)
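
# The resulting selected_seqs_{split}.json maps each category to a dict from
# sequence name to the list of kept frame indices, e.g. (values hypothetical):
#
#   {"apple": {"110_13051_23361": [1, 5, 9]}, "bench": {...}, ...}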