# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Adapted from https://github.com/facebookresearch/vggt/blob/main/visual_util.py


from typing import Optional

import matplotlib
import numpy as np
import trimesh
from scipy.spatial.transform import Rotation

from aether.utils.postprocess_utils import depth_edge


def predictions_to_glb(
    predictions,
    filter_by_frames="all",
    show_cam=True,
    max_depth=100.0,
    rtol=0.03,
    frame_rel_idx: float = 0.0,
) -> trimesh.Scene:
    """
    Converts predictions to a 3D scene (coloured point cloud plus optional
    camera markers).

    Args:
        predictions (dict): Dictionary containing model predictions with keys:
            - world_points: 3D point coordinates (S, H, W, 3)
            - images: Input images, (S, H, W, 3) or (S, 3, H, W); values are
              multiplied by 255 and cast to uint8, so they are presumably
              floats in [0, 1] (verify against caller)
            - depths: Depths (S, H, W)
            - camera_poses: Camera poses (S, 4, 4), used as camera-to-world
              transforms when placing the camera markers
        filter_by_frames (str): Frame filter specification (default: "all").
            Anything other than "all"/"All" is parsed as "<index>:<...>" and
            only that frame is kept; if the index cannot be parsed, all
            frames are used.
        show_cam (bool): Include camera visualization (default: True)
        max_depth (float): Points with depth >= max_depth are dropped
            (default: 100.0)
        rtol (float): Relative tolerance for depth edge detection
            (default: 0.03)
        frame_rel_idx (float): Value passed to the "gist_rainbow" colormap to
            pick the colour shared by all camera markers (default: 0.0)

    Returns:
        trimesh.Scene: Processed 3D scene containing point cloud and cameras

    Raises:
        ValueError: If predictions is not a dictionary
        KeyError: If one of the required keys is missing
    """
    if not isinstance(predictions, dict):
        raise ValueError("predictions must be a dictionary")

    selected_frame_idx = None
    if filter_by_frames != "all" and filter_by_frames != "All":
        try:
            # Extract the index part before the colon
            selected_frame_idx = int(filter_by_frames.split(":")[0])
        except (ValueError, IndexError):
            # Best effort: an unparseable spec falls back to all frames.
            pass

    pred_world_points = predictions["world_points"]
    images = predictions["images"]
    camera_poses = predictions["camera_poses"]
    depths = predictions["depths"]

    if selected_frame_idx is not None:
        # Keep a leading frame axis of length 1 so the code below is unchanged.
        pred_world_points = pred_world_points[selected_frame_idx][None]
        images = images[selected_frame_idx][None]
        camera_poses = camera_poses[selected_frame_idx][None]
        # depths must be restricted to the same frame; otherwise the flattened
        # mask built from it no longer lines up with the flattened vertices.
        depths = depths[selected_frame_idx][None]

    vertices_3d = pred_world_points.reshape(-1, 3)

    # Handle different image formats - check if images need transposing
    if images.ndim == 4 and images.shape[1] == 3:  # NCHW format
        colors_rgb = np.transpose(images, (0, 2, 3, 1))
    else:  # Assume already in NHWC format
        colors_rgb = images
    colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)

    # Keep points that are closer than max_depth and not flagged by
    # depth_edge (presumably a boolean map of depth discontinuities with the
    # same shape as depths -- confirm against aether.utils.postprocess_utils).
    masks = depths < max_depth
    edge = ~depth_edge(depths, rtol=rtol, mask=masks)
    masks = (masks & edge).reshape(-1)
    vertices_3d = vertices_3d[masks]
    colors_rgb = colors_rgb[masks]

    if vertices_3d is None or np.asarray(vertices_3d).size == 0:
        # Nothing survived filtering: fall back to a single white placeholder
        # point and a unit scene scale.
        vertices_3d = np.array([[1, 0, 0]])
        colors_rgb = np.array([[255, 255, 255]])
        scene_scale = 1
    else:
        # Calculate the 5th and 95th percentiles along each axis
        lower_percentile = np.percentile(vertices_3d, 5, axis=0)
        upper_percentile = np.percentile(vertices_3d, 95, axis=0)

        # Calculate the diagonal length of the percentile bounding box
        scene_scale = np.linalg.norm(upper_percentile - lower_percentile)

    # Initialize a 3D scene
    scene_3d = trimesh.Scene()

    # Add point cloud data to the scene
    point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
    scene_3d.add_geometry(point_cloud_data)

    if show_cam:
        # Every camera marker shares one colour chosen by frame_rel_idx, so
        # compute it once instead of once per camera.
        colormap = matplotlib.colormaps.get_cmap("gist_rainbow")
        rgba_color = colormap(frame_rel_idx)
        current_color = tuple(int(255 * x) for x in rgba_color[:3])

        # Add camera models to the scene
        for camera_to_world in camera_poses:
            integrate_camera_into_scene(
                scene_3d, camera_to_world, current_color, scene_scale
            )

    return scene_3d


def integrate_camera_into_scene(
    scene: trimesh.Scene,
    transform: np.ndarray,
    face_colors: tuple,
    scene_scale: float,
):
    """
    Integrates a fake camera mesh into the 3D scene.

    The marker is built from a 4-section cone whose size is proportional to
    scene_scale. Three copies of the cone's vertices (original, scaled by
    0.95, and rotated by 2 degrees about z) are stitched together by
    compute_camera_faces, then moved into place by
    transform @ opengl_conversion @ local_pose.

    Args:
        scene (trimesh.Scene): The 3D scene to add the camera model.
            Modified in place.
        transform (np.ndarray): 4x4 transformation matrix for camera
            positioning.
        face_colors (tuple): RGB color written to every face of the camera
            mesh.
        scene_scale (float): Scale of the scene; the cone radius is 2.5% and
            the cone height 5% of it.
    """
    cam_width = scene_scale * 0.025
    cam_height = scene_scale * 0.05

    # Local pose of the cone: rotate 45 degrees about z and shift it by
    # -cam_height along z (presumably so the cone's tip ends up at the
    # camera origin -- depends on trimesh's cone layout, confirm).
    rot_45_degree = np.eye(4)
    rot_45_degree[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
    rot_45_degree[2, 3] = -cam_height

    opengl_transform = get_opengl_conversion_matrix()
    # Combine transformations
    complete_transform = transform @ opengl_transform @ rot_45_degree
    camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4)

    # Generate mesh for the camera: the face indices produced by
    # compute_camera_faces refer to these three stacked vertex copies.
    slight_rotation = np.eye(4)
    slight_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()

    vertices_combined = np.concatenate(
        [
            camera_cone_shape.vertices,
            0.95 * camera_cone_shape.vertices,
            transform_points(slight_rotation, camera_cone_shape.vertices),
        ]
    )
    vertices_transformed = transform_points(complete_transform, vertices_combined)

    mesh_faces = compute_camera_faces(camera_cone_shape)

    # Add the camera mesh to the scene
    camera_mesh = trimesh.Trimesh(vertices=vertices_transformed, faces=mesh_faces)
    camera_mesh.visual.face_colors[:, :3] = face_colors
    scene.add_geometry(camera_mesh)


def get_opengl_conversion_matrix() -> np.ndarray:
    """
    Constructs and returns the OpenGL conversion matrix.

    Returns:
        numpy.ndarray: A 4x4 matrix equal to diag(1, -1, -1, 1), i.e. the
        identity with the y and z axes flipped.
    """
    # Create an identity matrix
    matrix = np.identity(4)

    # Flip the y and z axes
    matrix[1, 1] = -1
    matrix[2, 2] = -1

    return matrix


def transform_points(
    transformation: np.ndarray, points: np.ndarray, dim: Optional[int] = None
) -> np.ndarray:
    """
    Applies a 4x4 transformation to a set of points.

    Args:
        transformation (np.ndarray): Transformation matrix. Its last column
            is added as a translation; the remaining columns are applied as
            the linear part.
        points (np.ndarray): Points to be transformed; the last axis holds
            the coordinates.
        dim (int, optional): Number of leading output coordinates to keep.
            A falsy value (None or 0) means "same as the input's last axis".

    Returns:
        np.ndarray: Transformed points with the input's leading shape and a
        last axis of length dim.
    """
    points = np.asarray(points)
    initial_shape = points.shape[:-1]
    dim = dim or points.shape[-1]

    # Apply transformation
    transformation = transformation.swapaxes(
        -1, -2
    )  # Transpose the transformation matrix
    # Row-vector form: all but the last row of the transposed matrix act as
    # the linear part, the last row is the translation.
    points = points @ transformation[..., :-1, :] + transformation[..., -1:, :]

    # Drop the trailing homogeneous coordinate(s) and restore the input shape
    result = points[..., :dim].reshape(*initial_shape, dim)
    return result


def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
    """
    Computes the faces for the camera mesh.

    The returned indices address a vertex array made of three consecutive
    copies of the cone's vertices (offsets 0, N and 2N, where N is the
    number of cone vertices), as assembled in integrate_camera_into_scene.

    Args:
        cone_shape (trimesh.Trimesh): The shape of the camera cone.

    Returns:
        np.ndarray: Array of faces for the camera mesh. Every face appears
        twice, once with reversed vertex order, so the mesh is double-sided.
    """
    # Create pseudo cameras
    faces_list = []
    num_vertices_cone = len(cone_shape.vertices)

    for face in cone_shape.faces:
        # Skip every face that touches vertex 0 (presumably the cone's base
        # centre, which would leave the base open -- confirm against
        # trimesh's cone vertex order).
        if 0 in face:
            continue
        v1, v2, v3 = face
        v1_offset, v2_offset, v3_offset = face + num_vertices_cone
        v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone

        # Thin triangles joining the original cone to the second and third
        # vertex copies.
        faces_list.extend(
            [
                (v1, v2, v2_offset),
                (v1, v1_offset, v3),
                (v3_offset, v2, v3),
                (v1, v2, v2_offset_2),
                (v1, v1_offset_2, v3),
                (v3_offset_2, v2, v3),
            ]
        )

    # Add the same faces with reversed winding
    faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
    return np.array(faces_list)