# SparseAGS / sparseags/cam_utils.py
# Web file-viewer residue kept for provenance: "qitaoz's picture", "init commit",
# "4f54ccd verified", "raw", "history blame", "17.4 kB".
import numpy as np
from scipy.spatial.transform import Rotation as R
# import ipdb
import math
import torch
import torch.nn.functional as F
from pytorch3d.transforms import Rotate, Translate
def intersect_skew_line_groups(p, r, mask):
    """Find the least-squares intersection of each group of lines.

    Args:
        p: Points on the lines, shape (B, N, n_intersected_lines, 3).
        r: Line directions, same shape as ``p``.
        mask: Per-line weights of shape (B, N, n_intersected_lines), or None.

    Returns:
        ``(p_intersect, p_line_intersect, intersect_dist_squared, r)`` where
        ``p_intersect`` is the solved point per group, ``p_line_intersect`` is
        the point on every line nearest to it, ``intersect_dist_squared`` is
        the squared distance between the two, and ``r`` is the normalised
        direction tensor. Four ``None`` values are returned when the solver
        reports failure.
    """
    p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask)
    if p_intersect is None:
        return None, None, None, None

    # Broadcast the solved point over the line axis so it pairs with each line.
    anchor = p_intersect[..., None, :]
    _, p_line_intersect = point_line_distance(p, r, anchor.expand_as(p))
    intersect_dist_squared = ((p_line_intersect - anchor) ** 2).sum(dim=-1)
    return p_intersect, p_line_intersect, intersect_dist_squared, r
def intersect_skew_lines_high_dim(p, r, mask=None):
    """Least-squares point closest to a set of lines in any dimension.

    Implements https://en.wikipedia.org/wiki/Skew_lines in more than two
    dimensions: with unit directions ``r_i`` and weights ``m_i`` it solves

        (sum_i m_i (I - r_i r_i^T)) x = sum_i m_i (I - r_i r_i^T) p_i

    where the sums run over the third-from-last axis of the projector stack
    (the line axis).

    Args:
        p: Points on the lines, shape (..., n_lines, dim).
        r: Line directions, same shape as ``p``; normalised internally.
        mask: Optional per-line weights of shape (..., n_lines). Defaults to
            all ones.

    Returns:
        ``(p_intersect, r)`` with ``p_intersect`` of shape (..., dim) and ``r``
        the L2-normalised directions, or ``(None, None)`` if the solution
        contains NaNs.
    """
    dim = p.shape[-1]
    if mask is None:
        mask = torch.ones_like(p[..., 0])

    # make sure the heading vectors are l2-normed
    r = torch.nn.functional.normalize(r, dim=-1)

    eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None]
    # Projector onto the orthogonal complement of every direction, weighted
    # by the mask.
    I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None]
    sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3)

    p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]

    if torch.any(torch.isnan(p_intersect)):
        # Report the failed solve to the caller instead of dropping into a
        # debugger; the statements that used to follow the return were
        # unreachable and referenced an un-imported module.
        print(p_intersect)
        return None, None

    return p_intersect, r
def point_line_distance(p1, r1, p2):
    """Distance from point(s) ``p2`` to the line(s) through ``p1`` along ``r1``.

    The projection formula used here is only a true orthogonal projection
    when ``r1`` has unit length along the last axis.

    Returns:
        ``(d, line_pt_nearest)``: the distance along the last axis and the
        point on the line closest to ``p2``.
    """
    offset = p2 - p1
    # Component of the offset along the line direction.
    along = (offset * r1).sum(dim=-1, keepdim=True) * r1
    perpendicular = offset - along
    return perpendicular.norm(dim=-1), p2 - perpendicular
def compute_optical_axis_intersection(cameras, in_ndc=True):
    """Estimate the point where the cameras' optical axes (nearly) meet.

    For each camera the principal point is lifted with a third coordinate of
    one and unprojected to world space; the ray from the camera centre through
    that point is used as the optical axis. The axes are then intersected in
    the least-squares sense.

    Args:
        cameras: Camera batch exposing ``get_camera_center``,
            ``principal_point`` and ``unproject_points``
            (presumably PyTorch3D cameras - TODO confirm).
        in_ndc: Forwarded to ``unproject_points`` as ``from_ndc``.

    Returns:
        ``(p_intersect, dist, p_line_intersect, pp2, r)``. ``dist`` holds the
        distance from the intersection to every camera centre and is ``None``
        (as is ``p_intersect``) when the solve fails.
    """
    centers = cameras.get_camera_center()
    depth_one = torch.ones((len(cameras), 1), device=centers.device)
    axis_points = torch.cat((cameras.principal_point, depth_one), -1)

    pp = cameras.unproject_points(axis_points, from_ndc=in_ndc, world_coordinates=True)
    # Keep, for camera i, only the unprojection of its own principal point.
    pp2 = torch.diagonal(pp, dim1=0, dim2=1).T

    directions = pp2 - centers
    # Add two leading singleton axes to match the (B, N, n_lines, 3) layout.
    centers = centers[None, None]
    directions = directions[None, None]

    p_intersect, p_line_intersect, _, r = intersect_skew_line_groups(
        p=centers, r=directions, mask=None
    )
    if p_intersect is None:
        return p_intersect, None, p_line_intersect, pp2, r

    p_intersect = p_intersect.squeeze().unsqueeze(0)
    dist = (p_intersect - centers).norm(dim=-1)
    return p_intersect, dist, p_line_intersect, pp2, r
def normalize_cameras_with_up_axis(cameras, sequence_name, scale=1.0, in_ndc=True):
    """Re-centre, re-scale and re-orient a batch of cameras.

    The world is translated by the estimated optical-axis intersection, the
    camera translations are divided by the mean camera-to-intersection
    distance and multiplied by a hard-coded 1.85, and finally the world axes
    are rotated to a basis derived from an SVD of the cameras' x-vectors.

    Args:
        cameras: Camera batch (presumably PyTorch3D cameras - TODO confirm).
        sequence_name: Only used in diagnostic messages.
        scale: Currently unused - the normalisation scale is always derived
            from the cameras. Kept for interface compatibility.
        in_ndc: Forwarded to ``compute_optical_axis_intersection``.

    Returns:
        ``(new_cameras, p_intersect, p_line_intersect, pp, r, needs_checking)``
        on success. ``-1`` when the mean distance is zero or when the
        optical-axis intersection could not be computed.

    Note:
        The SVD step indexes three singular values, so it assumes at least
        three cameras.
    """
    new_cameras = cameras.clone()
    new_transform = new_cameras.get_world_to_view_transform()

    p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection(
        cameras,
        in_ndc=in_ndc
    )
    if p_intersect is None:
        # The least-squares solve failed; report it with the same sentinel as
        # the degenerate case below instead of crashing on Translate(None).
        print("Optical axis intersection failed in sequence {}".format(sequence_name))
        return -1

    t = Translate(p_intersect)
    mean_dist = dist.squeeze().mean()

    # Degenerate case
    if mean_dist == 0:
        print(cameras.T)
        print(new_transform.get_matrix()[:, 3, :3])
        return -1

    new_transform = t.compose(new_transform)
    new_cameras.R = new_transform.get_matrix()[:, :3, :3]
    # NOTE(review): 1.85 is a hard-coded target distance; its origin is not
    # visible in this file.
    new_cameras.T = new_transform.get_matrix()[:, 3, :3] / mean_dist * 1.85

    needs_checking = False

    # ===== Rotation normalization
    # Estimate the world 'up' direction assuming that yaw is small
    # and running SVD on the x-vectors of the cameras
    x_vectors = new_cameras.R.transpose(1, 2)[:, 0, :].clone()
    x_vectors -= x_vectors.mean(dim=0, keepdim=True)
    _, S, Vh = torch.linalg.svd(x_vectors)
    V = Vh.mH

    # vector with the smallest variation is to the normal to
    # the plane of x-vectors (assume this to be the up direction)
    if S[0] / S[1] > S[1] / S[2]:
        print('Warning: unexpected singular values in sequence {}: {}'.format(sequence_name, S))
        needs_checking = True
    estimated_world_up = V[:, 2:]
    up_axis = estimated_world_up[:, 0]

    # check all cameras have the same y-direction as the first camera
    reference_sign = torch.sign(torch.dot(up_axis, new_cameras.R[0].transpose(0, 1)[1, :]))
    for camera_idx in range(len(new_cameras.T)):
        camera_sign = torch.sign(
            torch.dot(up_axis, new_cameras.R[camera_idx].transpose(0, 1)[1, :])
        )
        if camera_sign != reference_sign:
            print("Some cameras appear to be flipped in sequence {}".format(sequence_name) )
            needs_checking = True

    # Make the estimated up direction agree with the first camera's y-vector.
    if reference_sign < 0:
        estimated_world_up = V[:, 2:] * -1

    # build the target coordinate basis using the estimated world up
    target_coordinate_basis = torch.cat(
        [
            V[:, :1],
            estimated_world_up,
            torch.linalg.cross(V[:, :1], estimated_world_up, dim=0),
        ],
        dim=1,
    )
    new_cameras.R = torch.matmul(target_coordinate_basis.T, new_cameras.R)
    return new_cameras, p_intersect, p_line_intersect, pp, r, needs_checking
def dot(x, y):
    """Dot product along the last axis, keeping that axis with size 1.

    Dispatches on the type of ``x``: NumPy arrays use ``np.sum``, everything
    else is handed to ``torch.sum``.
    """
    product = x * y
    if isinstance(x, np.ndarray):
        return np.sum(product, -1, keepdims=True)
    return torch.sum(product, -1, keepdim=True)
def length(x, eps=1e-20):
    """Euclidean norm along the last axis (kept with size 1).

    The squared norm is floored at ``eps`` before the square root, so the
    result is never smaller than ``sqrt(eps)``.
    """
    if isinstance(x, np.ndarray):
        squared = np.sum(x * x, axis=-1, keepdims=True)
        return np.sqrt(np.maximum(squared, eps))
    return torch.sqrt(torch.clamp(dot(x, x), min=eps))
def safe_normalize(x, eps=1e-20):
    """Scale ``x`` to unit length along the last axis.

    Uses the ``eps``-floored norm from ``length``, so a zero vector yields
    zeros instead of a division by zero.
    """
    norm = length(x, eps)
    return x / norm
def look_at(campos, target, opengl=True):
    """Build a rotation matrix whose columns are the camera axes.

    Args:
        campos: [3] or [N, 3], camera/eye position.
        target: [3] or [N, 3], object to look at.
        opengl: If True the third axis is the normalised ``campos - target``;
            otherwise it is the normalised ``target - campos``.

    Returns:
        [3, 3] or [N, 3, 3] array whose columns are (right, up, forward).

    Note:
        The world up direction is fixed to +y. When the viewing direction is
        parallel to it the cross products vanish and the right/up columns
        come out as zero vectors.
    """
    world_up = np.array([0, 1, 0], dtype=np.float32)
    if opengl:
        forward_vector = safe_normalize(campos - target)
        right_vector = safe_normalize(np.cross(world_up, forward_vector))
        up_vector = safe_normalize(np.cross(forward_vector, right_vector))
    else:
        forward_vector = safe_normalize(target - campos)
        right_vector = safe_normalize(np.cross(forward_vector, world_up))
        up_vector = safe_normalize(np.cross(right_vector, forward_vector))
    # Stack along the LAST axis so the vectors become matrix columns for both
    # single ([3]) and batched ([N, 3]) inputs. Stacking along axis 1 only
    # did so for the single case and produced transposed matrices for batches.
    return np.stack([right_vector, up_vector, forward_vector], axis=-1)
# elevation & azimuth to pose (cam2world) matrix
def orbit_camera(elevation, azimuth, radius=1, is_degree=True, target=None, opengl=True):
    """Build a camera pose on a sphere around ``target``.

    Args:
        elevation: scalar, in (-90, 90), from +y to -y is (-90, 90).
        azimuth: scalar, in (-180, 180), from +z to +x is (0, 90).
        radius: scalar distance from the target.
        is_degree: When True the two angles are converted from degrees.
        target: [3] point to orbit; defaults to the origin.
        opengl: Forwarded to ``look_at``.

    Returns:
        [4, 4] float32 camera pose matrix (rotation from ``look_at`` plus the
        camera position in the last column).
    """
    if is_degree:
        elevation = np.deg2rad(elevation)
        azimuth = np.deg2rad(azimuth)

    if target is None:
        target = np.zeros([3], dtype=np.float32)

    # Spherical to Cartesian offset from the target.
    offset = np.array([
        radius * np.cos(elevation) * np.sin(azimuth),
        - radius * np.sin(elevation),
        radius * np.cos(elevation) * np.cos(azimuth),
    ])
    campos = offset + target  # [3]

    pose = np.eye(4, dtype=np.float32)
    pose[:3, :3] = look_at(campos, target, opengl)
    pose[:3, 3] = campos
    return pose
def mat2latlon(T):
    """Convert a position vector to (elevation, azimuth, radius).

    ``T`` is indexed as a 3-vector (x, y, z); it may be a NumPy array or an
    object offering ``.cpu().detach().numpy()`` (e.g. a torch tensor).

    Returns:
        ``(theta, azi, r)``: ``theta = -asin(y / r)`` and
        ``azi = atan2(x, z)``, both in degrees, and the vector norm ``r``.
    """
    xyz = T.copy() if isinstance(T, np.ndarray) else T.cpu().detach().numpy()
    r = np.linalg.norm(xyz)
    unit = xyz / r
    elevation_deg = np.rad2deg(-np.arcsin(unit[1]))
    azimuth_deg = np.rad2deg(np.arctan2(unit[0], unit[2]))
    return elevation_deg, azimuth_deg, r
def extract_camera_properties(camera_to_world_matrix):
    """Return ``(camera_position, forward)`` from a camera-to-world matrix.

    The position is the translation column ``[:3, 3]`` and the forward vector
    is the third column of the rotation part ``[:3, 2]``.
    """
    upper = camera_to_world_matrix[:3]
    return upper[:, 3], upper[:, 2]
def compute_angular_error_batch(rotation1, rotation2):
    """Geodesic angle in degrees between paired rotation matrices.

    Args:
        rotation1: NumPy array of shape (B, 3, 3).
        rotation2: NumPy array of shape (B, 3, 3).

    Returns:
        Array of shape (B,) with the angle of ``rotation1[i]^T rotation2[i]``.
    """
    relative = np.einsum("Bij,Bjk ->Bik", rotation1.transpose(0, 2, 1), rotation2)
    cos_angle = (np.trace(relative, axis1=1, axis2=2) - 1) / 2
    # Clip to guard arccos against round-off just outside [-1, 1].
    angle_rad = np.arccos(np.clip(cos_angle, -1, 1))
    return angle_rad * 180 / np.pi
def find_mask_center_and_translate(image, mask):
    """Offset of the mask's centre of mass from the image centre.

    Despite its name the function does not translate anything: it only
    returns the pixel offset one would need to re-centre the mask.

    Args:
        image (torch.Tensor): Image tensor of shape (N, C, H, W); only its
            shape and device are used.
        mask (torch.Tensor): Mask tensor of shape (N, 1, H, W), expected on
            the same device as ``image``.

    Returns:
        torch.Tensor: Detached CPU tensor ``[delta_x, delta_y]`` where each
        delta is the mask centroid minus the image centre ``(W // 2, H // 2)``.
        Shape (2,) for N == 1 and (2, N) for larger batches. An all-zero mask
        divides by zero and yields NaNs.
    """
    _, _, h, w = image.shape

    # Pixel index grids, used as weights for the centre-of-mass computation.
    y_coords, x_coords = torch.meshgrid(torch.arange(0, h), torch.arange(0, w), indexing='ij')
    x_coords = x_coords.to(image.device)
    y_coords = y_coords.to(image.device)

    total_mass = mask.sum(dim=[2, 3], keepdim=True)
    x_center = (mask * x_coords).sum(dim=[2, 3], keepdim=True) / total_mass
    y_center = (mask * y_coords).sum(dim=[2, 3], keepdim=True) / total_mass

    delta_x = x_center.squeeze() - w // 2
    delta_y = y_center.squeeze() - h // 2

    # torch.stack (unlike torch.tensor on a list of tensors) also works when
    # the deltas hold one value per batch element. detach().cpu() keeps the
    # result a plain CPU tensor, as torch.tensor([...]) produced.
    return torch.stack([delta_x, delta_y]).detach().cpu()
def create_voxel_grid(length, resolution=64):
    """Create a cubic grid of homogeneous sample coordinates.

    Args:
        length: Half-extent of the cube; every axis spans [-length, length].
        resolution: Number of samples along each axis.

    Returns:
        Tensor of shape (resolution, resolution, resolution, 4) whose last
        axis holds (x, y, z, 1).
    """
    axis = torch.linspace(-length, length, resolution)
    xx, yy, zz = torch.meshgrid(axis, axis, axis, indexing='ij')
    homogeneous_one = torch.ones_like(xx)
    return torch.stack((xx, yy, zz, homogeneous_one), dim=-1)
def project_voxels_to_image(voxels, camera, image_size=256):
    """Project homogeneous voxel coordinates to pixel coordinates.

    Args:
        voxels: Tensor of shape (..., 4) holding homogeneous coordinates.
        camera: Object exposing 4x4 tensors ``projection_matrix`` and
            ``world_view_transform``; both are transposed before use
            (presumably stored in row-vector convention - TODO confirm).
        image_size: Side length in pixels of the square target image. The
            normalised range [-1, 1] is mapped to [0, image_size - 1]. The
            default of 256 reproduces the previously hard-coded 255 factor.

    Returns:
        Tensor of shape (..., 2) with (x, y) pixel coordinates. No check is
        made for a non-positive homogeneous w.
    """
    # Flatten voxels to shape (4, N) for matrix multiplication
    voxels_flat = voxels.reshape(-1, 4).t()

    projected_voxels = camera.projection_matrix.transpose(0, 1) @ camera.world_view_transform.transpose(0, 1) @ voxels_flat

    # Convert from homogeneous coordinates to 2D (perspective divide by w)
    projected_voxels_2d = (projected_voxels[:2, :] / projected_voxels[3, :]).t()
    # Restore the grid layout and map [-1, 1] to pixel coordinates
    projected_voxels_2d = (
        projected_voxels_2d.reshape(*voxels.shape[:-1], 2) + 1.
    ) * (image_size - 1) * 0.5

    return projected_voxels_2d
def carve_voxels(voxel_grid, projected_points, mask):
    """Zero out, in place, every voxel whose projection misses the mask.

    Args:
        voxel_grid: Occupancy tensor with the same leading shape as
            ``projected_points`` (all but its last axis); modified in place.
        projected_points: Tensor of shape (..., 2) with (x, y) pixel
            coordinates.
        mask: 2D mask image indexed as ``mask[y, x]``; zero means background.

    Projections outside the image are clamped to the nearest border pixel
    rather than being treated as background.
    """
    height, width = mask.shape[0], mask.shape[1]

    # Convert projected points to integer pixel indices inside the mask.
    cols = projected_points[..., 0].clamp(0, width - 1).long()
    rows = projected_points[..., 1].clamp(0, height - 1).long()

    # Carve out voxels whose pixel is background.
    background = mask[rows, cols] == 0
    voxel_grid[background] = 0
def sample_points_from_voxel(cameras, masks, length=1, resolution=64, N=5000, inverse=False, device="cuda"):
    """Carve a voxel grid with per-view masks and sample points from it.

    A cube spanning [-length, length] on every axis is created, each camera's
    mask carves away voxels that project onto background, and ``N`` voxel
    positions are then drawn at random.

    Args:
        cameras: Iterable of cameras accepted by ``project_voxels_to_image``.
        masks (np.ndarray): One mask per camera; singleton axes are squeezed
            away, leaving (num_cameras, H, W).
        length: Half-extent of the voxel cube.
        resolution: Number of voxels along each axis.
        N (int): The number of points to sample.
        inverse: When True, sample from carved (empty) voxels instead of the
            remaining solid ones.
        device: Device used for the coordinate grid and the masks.

    Returns:
        torch.Tensor of shape (N, 3) with the sampled 3D coordinates.

    Raises:
        RuntimeError: If no voxel of the requested kind exists but N > 0.
    """
    voxel_grid = create_voxel_grid(length, resolution).to(device)
    voxel_grid_indicator = torch.ones(resolution, resolution, resolution)

    masks = torch.from_numpy(masks).to(device).squeeze()
    if masks.dim() == 2:
        # squeeze() also removed the batch axis of a single mask; restore it
        # so that masks[idx] is still a full 2D image.
        masks = masks.unsqueeze(0)

    for idx, cam in enumerate(cameras):
        projected_points = project_voxels_to_image(voxel_grid, cam)
        carve_voxels(voxel_grid_indicator, projected_points, masks[idx])

    # Identify the indices of the voxels to sample from
    wanted_value = 0 if inverse else 1
    solid_indices = torch.nonzero(voxel_grid_indicator == wanted_value)
    num_candidates = solid_indices.size(0)

    if N <= num_candidates:
        # Enough candidates: sample without replacement
        sampled_indices = solid_indices[torch.randperm(num_candidates)[:N]]
    else:
        if num_candidates == 0:
            raise RuntimeError(
                "sample_points_from_voxel: no voxels available to sample from"
            )
        # Not enough candidates: sample with replacement
        sampled_indices = solid_indices[torch.randint(0, num_candidates, (N,))]

    # Convert integer grid indices to coordinates in [-length, length]
    sampled_points = sampled_indices.float() / (voxel_grid.size(0) - 1) * 2 * length - length

    return sampled_points
class OrbitCamera:
def __init__(self, W, H, r=2, fovy=60, near=0.01, far=100):
self.W = W
self.H = H
self.radius = r # camera distance from center
self.fovy = np.deg2rad(fovy) # deg 2 rad
self.near = near
self.far = far
self.center = np.array([0, 0, 0], dtype=np.float32) # look at this point
self.rot = R.from_matrix(np.eye(3))
self.up = np.array([0, 1, 0], dtype=np.float32) # need to be normalized!
@property
def fovx(self):
return 2 * np.arctan(np.tan(self.fovy / 2) * self.W / self.H)
@property
def campos(self):
return self.pose[:3, 3]
# pose (c2w)
@property
def pose(self):
# first move camera to radius
res = np.eye(4, dtype=np.float32)
res[2, 3] = self.radius # opengl convention...
# rotate
rot = np.eye(4, dtype=np.float32)
rot[:3, :3] = self.rot.as_matrix()
res = rot @ res
# translate
res[:3, 3] -= self.center
return res
# view (w2c)
@property
def view(self):
return np.linalg.inv(self.pose)
# projection (perspective)
@property
def perspective(self):
y = np.tan(self.fovy / 2)
aspect = self.W / self.H
return np.array(
[
[1 / (y * aspect), 0, 0, 0],
[0, -1 / y, 0, 0],
[
0,
0,
-(self.far + self.near) / (self.far - self.near),
-(2 * self.far * self.near) / (self.far - self.near),
],
[0, 0, -1, 0],
],
dtype=np.float32,
)
# intrinsics
@property
def intrinsics(self):
focal = self.H / (2 * np.tan(self.fovy / 2))
return np.array([focal, focal, self.W // 2, self.H // 2], dtype=np.float32)
@property
def mvp(self):
return self.perspective @ np.linalg.inv(self.pose) # [4, 4]
def orbit(self, dx, dy):
# rotate along camera up/side axis!
side = self.rot.as_matrix()[:3, 0]
rotvec_x = self.up * np.radians(-0.05 * dx)
rotvec_y = side * np.radians(-0.05 * dy)
self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot
def scale(self, delta):
self.radius *= 1.1 ** (-delta)
def pan(self, dx, dy, dz=0):
# pan in camera coordinate system (careful on the sensitivity!)
self.center += 0.0005 * self.rot.as_matrix()[:3, :3] @ np.array([-dx, -dy, dz])