# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
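"""Raymarcher for a Mixture of Volumetric Primitives (MVP): a BVH-accelerated
custom autograd Function backed by the mvpraymarchlib CUDA extension, the
mvpraymarch() convenience wrapper, and a gradcheck routine that validates the
CUDA kernels against a pure PyTorch reference implementation."""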
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
try:
    from . import mvpraymarchlib
except ImportError:
    import mvpraymarchlib
def build_accel(primtransfin, algo, fixedorder=False):
"""build bvh structure given primitive centers and sizes
Parameters:
----------
primtransfin : tuple[tensor, tensor, tensor]
primitive transform tensors
algo : int
raymarching algorithm
fixedorder : optional[str]
True means the bvh builder will not reorder primitives and will
use a trivial tree structure. Likely to be slow for arbitrary
configurations of primitives.
"""
primpos, primrot, primscale = primtransfin
N = primpos.size(0)
K = primpos.size(1)
dev = primpos.device
# compute and sort morton codes
if fixedorder:
sortedobjid = (torch.arange(N*K, dtype=torch.int32, device=dev) % K).view(N, K)
else:
cmax = primpos.max(dim=1, keepdim=True)[0]
cmin = primpos.min(dim=1, keepdim=True)[0]
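        # normalize primitive centers into [0, 1]^3 before computing morton codes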
centers_norm = (primpos - cmin) / (cmax - cmin).clamp(min=1e-8)
mortoncode = torch.empty((N, K), dtype=torch.int32, device=dev)
mvpraymarchlib.compute_morton(centers_norm, mortoncode, algo)
sortedcode, sortedobjid_long = torch.sort(mortoncode, dim=-1)
sortedobjid = sortedobjid_long.int()
if fixedorder:
nodechildren = torch.cat([
torch.arange(1, (K - 1) * 2 + 1, dtype=torch.int32, device=dev),
torch.div(torch.arange(-2, -(K * 2 + 1) - 1, -1, dtype=torch.int32, device=dev), 2, rounding_mode="floor")],
dim=0).view(1, K + K - 1, 2).repeat(N, 1, 1)
nodeparent = (
torch.div(torch.arange(-1, K * 2 - 2, dtype=torch.int32, device=dev), 2, rounding_mode="floor")
.view(1, -1).repeat(N, 1))
else:
nodechildren = torch.empty((N, K + K - 1, 2), dtype=torch.int32, device=dev)
nodeparent = torch.full((N, K + K - 1), -1, dtype=torch.int32, device=dev)
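        # link parent/child node indices by building a binary tree over the sorted codes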
mvpraymarchlib.build_tree(sortedcode, nodechildren, nodeparent)
nodeaabb = torch.empty((N, K + K - 1, 2, 3), dtype=torch.float32, device=dev)
mvpraymarchlib.compute_aabb(*primtransfin, sortedobjid, nodechildren, nodeparent, nodeaabb, algo)
return sortedobjid, nodechildren, nodeaabb
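
# A minimal usage sketch for build_accel (illustrative only; it assumes the
# compiled mvpraymarchlib extension and a CUDA device, and the shapes mirror
# the assertions in MVPRaymarch.forward):
#
#   N, K = 2, 64
#   primpos = torch.randn(N, K, 3, device="cuda") * 0.3           # centers
#   primrot = torch.eye(3, device="cuda").repeat(N, K, 1, 1)      # orientations
#   primscale = torch.ones(N, K, 3, device="cuda")                # inverse sizes
#   sortedobjid, nodechildren, nodeaabb = build_accel(
#       (primpos, primrot, primscale), algo=0)
#   # sortedobjid: N x K morton-sorted primitive indices
#   # nodechildren: N x (2K-1) x 2 child links for each BVH node
#   # nodeaabb: N x (2K-1) x 2 x 3 min/max corners for each BVH node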
class MVPRaymarch(Function):
"""Custom Function for raymarching Mixture of Volumetric Primitives."""
@staticmethod
def forward(self, raypos, raydir, stepsize, tminmax,
primpos, primrot, primscale,
template, warp,
rayterm, gradmode, options):
algo = options["algo"]
usebvh = options["usebvh"]
sortprims = options["sortprims"]
randomorder = options["randomorder"]
maxhitboxes = options["maxhitboxes"]
synchitboxes = options["synchitboxes"]
chlast = options["chlast"]
fadescale = options["fadescale"]
fadeexp = options["fadeexp"]
accum = options["accum"]
termthresh = options["termthresh"]
griddim = options["griddim"]
if isinstance(options["blocksize"], tuple):
blocksizex, blocksizey = options["blocksize"]
else:
blocksizex = options["blocksize"]
blocksizey = 1
assert raypos.is_contiguous() and raypos.size(3) == 3
assert raydir.is_contiguous() and raydir.size(3) == 3
assert tminmax.is_contiguous() and tminmax.size(3) == 2
assert primpos is None or primpos.is_contiguous() and primpos.size(2) == 3
assert primrot is None or primrot.is_contiguous() and primrot.size(2) == 3
assert primscale is None or primscale.is_contiguous() and primscale.size(2) == 3
if chlast:
assert template.is_contiguous() and len(template.size()) == 6 and template.size(-1) == 4
assert warp is None or (warp.is_contiguous() and warp.size(-1) == 3)
else:
assert template.is_contiguous() and len(template.size()) == 6 and template.size(2) == 4
assert warp is None or (warp.is_contiguous() and warp.size(2) == 3)
primtransfin = (primpos, primrot, primscale)
# Build bvh
if usebvh is not False:
            # sort primitives and build the tree (trivial ordering if "fixedorder")
sortedobjid, nodechildren, nodeaabb = build_accel(primtransfin,
algo, fixedorder=usebvh=="fixedorder")
assert sortedobjid.is_contiguous()
assert nodechildren.is_contiguous()
assert nodeaabb.is_contiguous()
if randomorder:
sortedobjid = sortedobjid[torch.randperm(len(sortedobjid))]
else:
            sortedobjid, nodechildren, nodeaabb = None, None, None
# march through boxes
N, H, W = raypos.size(0), raypos.size(1), raypos.size(2)
rayrgba = torch.empty((N, H, W, 4), device=raypos.device)
if gradmode:
raysat = torch.full((N, H, W, 3), -1, dtype=torch.float32, device=raypos.device)
rayterm = None
else:
raysat = None
rayterm = None
mvpraymarchlib.raymarch_forward(
raypos, raydir, stepsize, tminmax,
sortedobjid, nodechildren, nodeaabb,
*primtransfin,
template, warp,
rayrgba, raysat, rayterm,
algo, sortprims, maxhitboxes, synchitboxes, chlast,
fadescale, fadeexp,
accum, termthresh,
griddim, blocksizex, blocksizey)
self.save_for_backward(
raypos, raydir, tminmax,
sortedobjid, nodechildren, nodeaabb,
primpos, primrot, primscale,
template, warp,
rayrgba, raysat, rayterm)
self.options = options
self.stepsize = stepsize
return rayrgba
@staticmethod
def backward(self, grad_rayrgba):
(raypos, raydir, tminmax,
sortedobjid, nodechildren, nodeaabb,
primpos, primrot, primscale,
template, warp,
rayrgba, raysat, rayterm) = self.saved_tensors
algo = self.options["algo"]
usebvh = self.options["usebvh"]
sortprims = self.options["sortprims"]
maxhitboxes = self.options["maxhitboxes"]
synchitboxes = self.options["synchitboxes"]
chlast = self.options["chlast"]
fadescale = self.options["fadescale"]
fadeexp = self.options["fadeexp"]
accum = self.options["accum"]
termthresh = self.options["termthresh"]
griddim = self.options["griddim"]
if isinstance(self.options["bwdblocksize"], tuple):
blocksizex, blocksizey = self.options["bwdblocksize"]
else:
blocksizex = self.options["bwdblocksize"]
blocksizey = 1
stepsize = self.stepsize
grad_primpos = torch.zeros_like(primpos)
grad_primrot = torch.zeros_like(primrot)
grad_primscale = torch.zeros_like(primscale)
primtransfin = (primpos, grad_primpos, primrot, grad_primrot, primscale, grad_primscale)
grad_template = torch.zeros_like(template)
grad_warp = torch.zeros_like(warp) if warp is not None else None
mvpraymarchlib.raymarch_backward(raypos, raydir, stepsize, tminmax,
sortedobjid, nodechildren, nodeaabb,
*primtransfin,
template, grad_template, warp, grad_warp,
rayrgba, grad_rayrgba.contiguous(), raysat, rayterm,
algo, sortprims, maxhitboxes, synchitboxes, chlast,
fadescale, fadeexp,
accum, termthresh,
griddim, blocksizex, blocksizey)
return (None, None, None, None,
grad_primpos, grad_primrot, grad_primscale,
grad_template, grad_warp,
None, None, None)
def mvpraymarch(raypos, raydir, stepsize, tminmax,
primtransf,
template, warp,
rayterm=None,
algo=0, usebvh="fixedorder",
sortprims=False, randomorder=False,
maxhitboxes=512, synchitboxes=True,
chlast=True, fadescale=8., fadeexp=8.,
accum=0, termthresh=0.,
griddim=3, blocksize=(8, 16), bwdblocksize=(8, 16)):
"""Main entry point for raymarching MVP.
    Parameters
    ----------
    raypos: N x H x W x 3 tensor of ray origins
    raydir: N x H x W x 3 tensor of ray directions
    stepsize: raymarching step size
    tminmax: N x H x W x 2 tensor of raymarching min/max bounds
    primtransf: primitive transforms, either a tuple (primpos, primrot,
        primscale) or a packed N x K x 5 x 3 tensor (row 0 the center,
        rows 1-3 the rotation matrix, row 4 the inverse scale), where
        primpos: N x K x 3 tensor of primitive centers
        primrot: N x K x 3 x 3 tensor of primitive orientations
        primscale: N x K x 3 tensor of primitive inverse dimension lengths
    template: N x K x 4 x TD x TH x TW tensor of K RGBA primitives
        (N x K x TD x TH x TW x 4 when chlast=True)
    warp: N x K x 3 x TD x TH x TW tensor of K warp fields (optional)
    algo: raymarching algorithm (valid values: 0, 1). algo=0 is the fastest.
        Currently algo=0 has a limit of 512 primitives per ray, so problems
        can occur if there are many more boxes. All sortprims=True options
        have this limitation, but (algo=1, sortprims=False,
        usebvh="fixedorder") works correctly and has no limit on the number
        of primitives (though it is slightly slower).
    usebvh: True to use a BVH, "fixedorder" for a simple fixed-order BVH,
        False for no BVH
    sortprims: True to sort overlapping primitives at each sample point.
        Must be True for gradients to match the PyTorch gradients. Results
        may be unstable if False, but sorting is not a big performance
        bottleneck.
    chlast: whether template (and warp) are provided channels-last; True
        tends to be faster.
    fadescale, fadeexp: opacity is faded at the borders of the primitives by
        the factor exp(-fadescale * x ** fadeexp), where x is the normalized
        coordinate within the primitive.
    griddim: CUDA grid dimensionality.
    blocksize: blocksize of the forward CUDA kernel. Should be a 2-element
        tuple if griddim > 1, or an integer if griddim == 1.
    bwdblocksize: blocksize of the backward CUDA kernel (same format as
        blocksize)."""
if isinstance(primtransf, tuple):
primpos, primrot, primscale = primtransf
else:
primpos, primrot, primscale = (
primtransf[:, :, 0, :].contiguous(),
primtransf[:, :, 1:4, :].contiguous(),
primtransf[:, :, 4, :].contiguous())
primtransfin = (primpos, primrot, primscale)
out = MVPRaymarch.apply(raypos, raydir, stepsize, tminmax,
*primtransfin,
template, warp,
rayterm, torch.is_grad_enabled(),
{"algo": algo, "usebvh": usebvh, "sortprims": sortprims, "randomorder": randomorder,
"maxhitboxes": maxhitboxes, "synchitboxes": synchitboxes,
"chlast": chlast, "fadescale": fadescale, "fadeexp": fadeexp,
"accum": accum, "termthresh": termthresh,
"griddim": griddim, "blocksize": blocksize, "bwdblocksize": bwdblocksize})
return out
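
# A minimal usage sketch for mvpraymarch (illustrative only; assumes the
# compiled CUDA extension and a CUDA device; tensor values are random
# placeholders):
#
#   N, H, W, K, M = 1, 256, 256, 8, 16
#   raypos = torch.zeros(N, H, W, 3, device="cuda")                  # ray origins
#   raydir = F.normalize(torch.randn(N, H, W, 3, device="cuda"), dim=-1)
#   tminmax = torch.tensor([0.1, 10.0], device="cuda").expand(N, H, W, 2).contiguous()
#   primpos = torch.randn(N, K, 3, device="cuda") * 0.3
#   primrot = torch.eye(3, device="cuda").repeat(N, K, 1, 1)
#   primscale = torch.ones(N, K, 3, device="cuda")
#   template = torch.randn(N, K, M, M, M, 4, device="cuda")          # chlast layout
#   rgba = mvpraymarch(raypos, raydir, 0.1, tminmax,
#                      (primpos, primrot, primscale), template, None)
#   # rgba: N x H x W x 4 composited color and accumulated opacity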
class Rodrigues(nn.Module):
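    """Converts axis-angle vectors to rotation matrices via the Rodrigues
    formula: an N x 3 batch of vectors (direction = rotation axis, magnitude
    = angle in radians) maps to N x 3 x 3 rotation matrices. The 1e-5 inside
    the sqrt guards against division by zero for near-zero rotations."""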
def __init__(self):
super(Rodrigues, self).__init__()
def forward(self, rvec):
theta = torch.sqrt(1e-5 + torch.sum(rvec ** 2, dim=1))
rvec = rvec / theta[:, None]
costh = torch.cos(theta)
sinth = torch.sin(theta)
return torch.stack((
rvec[:, 0] ** 2 + (1. - rvec[:, 0] ** 2) * costh,
rvec[:, 0] * rvec[:, 1] * (1. - costh) - rvec[:, 2] * sinth,
rvec[:, 0] * rvec[:, 2] * (1. - costh) + rvec[:, 1] * sinth,
rvec[:, 0] * rvec[:, 1] * (1. - costh) + rvec[:, 2] * sinth,
rvec[:, 1] ** 2 + (1. - rvec[:, 1] ** 2) * costh,
rvec[:, 1] * rvec[:, 2] * (1. - costh) - rvec[:, 0] * sinth,
rvec[:, 0] * rvec[:, 2] * (1. - costh) - rvec[:, 1] * sinth,
rvec[:, 1] * rvec[:, 2] * (1. - costh) + rvec[:, 0] * sinth,
rvec[:, 2] ** 2 + (1. - rvec[:, 2] ** 2) * costh), dim=1).view(-1, 3, 3)
def gradcheck(usebvh=True, sortprims=True, maxhitboxes=512, synchitboxes=False,
dowarp=False, chlast=False, fadescale=8., fadeexp=8.,
accum=0, termthresh=0., algo=0, griddim=2, blocksize=(8, 16), bwdblocksize=(8, 16)):
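    """Compare the CUDA raymarcher against a pure PyTorch reference:
    run both forward and backward, report timings, and print the maximum
    absolute difference and normalized dot product between the outputs and
    between each parameter gradient."""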
N = 2
H = 65
W = 65
k3 = 4
K = k3*k3*k3
M = 32
print("=================================================================")
print("usebvh={}, sortprims={}, maxhb={}, synchb={}, dowarp={}, chlast={}, "
"fadescale={}, fadeexp={}, accum={}, termthresh={}, algo={}, griddim={}, "
"blocksize={}, bwdblocksize={}".format(
usebvh, sortprims, maxhitboxes, synchitboxes, dowarp, chlast,
fadescale, fadeexp, accum, termthresh, algo, griddim, blocksize,
bwdblocksize))
# generate random inputs
torch.manual_seed(1112)
coherent_rays = True
if not coherent_rays:
_raypos = torch.randn(N, H, W, 3).to("cuda")
_raydir = torch.randn(N, H, W, 3).to("cuda")
_raydir /= torch.sqrt(torch.sum(_raydir ** 2, dim=-1, keepdim=True))
else:
focal = torch.tensor([[W*4.0, W*4.0] for n in range(N)])
princpt = torch.tensor([[W*0.5, H*0.5] for n in range(N)])
pixely, pixelx = torch.meshgrid(torch.arange(H).float(), torch.arange(W).float())
pixelcoords = torch.stack([pixelx, pixely], dim=-1)[None, :, :, :].repeat(N, 1, 1, 1)
raydir = (pixelcoords - princpt[:, None, None, :]) / focal[:, None, None, :]
raydir = torch.cat([raydir, torch.ones_like(raydir[:, :, :, 0:1])], dim=-1)
raydir = raydir / torch.sqrt(torch.sum(raydir ** 2, dim=-1, keepdim=True))
_raypos = torch.tensor([-0.0, 0.0, -4.])[None, None, None, :].repeat(N, H, W, 1).to("cuda")
_raydir = raydir.to("cuda")
_raydir /= torch.sqrt(torch.sum(_raydir ** 2, dim=-1, keepdim=True))
max_len = 6.0
_stepsize = max_len / 15.386928
_tminmax = max_len*torch.arange(2, dtype=torch.float32)[None, None, None, :].repeat(N, H, W, 1).to("cuda") + \
torch.rand(N, H, W, 2, device="cuda") * 1.
_template = torch.randn(N, K, 4, M, M, M, requires_grad=True)
_template.data[:, :, -1, :, :, :] -= 3.5
_template = _template.contiguous().detach().clone()
_template.requires_grad = True
gridxyz = torch.stack(torch.meshgrid(
torch.linspace(-1., 1., M//2),
torch.linspace(-1., 1., M//2),
torch.linspace(-1., 1., M//2))[::-1], dim=0).contiguous()
_warp = (torch.randn(N, K, 3, M//2, M//2, M//2) * 0.01 + gridxyz[None, None, :, :, :, :]).contiguous().detach().clone()
_warp.requires_grad = True
    _primpos = torch.randn(N, K, 3, requires_grad=True)
coherent_centers = True
if coherent_centers:
ns = k3
#assert ns*ns*ns==K
grid3d = torch.stack(torch.meshgrid(
torch.linspace(-1., 1., ns),
torch.linspace(-1., 1., ns),
torch.linspace(-1., 1., K//(ns*ns)))[::-1], dim=0)[None]
_primpos = ((
grid3d.permute((0, 2, 3, 4, 1)).reshape(1, K, 3).expand(N, -1, -1) +
0.1 * torch.randn(N, K, 3, requires_grad=True)
)).contiguous().detach().clone()
_primpos.requires_grad = True
scale_ws = 1.
_primrot = torch.randn(N, K, 3)
rodrigues = Rodrigues()
_primrot = rodrigues(_primrot.view(-1, 3)).view(N, K, 3, 3).contiguous().detach().clone()
_primrot.requires_grad = True
_primscale = torch.randn(N, K, 3, requires_grad=True)
_primscale.data *= 0.0
if dowarp:
params = [_template, _warp, _primscale, _primrot, _primpos]
paramnames = ["template", "warp", "primscale", "primrot", "primpos"]
else:
params = [_template, _primscale, _primrot, _primpos]
paramnames = ["template", "primscale", "primrot", "primpos"]
termthreshorig = termthresh
########################### run pytorch version ###########################
raypos = _raypos
raydir = _raydir
stepsize = _stepsize
tminmax = _tminmax
template = F.softplus(_template.to("cuda") * 1.5) if algo != 2 else _template.to("cuda") * 1.5
warp = _warp.to("cuda")
primpos = _primpos.to("cuda") * 0.3
primrot = _primrot.to("cuda")
primscale = scale_ws * torch.exp(0.1 * _primscale.to("cuda"))
# python raymarching implementation
rayrgba = torch.zeros((N, H, W, 4)).to("cuda")
raypos = raypos + raydir * tminmax[:, :, :, 0, None]
t = tminmax[:, :, :, 0]
step = 0
t0 = t.detach().clone()
raypos0 = raypos.detach().clone()
torch.cuda.synchronize()
time0 = time.time()
while (t < tminmax[:, :, :, 1]).any():
for k in range(K):
y0 = torch.bmm(
(raypos - primpos[:, k, None, None, :]).view(raypos.size(0), -1, raypos.size(3)),
primrot[:, k, :, :]).view_as(raypos) * primscale[:, k, None, None, :]
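            # fade opacity toward the primitive border:
            # exp(-fadescale * sum_i |y0_i| ** fadeexp)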
fade = torch.exp(-fadescale * torch.sum(torch.abs(y0) ** fadeexp, dim=-1, keepdim=True))
if dowarp:
y1 = F.grid_sample(
warp[:, k, :, :, :, :],
y0[:, None, :, :, :], align_corners=True)[:, :, 0, :, :].permute(0, 2, 3, 1)
else:
y1 = y0
sample = F.grid_sample(
template[:, k, :, :, :, :],
y1[:, None, :, :, :], align_corners=True)[:, :, 0, :, :].permute(0, 2, 3, 1)
valid1 = (
torch.prod(y0[:, :, :, :] >= -1., dim=-1, keepdim=True) *
torch.prod(y0[:, :, :, :] <= 1., dim=-1, keepdim=True))
valid = ((t >= tminmax[:, :, :, 0]) & (t < tminmax[:, :, :, 1])).float()[:, :, :, None]
alpha0 = sample[:, :, :, 3:4]
rgb = sample[:, :, :, 0:3] * valid * valid1
alpha = alpha0 * fade * stepsize * valid * valid1
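            # accum == 0: clamped additive compositing -- total opacity grows
            # until it saturates at 1, and color is weighted by each sample's
            # clamped alpha increment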
if accum == 0:
newalpha = rayrgba[:, :, :, 3:4] + alpha
contrib = (newalpha.clamp(max=1.0) - rayrgba[:, :, :, 3:4]) * valid * valid1
rayrgba = rayrgba + contrib * torch.cat([rgb, torch.ones_like(alpha)], dim=-1)
else:
                raise NotImplementedError(
                    "accum={} not implemented in the PyTorch reference".format(accum))
step += 1
t = t0 + stepsize * step
raypos = raypos0 + raydir * stepsize * step
print(rayrgba[..., -1].min().item(), rayrgba[..., -1].max().item())
sample0 = rayrgba
torch.cuda.synchronize()
time1 = time.time()
sample0.backward(torch.ones_like(sample0))
torch.cuda.synchronize()
time2 = time.time()
print("{:<10} {:>10} {:>10} {:>10}".format("", "fwd", "bwd", "total"))
print("{:<10} {:10.5} {:10.5} {:10.5}".format("pytime", time1 - time0, time2 - time1, time2 - time0))
grads0 = [p.grad.detach().clone() for p in params]
for p in params:
p.grad.detach_()
p.grad.zero_()
############################## run cuda version ###########################
raypos = _raypos
raydir = _raydir
stepsize = _stepsize
tminmax = _tminmax
template = F.softplus(_template.to("cuda") * 1.5) if algo != 2 else _template.to("cuda") * 1.5
warp = _warp.to("cuda")
if chlast:
template = template.permute(0, 1, 3, 4, 5, 2).contiguous()
warp = warp.permute(0, 1, 3, 4, 5, 2).contiguous()
primpos = _primpos.to("cuda") * 0.3
primrot = _primrot.to("cuda")
primscale = scale_ws * torch.exp(0.1 * _primscale.to("cuda"))
niter = 1
tf, tb = 0., 0.
for i in range(niter):
        for p in params:
            if p.grad is not None:
                p.grad.detach_()
                p.grad.zero_()
        torch.cuda.synchronize()
        t0 = time.time()
sample1 = mvpraymarch(raypos, raydir, stepsize, tminmax,
(primpos, primrot, primscale),
template, warp if dowarp else None,
algo=algo, usebvh=usebvh, sortprims=sortprims,
maxhitboxes=maxhitboxes, synchitboxes=synchitboxes,
chlast=chlast, fadescale=fadescale, fadeexp=fadeexp,
accum=accum, termthresh=termthreshorig,
griddim=griddim, blocksize=blocksize, bwdblocksize=bwdblocksize)
        torch.cuda.synchronize()
        t1 = time.time()
sample1.backward(torch.ones_like(sample1), retain_graph=True)
torch.cuda.synchronize()
t2 = time.time()
tf += t1 - t0
tb += t2 - t1
print("{:<10} {:10.5} {:10.5} {:10.5}".format("time", tf / niter, tb / niter, (tf + tb) / niter))
grads1 = [p.grad.detach().clone() for p in params]
############# compare results #############
print("-----------------------------------------------------------------")
print("{:>10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}".format("", "maxabsdiff", "dp", "||py||", "||cuda||", "index", "py", "cuda"))
ind = torch.argmax(torch.abs(sample0 - sample1))
print("{:<10} {:>10.5} {:>10.5} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
"fwd",
torch.max(torch.abs(sample0 - sample1)).item(),
(torch.sum(sample0 * sample1) / torch.sqrt(torch.sum(sample0 * sample0) * torch.sum(sample1 * sample1))).item(),
torch.sqrt(torch.sum(sample0 * sample0)).item(),
torch.sqrt(torch.sum(sample1 * sample1)).item(),
ind.item(),
sample0.view(-1)[ind].item(),
sample1.view(-1)[ind].item()))
for p, g0, g1 in zip(paramnames, grads0, grads1):
ind = torch.argmax(torch.abs(g0 - g1))
print("{:<10} {:>10.5} {:>10.5} {:>10.5} {:>10.5} {:>10} {:>10.5} {:>10.5}".format(
p,
torch.max(torch.abs(g0 - g1)).item(),
(torch.sum(g0 * g1) / torch.sqrt(torch.sum(g0 * g0) * torch.sum(g1 * g1))).item(),
torch.sqrt(torch.sum(g0 * g0)).item(),
torch.sqrt(torch.sum(g1 * g1)).item(),
ind.item(),
g0.view(-1)[ind].item(),
g1.view(-1)[ind].item()))
if __name__ == "__main__":
gradcheck(usebvh="fixedorder", sortprims=False, maxhitboxes=512, synchitboxes=True,
dowarp=False, chlast=True, fadescale=6.5, fadeexp=7.5, accum=0, algo=0, griddim=3)
gradcheck(usebvh="fixedorder", sortprims=False, maxhitboxes=512, synchitboxes=True,
dowarp=True, chlast=True, fadescale=6.5, fadeexp=7.5, accum=0, algo=1, griddim=3)