|
|
|
|
|
|
|
|
|
|
|
|
|
"""Test render speed.""" |
|
import logging |
|
import sys |
|
from os import path |
|
|
|
import torch |
|
from fvcore.common.benchmark import benchmark |
|
from pytorch3d.renderer.points.pulsar import Renderer |
|
from torch.autograd import Variable |
|
|
|
|
|
|
|
sys.path.insert(0, path.join(path.dirname(__file__), "..")) |
|
LOGGER = logging.getLogger(__name__) |
|
|
|
|
|
"""Measure the execution speed of the rendering. |
|
|
|
This measures a very pessimistic upper bound on speed, because synchronization |
|
points have to be introduced in Python. On a pure PyTorch execution pipeline, |
|
results should be significantly faster. You can get pure CUDA timings through |
|
C++ by activating `PULSAR_TIMINGS_BATCHED_ENABLED` in the file |
|
`pytorch3d/csrc/pulsar/logging.h` or defining it for your compiler. |
|
""" |
|
|
|
|
|
def _bm_pulsar():
    """Build and return a closure benchmarking the pulsar forward pass.

    Sets up a 1M-point cloud and a 1000x1000 renderer on the GPU. The
    returned closure runs one forward render followed by
    `torch.cuda.synchronize()` - the sync is required so the benchmark
    measures actual kernel execution time rather than just the
    asynchronous launch.
    """
    n_points = 1_000_000
    width = 1_000
    height = 1_000
    renderer = Renderer(width, height, n_points)
    # Generate data on CPU with a fixed seed so the point cloud is
    # reproducible independent of the CUDA RNG state.
    torch.manual_seed(1)
    vert_pos = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
    vert_pos[:, 2] += 25.0  # push the points in front of the camera
    vert_pos[:, :2] -= 5.0  # center them laterally around the optical axis
    vert_col = torch.rand(n_points, 3, dtype=torch.float32)
    vert_rad = torch.rand(n_points, dtype=torch.float32)
    # NOTE(review): camera parametrization is defined by the pulsar
    # Renderer - presumably [position(3), rotation(3), focal, sensor
    # width]; confirm against the pulsar documentation.
    cam_params = torch.tensor(
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
    )
    device = torch.device("cuda")
    vert_pos = vert_pos.to(device)
    vert_col = vert_col.to(device)
    vert_rad = vert_rad.to(device)
    cam_params = cam_params.to(device)
    renderer = renderer.to(device)
    # `torch.autograd.Variable` is deprecated since PyTorch 0.4; plain
    # tensors (requires_grad defaults to False) are fully equivalent here,
    # so the tensors are passed to the closure directly.

    def bm_closure():
        renderer.forward(
            vert_pos,
            vert_col,
            vert_rad,
            cam_params,
            1.0e-1,
            45.0,
            percent_allowed_difference=0.01,
        )
        # Synchronize so the timing covers the render kernels themselves.
        torch.cuda.synchronize()

    return bm_closure
|
|
|
|
|
def _bm_pulsar_backward():
    """Build and return a closure benchmarking the pulsar backward pass.

    Sets up the same 1M-point scene as `_bm_pulsar`, runs the forward pass
    once, and returns a closure that repeatedly backpropagates through the
    cached graph (`retain_graph=True`) followed by
    `torch.cuda.synchronize()` so the timing covers the backward kernels.
    """
    n_points = 1_000_000
    width = 1_000
    height = 1_000
    renderer = Renderer(width, height, n_points)
    # Generate data on CPU with a fixed seed so the point cloud is
    # reproducible independent of the CUDA RNG state.
    torch.manual_seed(1)
    vert_pos = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
    vert_pos[:, 2] += 25.0  # push the points in front of the camera
    vert_pos[:, :2] -= 5.0  # center them laterally around the optical axis
    vert_col = torch.rand(n_points, 3, dtype=torch.float32)
    vert_rad = torch.rand(n_points, dtype=torch.float32)
    # NOTE(review): camera parametrization is defined by the pulsar
    # Renderer - presumably [position(3), rotation(3), focal, sensor
    # width]; confirm against the pulsar documentation.
    cam_params = torch.tensor(
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
    )
    device = torch.device("cuda")
    # `torch.autograd.Variable` is deprecated since PyTorch 0.4; marking
    # the device leaf tensors with `.requires_grad_()` is the modern
    # equivalent of wrapping them in Variable(requires_grad=True).
    vert_pos = vert_pos.to(device).requires_grad_()
    vert_col = vert_col.to(device).requires_grad_()
    vert_rad = vert_rad.to(device).requires_grad_()
    cam_params = cam_params.to(device).requires_grad_()
    renderer = renderer.to(device)
    res = renderer.forward(
        vert_pos,
        vert_col,
        vert_rad,
        cam_params,
        1.0e-1,
        45.0,
        percent_allowed_difference=0.01,
    )
    # Scalar loss so a single backward call propagates to all inputs.
    loss = res.sum()

    def bm_closure():
        # retain_graph=True lets the closure run repeatedly on the graph
        # built by the single forward pass above.
        loss.backward(retain_graph=True)
        torch.cuda.synchronize()

    return bm_closure
|
|
|
|
|
def bm_pulsar() -> None:
    """Benchmark the pulsar forward and backward passes.

    Requires a CUDA device; logs a warning and returns without running
    anything when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        # Previously this skipped silently; make the skip visible.
        LOGGER.warning("CUDA not available - skipping pulsar benchmark.")
        return

    benchmark(_bm_pulsar, "PULSAR_FORWARD", [{}], warmup_iters=3)
    benchmark(_bm_pulsar_backward, "PULSAR_BACKWARD", [{}], warmup_iters=3)
|
|
|
|
|
if __name__ == "__main__":
    # Configure logging so benchmark/skip messages are visible when the
    # file is executed as a script.
    logging.basicConfig(level=logging.INFO)
    bm_pulsar()
|
|