Spaces: Running on Zero
init
- README.md +1 -1
- app.py +192 -0
- examples/barrel.png +3 -0
- examples/cactus.png +3 -0
- examples/cyan_car.png +3 -0
- examples/pickup.png +3 -0
- examples/rabbit.png +3 -0
- examples/robot.png +3 -0
- examples/swivelchair.png +3 -0
- examples/teapot.png +3 -0
- examples/warhammer.png +3 -0
- flow/__init__.py +0 -0
- flow/configs/__init__.py +0 -0
- flow/configs/big_parts_strict_pvae.py +33 -0
- flow/configs/schema.py +57 -0
- flow/flow_matching.py +58 -0
- flow/model.py +336 -0
- flow/modules/__init__.py +0 -0
- flow/modules/dit.py +235 -0
- flow/scripts/infer.py +180 -0
- flow/utils.py +119 -0
- requirements.txt +16 -0
- vae/__init__.py +0 -0
- vae/configs/__init__.py +0 -0
- vae/configs/part_woenc.py +30 -0
- vae/configs/schema.py +55 -0
- vae/model.py +451 -0
- vae/modules/__init__.py +0 -0
- vae/modules/attention.py +261 -0
- vae/modules/transformer.py +117 -0
- vae/scripts/infer.py +142 -0
- vae/utils.py +315 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: PartPacker
-emoji:
+emoji: 🪴
 colorFrom: blue
 colorTo: gray
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,192 @@
import os
import numpy as np
import cv2
import kiui
import trimesh
import torch
import rembg
from datetime import datetime
import subprocess
import gradio as gr

try:
    # running on Hugging Face Spaces
    import spaces

except ImportError:
    # running locally, use a dummy space
    class spaces:
        class GPU:
            def __init__(self, duration=60):
                self.duration = duration
            def __call__(self, func):
                return func


from flow.model import Model
from flow.configs.schema import ModelConfig
from flow.utils import get_random_color, recenter_foreground
from vae.utils import postprocess_mesh

# download checkpoints
from huggingface_hub import hf_hub_download
flow_ckpt_path = hf_hub_download(repo_id="nvidia/PartPacker", filename="flow.pt")
vae_ckpt_path = hf_hub_download(repo_id="nvidia/PartPacker", filename="vae.pt")

TRIMESH_GLB_EXPORT = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]]).astype(np.float32)
MAX_SEED = np.iinfo(np.int32).max
bg_remover = rembg.new_session()

# model config
model_config = ModelConfig(
    vae_conf="vae.configs.part_woenc",
    vae_ckpt_path=vae_ckpt_path,
    qknorm=True,
    qknorm_type="RMSNorm",
    use_pos_embed=False,
    dino_model="dinov2_vitg14",
    hidden_dim=1536,
    flow_shift=3.0,
    logitnorm_mean=1.0,
    logitnorm_std=1.0,
    latent_size=4096,
    use_parts=True,
)

# instantiate model
model = Model(model_config).eval().cuda().bfloat16()

# load weight
ckpt_dict = torch.load(flow_ckpt_path, weights_only=True)
model.load_state_dict(ckpt_dict, strict=True)

# process function
@spaces.GPU(duration=120)
def process(input_image, input_num_steps=30, input_cfg_scale=7.5, grid_res=384, seed=42, randomize_seed=True):

    # seed
    if randomize_seed:
        seed = np.random.randint(0, MAX_SEED)
    kiui.seed_everything(seed)

    # output path
    os.makedirs("output", exist_ok=True)
    output_glb_path = f"output/partpacker_{datetime.now().strftime('%Y%m%d_%H%M%S')}.glb"

    # input image
    input_image = np.array(input_image)  # uint8

    # bg removal if there is no alpha channel
    if input_image.shape[-1] == 3:
        input_image = rembg.remove(input_image, session=bg_remover)  # [H, W, 4]
    mask = input_image[..., -1] > 0
    image = recenter_foreground(input_image, mask, border_ratio=0.1)
    image = cv2.resize(image, (518, 518), interpolation=cv2.INTER_LINEAR)
    image = image.astype(np.float32) / 255.0
    image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])  # white background

    image_tensor = torch.from_numpy(image).permute(2, 0, 1).contiguous().unsqueeze(0).float().cuda()
    data = {"cond_images": image_tensor}

    with torch.inference_mode():
        results = model(data, num_steps=input_num_steps, cfg_scale=input_cfg_scale)

    latent = results["latent"]

    # query mesh

    data_part0 = {"latent": latent[:, : model.config.latent_size, :]}
    data_part1 = {"latent": latent[:, model.config.latent_size :, :]}

    with torch.inference_mode():
        results_part0 = model.vae(data_part0, resolution=grid_res)
        results_part1 = model.vae(data_part1, resolution=grid_res)

    vertices, faces = results_part0["meshes"][0]
    mesh_part0 = trimesh.Trimesh(vertices, faces)
    mesh_part0.vertices = mesh_part0.vertices @ TRIMESH_GLB_EXPORT.T
    mesh_part0 = postprocess_mesh(mesh_part0, 5e4)
    parts = mesh_part0.split(only_watertight=False)

    vertices, faces = results_part1["meshes"][0]
    mesh_part1 = trimesh.Trimesh(vertices, faces)
    mesh_part1.vertices = mesh_part1.vertices @ TRIMESH_GLB_EXPORT.T
    mesh_part1 = postprocess_mesh(mesh_part1, 5e4)
    parts.extend(mesh_part1.split(only_watertight=False))

    # split connected components and assign different colors
    for j, part in enumerate(parts):
        # each component uses a random color
        part.visual.vertex_colors = get_random_color(j, use_float=True)

    mesh = trimesh.Scene(parts)
    # export the whole mesh
    mesh.export(output_glb_path)

    return seed, image, output_glb_path

# gradio UI

_TITLE = '''PartPacker: Efficient Part-level 3D Object Generation via Dual Volume Packing'''

_DESCRIPTION = '''
<div>
<a style="display:inline-block" href="https://research.nvidia.com/labs/dir/partpacker/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
<a style="display:inline-block; margin-left: .5em" href="https://github.com/NVlabs/PartPacker"><img src='https://img.shields.io/github/stars/NVlabs/PartPacker?style=social'/></a>
</div>

* Each part is visualized with a random color, and can be separated in the GLB file.
* If the output is not satisfactory, please try different random seeds!
'''

block = gr.Blocks(title=_TITLE).queue()
with block:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown('# ' + _TITLE)
            gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=2):
            # input image
            input_image = gr.Image(label="Image", type='pil')
            # inference steps
            input_num_steps = gr.Slider(label="Inference steps", minimum=1, maximum=100, step=1, value=30)
            # cfg scale
            input_cfg_scale = gr.Slider(label="CFG scale", minimum=2, maximum=10, step=0.1, value=7.5)
            # grid resolution
            input_grid_res = gr.Slider(label="Grid resolution", minimum=256, maximum=512, step=1, value=384)
            # random seed
            seed = gr.Slider(label="Random seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            # gen button
            button_gen = gr.Button("Generate")

        with gr.Column(scale=4):
            with gr.Tab("3D Model"):
                # glb file
                output_model = gr.Model3D(label="Geometry", height=380)

            with gr.Tab("Input Image"):
                # background removed image
                output_image = gr.Image(interactive=False, show_label=False)

        with gr.Column(scale=1):
            gr.Examples(
                examples=[
                    ["examples/barrel.png"],
                    ["examples/cactus.png"],
                    ["examples/cyan_car.png"],
                    ["examples/pickup.png"],
                    ["examples/swivelchair.png"],
                    ["examples/warhammer.png"],
                ],
                inputs=[input_image],
                cache_examples=False,
            )

    button_gen.click(process, inputs=[input_image, input_num_steps, input_cfg_scale, input_grid_res, seed, randomize_seed], outputs=[seed, output_image, output_model])

block.launch()
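
The exported GLB keeps each colored part as a separate geometry in the scene, so it can be inspected programmatically after generation. A minimal sketch using trimesh; the filename is a placeholder for whatever `process()` wrote to `output/`:

```python
# sketch: reopen an exported GLB and list its parts (filename is a placeholder)
import trimesh

scene = trimesh.load("output/partpacker_20250101_000000.glb")
for name, geom in scene.geometry.items():
    # each part is stored as its own geometry in the scene
    print(name, geom.vertices.shape, geom.faces.shape)
```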
examples/barrel.png
ADDED (binary image, Git LFS)
examples/cactus.png
ADDED (binary image, Git LFS)
examples/cyan_car.png
ADDED (binary image, Git LFS)
examples/pickup.png
ADDED (binary image, Git LFS)
examples/rabbit.png
ADDED (binary image, Git LFS)
examples/robot.png
ADDED (binary image, Git LFS)
examples/swivelchair.png
ADDED (binary image, Git LFS)
examples/teapot.png
ADDED (binary image, Git LFS)
examples/warhammer.png
ADDED (binary image, Git LFS)

flow/__init__.py
ADDED
File without changes

flow/configs/__init__.py
ADDED
File without changes
flow/configs/big_parts_strict_pvae.py
ADDED
@@ -0,0 +1,33 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from flow.configs.schema import ModelConfig


def make_config():

    model_config = ModelConfig(
        vae_conf="vae.configs.part_woenc",
        vae_ckpt_path="pretrained/vae.pt",
        qknorm=True,
        qknorm_type="RMSNorm",
        use_pos_embed=False,
        dino_model="dinov2_vitg14",
        hidden_dim=1536,
        flow_shift=3.0,
        logitnorm_mean=1.0,
        logitnorm_std=1.0,
        latent_size=4096,
        use_parts=True,
    )

    return model_config
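
Configs like this one are addressed by dotted module path, mirroring the `importlib` lookup used in flow/scripts/infer.py and in the flow model's VAE loading. A minimal sketch, assuming the repo root is on PYTHONPATH:

```python
# sketch: resolve a config module by name and build its ModelConfig
import importlib

config = importlib.import_module("flow.configs.big_parts_strict_pvae").make_config()
print(config.flow_shift, config.latent_size)  # 3.0 4096
```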
flow/configs/schema.py
ADDED
@@ -0,0 +1,57 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from typing import Literal, Optional

import attrs


@attrs.define(slots=False)
class ModelConfig:
    # vae
    vae_conf: str = "vae.configs.part_woenc"
    vae_ckpt_path: Optional[str] = None

    # learn & generate parts
    use_parts: bool = False
    part_embed_mode: Literal["element", "part", "part2_only"] = "part2_only"
    shuffle_parts: bool = False
    use_num_parts_cond: bool = False

    # flow matching hyper-params
    flow_shift: float = 1.0
    logitnorm_mean: float = 0.0
    logitnorm_std: float = 1.0

    # image encoder
    dino_model: Literal["dinov2_vitl14_reg", "dinov2_vitg14"] = "dinov2_vitg14"

    # backbone DiT
    hidden_dim: int = 1536
    num_heads: int = 16
    num_layers: int = 24
    qknorm: bool = True
    qknorm_type: Literal["LayerNorm", "RMSNorm"] = "RMSNorm"
    use_pos_embed: bool = False

    # latent code
    latent_size: Optional[int] = None  # if None, will load from vae
    latent_dim: Optional[int] = None

    # preload vae weights
    preload_vae: bool = True

    # preload dinov2 weights
    preload_dinov2: bool = True

    # init weights from a pretrained checkpoint
    pretrain_path: Optional[str] = None
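
Since ModelConfig is a plain attrs class, individual fields can be overridden per experiment; an unset latent_size/latent_dim is later filled in from the VAE config inside flow/model.py. A small illustrative sketch (values are illustrative, not a released config):

```python
# sketch: defaults vs. per-experiment overrides
from flow.configs.schema import ModelConfig

default_cfg = ModelConfig()                  # latent_size stays None until the VAE config fills it
custom_cfg = ModelConfig(flow_shift=3.0, use_parts=True)
print(default_cfg.latent_size, custom_cfg.flow_shift)  # None 3.0
```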
flow/flow_matching.py
ADDED
@@ -0,0 +1,58 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import numpy as np
import torch


class FlowMatchingScheduler:
    def __init__(self, num_train_timesteps: int = 1000, shift: float = 1):
        # set timesteps
        self.num_train_timesteps = num_train_timesteps
        self.shift = shift

        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)

        sigmas = timesteps / num_train_timesteps
        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)

        self.sigmas = sigmas  # 1 --> 0
        self.timesteps = sigmas * num_train_timesteps  # num_train_timesteps --> 1

    # set device
    def to(self, device):
        self.sigmas = self.sigmas.to(device=device)
        self.timesteps = self.timesteps.to(device=device)

    # add random noise to latent during training
    def add_noise(self, latent: torch.Tensor, logit_mean: float = 1.0, logit_std: float = 1.0):
        # latent: [B, ...]
        # timesteps: [B]
        # return: [B, ...] noisy_latent, [B, ...] noise, [B] timesteps

        # logit-normal sampling
        u = torch.normal(mean=logit_mean, std=logit_std, size=(latent.shape[0],), device=self.sigmas.device)
        u = torch.nn.functional.sigmoid(u)

        step_indices = (u * self.num_train_timesteps).long()
        timesteps = self.timesteps[step_indices]

        sigmas = self.sigmas[step_indices].flatten()

        while len(sigmas.shape) < latent.ndim:
            sigmas = sigmas.unsqueeze(-1)

        noise = torch.randn_like(latent)
        noisy_latent = (1.0 - sigmas) * latent + sigmas * noise

        return noisy_latent, noise, timesteps
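
The scheduler implements the linear flow-matching interpolation noisy_latent = (1 - sigma) * latent + sigma * noise, with logit-normal timestep sampling. A quick shape check, assuming the repo is importable and using toy tensor sizes:

```python
# sketch: sample noisy latents from the scheduler (toy shapes)
import torch
from flow.flow_matching import FlowMatchingScheduler

scheduler = FlowMatchingScheduler(num_train_timesteps=1000, shift=3.0)
latent = torch.randn(2, 16, 4)                       # [B, tokens, latent_dim]
noisy, noise, t = scheduler.add_noise(latent, logit_mean=1.0, logit_std=1.0)
print(noisy.shape, noise.shape, t.shape)             # [2, 16, 4], [2, 16, 4], [2]
```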
flow/model.py
ADDED
@@ -0,0 +1,336 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import importlib

from transformers import Dinov2Model
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from torchvision import transforms

from flow.configs.schema import ModelConfig
from flow.flow_matching import FlowMatchingScheduler
from flow.modules.dit import DiT
from vae.model import Model as VAE
from vae.utils import sync_timer


class Model(nn.Module):
    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.config = config
        self.precision = torch.bfloat16

        # image condition model (dinov2)
        if self.config.dino_model == "dinov2_vitg14":
            self.dino = Dinov2Model.from_pretrained("facebook/dinov2-giant")
        elif self.config.dino_model == "dinov2_vitl14_reg":
            self.dino = Dinov2Model.from_pretrained("facebook/dinov2-with-registers-large")
        else:
            raise ValueError(f"DINOv2 model {self.config.dino_model} not supported")

        # hack to match our implementation
        self.dino.layernorm = torch.nn.Identity()

        self.dino.eval().to(dtype=self.precision)
        self.dino.requires_grad_(False)

        cond_dim = 1024 if self.config.dino_model == "dinov2_vitl14_reg" else 1536
        assert cond_dim == config.hidden_dim, "DINOv2 dim must match backbone dim"

        self.preprocess_cond_image = transforms.Compose(
            [
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

        # vae encoder
        vae_config = importlib.import_module(config.vae_conf).make_config()
        self.vae = VAE(vae_config).eval().to(dtype=self.precision)
        self.vae.requires_grad_(False)

        # load vae
        if self.config.preload_vae:
            try:
                vae_ckpt = torch.load(self.config.vae_ckpt_path, weights_only=True)  # local path
                if "model" in vae_ckpt:
                    vae_ckpt = vae_ckpt["model"]
                self.vae.load_state_dict(vae_ckpt, strict=True)
                del vae_ckpt
                print(f"Loaded VAE from {self.config.vae_ckpt_path}")
            except Exception as e:
                print(
                    f"Failed to load VAE from {self.config.vae_ckpt_path}: {e}, make sure you resumed from a valid checkpoint!"
                )

        # load info from vae config
        if config.latent_size is None:
            config.latent_size = self.vae.config.latent_size
        if config.latent_dim is None:
            config.latent_dim = self.vae.config.latent_dim

        # dit
        self.dit = DiT(
            hidden_dim=config.hidden_dim,
            num_heads=config.num_heads,
            num_layers=config.num_layers,
            latent_size=config.latent_size,
            latent_dim=config.latent_dim,
            qknorm=config.qknorm,
            qknorm_type=config.qknorm_type,
            use_pos_embed=config.use_pos_embed,
            use_parts=config.use_parts,
            part_embed_mode=config.part_embed_mode,
        )

        # num_part condition
        if self.config.use_num_parts_cond:
            assert self.config.use_parts, "use_num_parts_cond requires use_parts"
            self.num_part_embed = nn.Embedding(5, config.hidden_dim)

        # preload from a checkpoint (NOTE: this happens BEFORE checkpointer loading latest checkpoint!)
        if self.config.pretrain_path is not None:
            try:
                ckpt = torch.load(self.config.pretrain_path)  # local path
                self.load_state_dict(ckpt["model"], strict=True)
                del ckpt
                print(f"Loaded DiT from {self.config.pretrain_path}")
            except Exception as e:
                print(
                    f"Failed to load DiT from {self.config.pretrain_path}: {e}, make sure you resumed from a valid checkpoint!"
                )

        # sampler
        self.scheduler = FlowMatchingScheduler(shift=config.flow_shift)

        n_params = 0
        for p in self.dit.parameters():
            n_params += p.numel()
        print(f"Number of parameters in DiT: {n_params/1e6:.2f}M")

    # override state_dict to exclude vae and dino, so we only save the trainable params.
    def state_dict(self, *args, **kwargs):
        state_dict = super().state_dict(*args, **kwargs)

        keys_to_del = []
        for k in state_dict.keys():
            if "vae" in k or "dino" in k:
                keys_to_del.append(k)

        for k in keys_to_del:
            del state_dict[k]

        return state_dict

    # override to support tolerant loading (only load matched shape)
    def load_state_dict(self, state_dict, strict=True, assign=False):
        local_state_dict = self.state_dict()
        seen_keys = {k: False for k in local_state_dict.keys()}
        for k, v in state_dict.items():
            if k in local_state_dict:
                seen_keys[k] = True
                if local_state_dict[k].shape == v.shape:
                    local_state_dict[k].copy_(v)
                else:
                    print(f"mismatching shape for key {k}: loaded {local_state_dict[k].shape} but model has {v.shape}")
            else:
                print(f"unexpected key {k} in loaded state dict")
        for k in seen_keys:
            if not seen_keys[k]:
                print(f"missing key {k} in loaded state dict")

    # this happens before checkpointer loading old models !!!
    def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
        super().on_train_start(memory_format=memory_format)
        device = next(self.dit.parameters()).device

        self.dit.to(dtype=self.precision)

        if self.config.use_num_parts_cond:
            self.num_part_embed.to(dtype=self.precision)

        # cast scheduler to device
        self.scheduler.to(device)

    def get_cond(self, cond_image, num_part=None):
        # image condition
        cond_image = cond_image.to(dtype=self.precision)
        with torch.no_grad():
            cond = self.dino(cond_image).last_hidden_state
            cond = F.layer_norm(cond.float(), cond.shape[-1:]).to(dtype=self.precision)  # [B, L, C]

        # num_part condition
        if self.config.use_num_parts_cond:
            if num_part is None:
                # use a default value (2-10 parts)
                num_part_coarse = torch.ones(cond.shape[0], dtype=torch.int64, device=cond.device) * 2
            else:
                # coarse range
                num_part_coarse = torch.ones(cond.shape[0], dtype=torch.int64, device=cond.device)
                num_part_coarse[num_part == 2] = 1
                num_part_coarse[(num_part > 2) & (num_part <= 10)] = 2
                num_part_coarse[(num_part > 10) & (num_part <= 100)] = 3
                num_part_coarse[num_part > 100] = 4
            num_part_embed = self.num_part_embed(num_part_coarse).unsqueeze(1)  # [B, 1, C]
            cond = torch.cat([cond, num_part_embed], dim=1)  # [B, L+1, C]

        return cond

    def training_step(
        self,
        data: dict[str, torch.Tensor],
        iteration: int,
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        output = {}
        loss = 0

        cond_images = self.preprocess_cond_image(
            data["cond_images"]
        )  # [B, N, 3, 518, 518], we may load multiple (N) cond images for the same shape
        B, N, C, H, W = cond_images.shape

        if self.config.use_num_parts_cond:
            cond_num_part = data["num_part"].repeat_interleave(N, dim=0)
        else:
            cond_num_part = None

        cond = self.get_cond(cond_images.view(-1, C, H, W), cond_num_part)  # [B*N, L, C]

        # random CFG dropout
        if self.training:
            mask = torch.rand((B * N, 1, 1), device=cond.device, dtype=cond.dtype) >= 0.1
            cond = cond * mask

        with torch.no_grad():
            # encode latent
            if self.config.use_parts:
                # encode two parts and concat latent
                part0_data = {k.replace("_part0", ""): v for k, v in data.items() if "_part0" in k}
                part1_data = {k.replace("_part1", ""): v for k, v in data.items() if "_part1" in k}
                posterior0 = self.vae.encode(part0_data)
                posterior1 = self.vae.encode(part1_data)
                if self.training and self.config.shuffle_parts:
                    if np.random.rand() < 0.5:
                        posterior0, posterior1 = posterior1, posterior0
                latent = torch.cat(
                    [
                        posterior0.mode().float().nan_to_num_(0),
                        posterior1.mode().float().nan_to_num_(0),
                    ],
                    dim=1,
                )  # [B, 2L, C]
            else:
                posterior = self.vae.encode(data)
                latent = posterior.mode().float().nan_to_num_(0)  # use mean as the latent, [B, L, C]

            # repeat latent for each cond image
            if N != 1:
                latent = latent.repeat_interleave(N, dim=0)

        # random sample timesteps and add noise
        noisy_latent, noise, timesteps = self.scheduler.add_noise(
            latent, self.config.logitnorm_mean, self.config.logitnorm_std
        )

        noisy_latent = noisy_latent.to(dtype=self.precision)
        model_pred = self.dit(noisy_latent, cond, timesteps)

        # flow-matching loss
        target = noise - latent
        loss = F.mse_loss(model_pred.float(), target.float())

        # metrics
        with torch.no_grad():
            output["scalar"] = {}  # for wandb logging
            output["scalar"]["loss_mse"] = loss.detach()

        return output, loss

    @torch.no_grad()
    def validation_step(
        self,
        data: dict[str, torch.Tensor],
        iteration: int,
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        return self.training_step(data, iteration)

    @torch.inference_mode()
    @sync_timer("flow forward")
    def forward(
        self,
        data: dict[str, torch.Tensor],
        num_steps: int = 30,
        cfg_scale: float = 7.0,
        verbose: bool = True,
        generator: torch.Generator | None = None,
    ) -> dict[str, torch.Tensor]:
        # the inference sampling
        cond_images = self.preprocess_cond_image(data["cond_images"])  # [B, 3, 518, 518]
        B = cond_images.shape[0]
        assert B == 1, "Only support batch size 1 for now."

        # num_part condition
        if self.config.use_num_parts_cond and "num_part" in data:
            cond_num_part = data["num_part"]  # [B,], int
        else:
            cond_num_part = None

        cond = self.get_cond(cond_images, cond_num_part)

        if self.config.use_parts:
            x = torch.randn(
                B,
                self.config.latent_size * 2,
                self.config.latent_dim,
                device=cond.device,
                dtype=torch.float32,
                generator=generator,
            )
        else:
            x = torch.randn(
                B,
                self.config.latent_size,
                self.config.latent_dim,
                device=cond.device,
                dtype=torch.float32,
                generator=generator,
            )

        cond_input = torch.cat([cond, torch.zeros_like(cond)], dim=0)

        # flow-matching
        sigmas = np.linspace(1, 0, num_steps + 1)
        sigmas = self.scheduler.shift * sigmas / (1 + (self.scheduler.shift - 1) * sigmas)
        sigmas_pair = list((sigmas[i], sigmas[i + 1]) for i in range(num_steps))

        for sigma, sigma_prev in tqdm.tqdm(sigmas_pair, desc="Flow Sampling", disable=not verbose):
            # classifier-free guidance
            timesteps = torch.tensor([1000 * sigma] * B * 2, device=x.device, dtype=x.dtype)
            x_input = torch.cat([x, x], dim=0)

            # predict v
            x_input = x_input.to(dtype=self.precision)
            pred = self.dit(x_input, cond_input, timesteps).float()
            cond_v, uncond_v = pred.chunk(2, dim=0)
            pred_v = uncond_v + (cond_v - uncond_v) * cfg_scale

            # scheduler step
            x = x - (sigma - sigma_prev) * pred_v

        output = {}
        output["latent"] = x

        # leave mesh extraction to vae
        return output
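
The sampling loop in forward() is a plain Euler integrator over the shifted sigma schedule, with classifier-free guidance mixing the conditional and unconditional velocity predictions. A standalone sketch of just that update, using a dummy velocity function in place of the DiT so it runs without the repo weights (shapes and the shift value are illustrative):

```python
# sketch: the Euler + CFG update used above, with a stand-in velocity predictor
import numpy as np
import torch

shift, num_steps, cfg_scale = 3.0, 4, 7.0
sigmas = np.linspace(1, 0, num_steps + 1)
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)   # same time-shift as the scheduler

x = torch.randn(1, 8, 4)                               # [B, tokens, latent_dim]

def dummy_v(x_in, conditioned):                        # placeholder for self.dit(...)
    return x_in * (0.5 if conditioned else 0.3)

for i in range(num_steps):
    sigma, sigma_prev = sigmas[i], sigmas[i + 1]
    cond_v, uncond_v = dummy_v(x, True), dummy_v(x, False)
    pred_v = uncond_v + (cond_v - uncond_v) * cfg_scale   # classifier-free guidance
    x = x - (sigma - sigma_prev) * pred_v                 # Euler step toward sigma = 0
```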
flow/modules/__init__.py
ADDED
File without changes
flow/modules/dit.py
ADDED
@@ -0,0 +1,235 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint

from vae.modules.attention import CrossAttention, SelfAttention


class FeedForward(nn.Module):
    def __init__(self, dim, mult=4):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim * mult), nn.GELU(), nn.Linear(dim * mult, dim))

    def forward(self, x):
        return self.net(x)


# Adapted from https://github.com/facebookresearch/DiT/blob/main/models.py#L27
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        Args:
            t: a 1-D Tensor of N indices, one per batch element.
                These may be fractional.
            dim: the dimension of the output.
            max_period: controls the minimum frequency of the embeddings.

        Returns:
            an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(-np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
            device=t.device
        )
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        dtype = next(self.mlp.parameters()).dtype  # need to determine on the fly...
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_freq = t_freq.to(dtype=dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class DiTLayer(nn.Module):
    def __init__(self, dim, num_heads, qknorm=False, gradient_checkpointing=True, qknorm_type="LayerNorm"):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.gradient_checkpointing = gradient_checkpointing

        self.norm1 = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        self.attn1 = SelfAttention(dim, num_heads, qknorm=qknorm, qknorm_type=qknorm_type)
        self.norm2 = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        self.attn2 = CrossAttention(dim, num_heads, context_dim=dim, qknorm=qknorm, qknorm_type=qknorm_type)
        self.norm3 = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        self.ff = FeedForward(dim)
        self.adaln_linear = nn.Linear(dim, dim * 6, bias=True)

    def forward(self, x, c, t_emb):
        if self.training and self.gradient_checkpointing:
            return checkpoint(self._forward, x, c, t_emb, use_reentrant=False)
        else:
            return self._forward(x, c, t_emb)

    def _forward(self, x, c, t_emb):
        # x: [B, N, C], hidden states
        # c: [B, M, C], condition (assume normed and projected to C)
        # t_emb: [B, C], timestep embedding of adaln
        # return: [B, N, C], updated hidden states

        B, N, C = x.shape
        t_adaln = self.adaln_linear(F.silu(t_emb)).view(B, 6, -1)  # [B, 6, C]
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = t_adaln.chunk(6, dim=1)

        h = self.norm1(x)
        h = h * (1 + scale_msa) + shift_msa
        x = x + gate_msa * self.attn1(h)

        h = self.norm2(x)
        x = x + self.attn2(h, c)

        h = self.norm3(x)
        h = h * (1 + scale_mlp) + shift_mlp
        x = x + gate_mlp * self.ff(h)

        return x


class DiT(nn.Module):
    def __init__(
        self,
        hidden_dim=1024,
        num_heads=16,
        latent_size=2048,
        latent_dim=8,
        num_layers=24,
        qknorm=False,
        gradient_checkpointing=True,
        qknorm_type="LayerNorm",
        use_pos_embed=False,
        use_parts=False,
        part_embed_mode="part2_only",
    ):
        super().__init__()

        # project in
        self.proj_in = nn.Linear(latent_dim, hidden_dim)

        # positional encoding (just use a learnable positional encoding)
        self.use_pos_embed = use_pos_embed
        if self.use_pos_embed:
            self.pos_embed = nn.Parameter(torch.randn(1, latent_size, hidden_dim) / hidden_dim**0.5)

        # part encoding (a must to distinguish parts!)
        self.use_parts = use_parts
        self.part_embed_mode = part_embed_mode
        if self.use_parts:
            if self.part_embed_mode == "element":
                self.part_embed = nn.Parameter(torch.randn(latent_size, hidden_dim) / hidden_dim**0.5)
            elif self.part_embed_mode == "part":
                self.part_embed = nn.Parameter(torch.randn(2, hidden_dim))
            elif self.part_embed_mode == "part2_only":
                # we only add this to the second part to distinguish from the first part
                self.part_embed = nn.Parameter(torch.randn(1, hidden_dim) / hidden_dim**0.5)

        # timestep encoding
        self.timestep_embed = TimestepEmbedder(hidden_dim)

        # transformer layers
        self.layers = nn.ModuleList(
            [DiTLayer(hidden_dim, num_heads, qknorm, gradient_checkpointing, qknorm_type) for _ in range(num_layers)]
        )

        # project out
        self.norm_out = nn.LayerNorm(hidden_dim, eps=1e-6, elementwise_affine=False)
        self.proj_out = nn.Linear(hidden_dim, latent_dim)

        # init
        self.init_weight()

    def init_weight(self):
        # Initialize transformer layers
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.timestep_embed.mlp[0].weight, std=0.02)
        nn.init.normal_(self.timestep_embed.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for layer in self.layers:
            nn.init.constant_(layer.adaln_linear.weight, 0)
            nn.init.constant_(layer.adaln_linear.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.proj_out.weight, 0)
        nn.init.constant_(self.proj_out.bias, 0)

    def forward(self, x, c, t):
        # x: [B, N, C], hidden states
        # c: [B, M, C], condition (assume normed and projected to C)
        # t: [B,], timestep
        # return: [B, N, C], updated hidden states

        B, N, C = x.shape

        # project in
        x = self.proj_in(x)

        # positional encoding
        if self.use_pos_embed:
            x = x + self.pos_embed

        # part encoding
        if self.use_parts:
            if self.part_embed_mode == "element":
                x += self.part_embed
            elif self.part_embed_mode == "part":
                x[:, : x.shape[1] // 2, :] += self.part_embed[0]
                x[:, x.shape[1] // 2 :, :] += self.part_embed[1]
            elif self.part_embed_mode == "part2_only":
                x[:, x.shape[1] // 2 :, :] += self.part_embed[0]

        # timestep encoding
        t_emb = self.timestep_embed(t)  # [B, C]

        # transformer layers
        for layer in self.layers:
            x = layer(x, c, t_emb)

        # project out
        x = self.norm_out(x)
        x = self.proj_out(x)

        return x
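
A tiny shape smoke test for the DiT above; the dimensions are illustrative (not the released config) and the attention blocks come from the repo's vae.modules.attention, so the repo must be on PYTHONPATH:

```python
# sketch: check that DiT maps [B, N, latent_dim] -> [B, N, latent_dim]
import torch
from flow.modules.dit import DiT

dit = DiT(hidden_dim=64, num_heads=4, latent_size=32, latent_dim=8, num_layers=2).eval()
x = torch.randn(1, 32, 8)        # noisy latent tokens
c = torch.randn(1, 10, 64)       # conditioning tokens, already projected to hidden_dim
t = torch.rand(1) * 1000         # one timestep per batch element
print(dit(x, c, t).shape)        # torch.Size([1, 32, 8])
```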
flow/scripts/infer.py
ADDED
@@ -0,0 +1,180 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import argparse
import glob
import importlib
import os
from datetime import datetime

import cv2
import kiui
import numpy as np
import rembg
import torch
import trimesh

from flow.model import Model
from flow.utils import get_random_color, recenter_foreground
from vae.utils import postprocess_mesh

# PYTHONPATH=. python flow/scripts/infer.py
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config",
    type=str,
    help="config file path",
    default="flow.configs.big_parts_strict_pvae",
)
parser.add_argument(
    "--ckpt_path",
    type=str,
    help="checkpoint path",
    default="pretrained/flow.pt",
)
parser.add_argument("--input", type=str, help="input directory", default="assets/images/")
parser.add_argument("--limit", type=int, help="limit number of images", default=-1)
parser.add_argument("--output_dir", type=str, help="output directory", default="output/")
parser.add_argument("--grid_res", type=int, help="grid resolution", default=384)
parser.add_argument("--num_steps", type=int, help="number of cfg steps", default=30)
parser.add_argument("--cfg_scale", type=float, help="cfg scale", default=7.0)
parser.add_argument("--num_repeats", type=int, help="number of repeats per image", default=1)
parser.add_argument("--seed", type=int, help="seed", default=42)
args = parser.parse_args()

TRIMESH_GLB_EXPORT = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]]).astype(np.float32)

bg_remover = rembg.new_session()


def preprocess_image(path):
    input_image = kiui.read_image(path, mode="uint8", order="RGBA")

    # bg removal if there is no alpha channel
    if input_image.shape[-1] == 3:
        input_image = rembg.remove(input_image, session=bg_remover)  # [H, W, 4]

    mask = input_image[..., -1] > 0
    image = recenter_foreground(input_image, mask, border_ratio=0.1)
    image = cv2.resize(image, (518, 518), interpolation=cv2.INTER_LINEAR)
    image = image.astype(np.float32) / 255.0
    image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])  # white background
    return image


print(f"Loading checkpoint from {args.ckpt_path}")
ckpt_dict = torch.load(args.ckpt_path, weights_only=True)

# delete all keys other than model
if "model" in ckpt_dict:
    ckpt_dict = ckpt_dict["model"]

# instantiate model
print(f"Instantiating model from {args.config}")
model_config = importlib.import_module(args.config).make_config()
model = Model(model_config).eval().cuda().bfloat16()

# load weight
print(f"Loading weights from {args.ckpt_path}")
model.load_state_dict(ckpt_dict, strict=True)

# output folder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
workspace = os.path.join(args.output_dir, "flow_" + args.config.split(".")[-1] + "_" + timestamp)
if not os.path.exists(workspace):
    os.makedirs(workspace)
else:
    os.system(f"rm {workspace}/*")
print(f"Output directory: {workspace}")

# load test images
if os.path.isdir(args.input):
    paths = glob.glob(os.path.join(args.input, "*"))
    paths = sorted(paths)
    if args.limit > 0:
        paths = paths[: args.limit]
else:  # single file
    paths = [args.input]

for path in paths:
    name = os.path.splitext(os.path.basename(path))[0]
    print(f"Processing {name}")

    image = preprocess_image(path)

    kiui.write_image(os.path.join(workspace, name + ".jpg"), image)
    image = torch.from_numpy(image).permute(2, 0, 1).contiguous().unsqueeze(0).float().cuda()

    # run model
    data = {"cond_images": image}

    for i in range(args.num_repeats):

        kiui.seed_everything(args.seed + i)

        with torch.inference_mode():
            results = model(data, num_steps=args.num_steps, cfg_scale=args.cfg_scale)

        latent = results["latent"]
        # kiui.lo(latent)

        # query mesh
        if model.config.use_parts:
            data_part0 = {"latent": latent[:, : model.config.latent_size, :]}
            data_part1 = {"latent": latent[:, model.config.latent_size :, :]}

            with torch.inference_mode():
                results_part0 = model.vae(data_part0, resolution=args.grid_res)
                results_part1 = model.vae(data_part1, resolution=args.grid_res)

            vertices, faces = results_part0["meshes"][0]
            mesh_part0 = trimesh.Trimesh(vertices, faces)
            mesh_part0.vertices = mesh_part0.vertices @ TRIMESH_GLB_EXPORT.T
            mesh_part0 = postprocess_mesh(mesh_part0, 5e4)
            parts = mesh_part0.split(only_watertight=False)

            vertices, faces = results_part1["meshes"][0]
            mesh_part1 = trimesh.Trimesh(vertices, faces)
            mesh_part1.vertices = mesh_part1.vertices @ TRIMESH_GLB_EXPORT.T
            mesh_part1 = postprocess_mesh(mesh_part1, 5e4)
            parts.extend(mesh_part1.split(only_watertight=False))

            # split connected components and assign different colors
            for j, part in enumerate(parts):
                # each component uses a random color
                part.visual.vertex_colors = get_random_color(j, use_float=True)

            mesh = trimesh.Scene(parts)
            # export the whole mesh
            mesh.export(os.path.join(workspace, name + "_" + str(i) + ".glb"))

            # export each part
            for j, part in enumerate(parts):
                part.export(os.path.join(workspace, name + "_" + str(i) + "_part" + str(j) + ".glb"))

            # export dual volumes
            mesh_part0.export(os.path.join(workspace, name + "_" + str(i) + "_vol0.glb"))
            mesh_part1.export(os.path.join(workspace, name + "_" + str(i) + "_vol1.glb"))

        else:
            data = {"latent": latent}

            with torch.inference_mode():
                results = model.vae(data, resolution=args.grid_res)

            vertices, faces = results["meshes"][0]
            mesh = trimesh.Trimesh(vertices, faces)
            mesh = postprocess_mesh(mesh, 5e4)

            # kiui.lo(mesh.vertices, mesh.faces)
            mesh.vertices = mesh.vertices @ TRIMESH_GLB_EXPORT.T
            mesh.export(os.path.join(workspace, name + "_" + str(i) + ".glb"))
flow/utils.py
ADDED
@@ -0,0 +1,119 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from typing import Optional

import cv2
import numpy as np


def recenter_foreground(image, mask, border_ratio: float = 0.1):
    """recenter an image to leave some empty space at the image border.

    Args:
        image (ndarray): input image, float/uint8 [H, W, 3/4]
        mask (ndarray): alpha mask, bool [H, W]
        border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.1.

    Returns:
        ndarray: output image, float/uint8 [H, W, 3/4]
    """

    # empty foreground: just return
    if mask.sum() == 0:
        return image

    return_int = False
    if image.dtype == np.uint8:
        image = image.astype(np.float32) / 255
        return_int = True

    H, W, C = image.shape
    size = max(H, W)

    # default to white bg if rgb, but use 0 if rgba
    if C == 3:
        result = np.ones((size, size, C), dtype=np.float32)
    else:
        result = np.zeros((size, size, C), dtype=np.float32)

    coords = np.nonzero(mask)
    x_min, x_max = coords[0].min(), coords[0].max()
    y_min, y_max = coords[1].min(), coords[1].max()
    h = x_max - x_min
    w = y_max - y_min
    desired_size = int(size * (1 - border_ratio))
    scale = desired_size / max(h, w)
    h2 = int(h * scale)
    w2 = int(w * scale)
    x2_min = (size - h2) // 2
    x2_max = x2_min + h2
    y2_min = (size - w2) // 2
    y2_max = y2_min + w2
    result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(
        image[x_min:x_max, y_min:y_max], (w2, h2), interpolation=cv2.INTER_AREA
    )

    if return_int:
        result = (result * 255).astype(np.uint8)

    return result


def get_random_color(index: Optional[int] = None, use_float: bool = False):
    # some pleasing colors
    # matplotlib.colormaps['Set3'].colors + matplotlib.colormaps['Set2'].colors + matplotlib.colormaps['Set1'].colors
    palette = np.array(
        [
            [141, 211, 199, 255],
            [255, 255, 179, 255],
            [190, 186, 218, 255],
            [251, 128, 114, 255],
            [128, 177, 211, 255],
            [253, 180, 98, 255],
            [179, 222, 105, 255],
            [252, 205, 229, 255],
            [217, 217, 217, 255],
            [188, 128, 189, 255],
            [204, 235, 197, 255],
            [255, 237, 111, 255],
            [102, 194, 165, 255],
            [252, 141, 98, 255],
            [141, 160, 203, 255],
            [231, 138, 195, 255],
            [166, 216, 84, 255],
            [255, 217, 47, 255],
            [229, 196, 148, 255],
            [179, 179, 179, 255],
            [228, 26, 28, 255],
            [55, 126, 184, 255],
            [77, 175, 74, 255],
            [152, 78, 163, 255],
            [255, 127, 0, 255],
            [255, 255, 51, 255],
            [166, 86, 40, 255],
            [247, 129, 191, 255],
            [153, 153, 153, 255],
        ],
        dtype=np.uint8,
    )

    if index is None:
        index = np.random.randint(0, len(palette))

    if index >= len(palette):
        index = index % len(palette)

    if use_float:
        return palette[index].astype(np.float32) / 255
    else:
        return palette[index]
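
A short usage sketch for the two helpers above, on a synthetic RGBA image (numpy only; sizes are arbitrary, and the repo must be importable):

```python
# sketch: recenter a foreground and fetch a deterministic palette color
import numpy as np
from flow.utils import get_random_color, recenter_foreground

img = np.zeros((64, 48, 4), dtype=np.uint8)
img[20:40, 10:30] = 255                                        # an opaque white rectangle
mask = img[..., -1] > 0
centered = recenter_foreground(img, mask, border_ratio=0.1)    # square canvas with a 10% border
color = get_random_color(3, use_float=True)                    # RGBA in [0, 1]; index wraps around the palette
print(centered.shape, color)
```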
requirements.txt
ADDED
@@ -0,0 +1,16 @@
torch==2.5.1 # should be installed by HF
torchvision==0.20.1
numpy
trimesh
# meshiki # we don't use it for flow inference
fpsample
einops
onnxruntime
rembg
kiui
pymcubes
tqdm
opencv-python
ninja
pymeshlab
transformers
vae/__init__.py
ADDED
File without changes

vae/configs/__init__.py
ADDED
File without changes
vae/configs/part_woenc.py
ADDED
@@ -0,0 +1,30 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from vae.configs.schema import ModelConfig


def make_config():

    model_config = ModelConfig(
        use_salient_point=True,
        latent_size=4096,
        cutoff_fps_point=(256, 512, 512, 512, 1024, 1024, 2048),
        cutoff_fps_salient_point=(0, 0, 256, 512, 512, 1024, 2048),
        cutoff_fps_prob=(0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.2),
        kl_weight=1e-3,
        salient_attn_mode="dual",
        num_enc_layers=0,
        num_dec_layers=24,
    )

    return model_config
vae/configs/schema.py
ADDED
@@ -0,0 +1,55 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from typing import Literal, Optional, Tuple

import attrs


@attrs.define(slots=False)
class ModelConfig:
    # input
    use_salient_point: bool = True

    # random cutoff during training
    cutoff_fps_point: Tuple[int, ...] = (256, 512, 512, 512, 1024, 1024, 2048)
    cutoff_fps_salient_point: Tuple[int, ...] = (0, 0, 256, 512, 512, 1024, 2048)
    cutoff_fps_prob: Tuple[float, ...] = (0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.2)  # sum to 1.0

    # backbone transformer
    num_enc_layers: int = 0
    hidden_dim: int = 1024
    num_heads: int = 16
    num_dec_layers: int = 24
    dec_hidden_dim: int = 1024
    dec_num_heads: int = 16
    qknorm: bool = True
    qknorm_type: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"  # type of qknorm
    salient_attn_mode: Literal["dual_shared", "single", "dual"] = "dual"

    # query decoder
    fourier_version: Literal["v1", "v2", "v3"] = "v3"
    point_fourier_dim: int = 48  # must be divisible by 6 (sin/cos, x/y/z)
    query_hidden_dim: int = 1024
    query_num_heads: int = 16
    use_flash_query: bool = False

    # latent code
    latent_size: int = 4096  # == num_fps_point + num_fps_salient_point
    latent_dim: int = 64

    # loss
    use_ae: bool = False  # if true, variance will be ignored, and kl_weight is used as a L2 norm weight
    kl_weight: float = 1e-3

    # init weights from a pretrained checkpoint
    pretrain_path: Optional[str] = None
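Because ModelConfig is an attrs class, variants can be derived without editing the config files. A brief sketch using attrs.evolve; the override values below are illustrative, not a released configuration.

```python
import attrs

from vae.configs.part_woenc import make_config

base = make_config()

# illustrative override: a smaller decoder and plain LayerNorm qk-norm
small = attrs.evolve(base, num_dec_layers=12, dec_hidden_dim=512, qknorm_type="LayerNorm")

print(small.num_dec_layers, small.dec_hidden_dim)  # 12 512
```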
vae/model.py
ADDED
@@ -0,0 +1,451 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

from typing import Literal

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from vae.configs.schema import ModelConfig
from vae.modules.transformer import AttentionBlock, FlashQueryLayer
from vae.utils import (
    DiagonalGaussianDistribution,
    DummyLatent,
    calculate_iou,
    calculate_metrics,
    construct_grid_points,
    extract_mesh,
    sync_timer,
)


class Model(nn.Module):
    def __init__(self, config: ModelConfig) -> None:
        super().__init__()
        self.config = config

        self.precision = torch.bfloat16  # manually handle low-precision training, always use bf16

        # point encoder
        self.proj_input = nn.Linear(3 + config.point_fourier_dim, config.hidden_dim)

        self.perceiver = AttentionBlock(
            config.hidden_dim,
            num_heads=config.num_heads,
            dim_context=config.hidden_dim,
            qknorm=config.qknorm,
            qknorm_type=config.qknorm_type,
        )

        if self.config.salient_attn_mode == "dual":
            self.perceiver_dorases = AttentionBlock(
                config.hidden_dim,
                num_heads=config.num_heads,
                dim_context=config.hidden_dim,
                qknorm=config.qknorm,
                qknorm_type=config.qknorm_type,
            )

        # self-attention encoder
        self.encoder = nn.ModuleList(
            [
                AttentionBlock(
                    config.hidden_dim, config.num_heads, qknorm=config.qknorm, qknorm_type=config.qknorm_type
                )
                for _ in range(config.num_enc_layers)
            ]
        )

        # vae bottleneck
        self.norm_down = nn.LayerNorm(config.hidden_dim)
        self.proj_down_mean = nn.Linear(config.hidden_dim, config.latent_dim)
        if not self.config.use_ae:
            self.proj_down_std = nn.Linear(config.hidden_dim, config.latent_dim)
        self.proj_up = nn.Linear(config.latent_dim, config.dec_hidden_dim)

        # self-attention decoder
        self.decoder = nn.ModuleList(
            [
                AttentionBlock(
                    config.dec_hidden_dim, config.dec_num_heads, qknorm=config.qknorm, qknorm_type=config.qknorm_type
                )
                for _ in range(config.num_dec_layers)
            ]
        )

        # cross-attention query
        self.proj_query = nn.Linear(3 + config.point_fourier_dim, config.query_hidden_dim)
        if self.config.use_flash_query:
            self.norm_query_context = nn.LayerNorm(config.hidden_dim, eps=1e-6, elementwise_affine=False)
            self.attn_query = FlashQueryLayer(
                config.query_hidden_dim,
                num_heads=config.query_num_heads,
                dim_context=config.hidden_dim,
                qknorm=config.qknorm,
                qknorm_type=config.qknorm_type,
            )
        else:
            self.attn_query = AttentionBlock(
                config.query_hidden_dim,
                num_heads=config.query_num_heads,
                dim_context=config.hidden_dim,
                qknorm=config.qknorm,
                qknorm_type=config.qknorm_type,
            )
        self.norm_out = nn.LayerNorm(config.query_hidden_dim)
        self.proj_out = nn.Linear(config.query_hidden_dim, 1)

        # preload from a checkpoint (NOTE: this happens BEFORE checkpointer loading latest checkpoint!)
        if self.config.pretrain_path is not None:
            try:
                ckpt = torch.load(self.config.pretrain_path)  # local path
                self.load_state_dict(ckpt["model"], strict=True)
                del ckpt
                print(f"Loaded VAE from {self.config.pretrain_path}")
            except Exception as e:
                print(
                    f"Failed to load VAE from {self.config.pretrain_path}: {e}, make sure you resumed from a valid checkpoint!"
                )

        # log
        n_params = 0
        for p in self.parameters():
            n_params += p.numel()
        print(f"Number of parameters in VAE: {n_params / 1e6:.2f}M")

    # override to support tolerant loading (only load matched shape)
    def load_state_dict(self, state_dict, strict=True, assign=False):
        local_state_dict = self.state_dict()
        seen_keys = {k: False for k in local_state_dict.keys()}
        for k, v in state_dict.items():
            if k in local_state_dict:
                seen_keys[k] = True
                if local_state_dict[k].shape == v.shape:
                    local_state_dict[k].copy_(v)
                else:
                    print(f"mismatching shape for key {k}: loaded {local_state_dict[k].shape} but model has {v.shape}")
            else:
                print(f"unexpected key {k} in loaded state dict")
        for k in seen_keys:
            if not seen_keys[k]:
                print(f"missing key {k} in loaded state dict")

    def fourier_encoding(self, points: torch.Tensor):
        # points: [B, N, 3], float32 for precision
        # assert points.dtype == torch.float32, "Query points must be float32"

        F = self.config.point_fourier_dim // (2 * points.shape[-1])

        if self.config.fourier_version == "v1":  # default
            exponent = torch.arange(1, F + 1, device=points.device, dtype=torch.float32) / F  # [F], range from 0 to 1
            freq_band = 512**exponent  # [F], min frequency is 1, max frequency is 1/freq
            freq_band *= torch.pi
        elif self.config.fourier_version == "v2":
            exponent = torch.arange(F, device=points.device, dtype=torch.float32) / (F - 1)  # [F], range from 0 to 1
            freq_band = 1024**exponent  # [F]
            freq_band *= torch.pi
        elif self.config.fourier_version == "v3":  # hunyuan3d-2
            freq_band = 2 ** torch.arange(F, device=points.device, dtype=torch.float32)  # [F]

        spectrum = points.unsqueeze(-1) * freq_band  # [B,...,3,F]
        sin, cos = spectrum.sin(), spectrum.cos()  # [B,...,3,F]
        input_enc = torch.stack([sin, cos], dim=-2)  # [B,...,3,2,F]
        input_enc = input_enc.view(*points.shape[:-1], -1)  # [B,...,6F] = [B,...,dim]
        return torch.cat([input_enc, points], dim=-1).to(dtype=self.precision)  # [B,...,dim+input_dim]

    def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
        super().on_train_start(memory_format=memory_format)
        self.to(dtype=self.precision, memory_format=memory_format)  # use bfloat16 for training

    def encode(self, data: dict[str, torch.Tensor]):
        # uniform points
        pointcloud = data["pointcloud"]  # [B, N, 3]

        # fourier embed and project
        pointcloud = self.fourier_encoding(pointcloud)  # [B, N, 3+C]
        pointcloud = self.proj_input(pointcloud)  # [B, N, hidden_dim]

        # salient points
        if self.config.use_salient_point:
            pointcloud_dorases = data["pointcloud_dorases"]  # [B, M, 3]

            # fourier embed and project (shared weights)
            pointcloud_dorases = self.fourier_encoding(pointcloud_dorases)  # [B, M, 3+C]
            pointcloud_dorases = self.proj_input(pointcloud_dorases)  # [B, M, hidden_dim]

        # gather fps point
        fps_indices = data["fps_indices"]  # [B, N']
        pointcloud_query = torch.gather(pointcloud, 1, fps_indices.unsqueeze(-1).expand(-1, -1, pointcloud.shape[-1]))

        if self.config.use_salient_point:
            fps_indices_dorases = data["fps_indices_dorases"]  # [B, M']

            if fps_indices_dorases.shape[1] > 0:
                pointcloud_query_dorases = torch.gather(
                    pointcloud_dorases,
                    1,
                    fps_indices_dorases.unsqueeze(-1).expand(-1, -1, pointcloud_dorases.shape[-1]),
                )

                # combine both fps points as the query
                pointcloud_query = torch.cat(
                    [pointcloud_query, pointcloud_query_dorases], dim=1
                )  # [B, N'+M', hidden_dim]

            # dual cross-attention
            if self.config.salient_attn_mode == "dual_shared":
                hidden_states = self.perceiver(pointcloud_query, pointcloud) + self.perceiver(
                    pointcloud_query, pointcloud_dorases
                )  # [B, N'+M', hidden_dim]
            elif self.config.salient_attn_mode == "dual":
                hidden_states = self.perceiver(pointcloud_query, pointcloud) + self.perceiver_dorases(
                    pointcloud_query, pointcloud_dorases
                )
            else:  # single, hunyuan3d-2 style
                hidden_states = self.perceiver(pointcloud_query, torch.cat([pointcloud, pointcloud_dorases], dim=1))
        else:
            hidden_states = self.perceiver(pointcloud_query, pointcloud)  # [B, N', hidden_dim]

        # encoder
        for block in self.encoder:
            hidden_states = block(hidden_states)

        # bottleneck
        hidden_states = self.norm_down(hidden_states)
        latent_mean = self.proj_down_mean(hidden_states).float()
        if not self.config.use_ae:
            latent_std = self.proj_down_std(hidden_states).float()
            posterior = DiagonalGaussianDistribution(latent_mean, latent_std)
        else:
            posterior = DummyLatent(latent_mean)

        return posterior

    def decode(self, latent: torch.Tensor):
        latent = latent.to(dtype=self.precision)
        hidden_states = self.proj_up(latent)

        for block in self.decoder:
            hidden_states = block(hidden_states)

        return hidden_states

    def query(self, query_points: torch.Tensor, hidden_states: torch.Tensor):
        # query_points: [B, N, 3], float32 to keep the precision

        query_points = self.fourier_encoding(query_points)  # [B, N, 3+C]
        query_points = self.proj_query(query_points)  # [B, N, hidden_dim]

        # cross attention
        query_output = self.attn_query(query_points, hidden_states)  # [B, N, hidden_dim]

        # output linear
        query_output = self.norm_out(query_output)
        pred = self.proj_out(query_output)  # [B, N, 1]

        return pred

    def training_step(
        self,
        data: dict[str, torch.Tensor],
        iteration: int,
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        output = {}

        # cut off fps point during training for progressive flow
        if self.training:
            # randomly choose from a set of cutoff candidates
            cutoff_index = np.random.choice(len(self.config.cutoff_fps_prob), p=self.config.cutoff_fps_prob)
            cutoff_fps_point = self.config.cutoff_fps_point[cutoff_index]
            cutoff_fps_salient_point = self.config.cutoff_fps_salient_point[cutoff_index]
            # prefix of FPS points are still FPS points
            data["fps_indices"] = data["fps_indices"][:, :cutoff_fps_point]
            if self.config.use_salient_point:
                data["fps_indices_dorases"] = data["fps_indices_dorases"][:, :cutoff_fps_salient_point]

        loss = 0

        # encode
        posterior = self.encode(data)
        latent_geom = posterior.sample() if self.training else posterior.mode()

        # decode
        hidden_states = self.decode(latent_geom)

        # cross-attention query
        query_points = data["query_points"]  # [B, N, 3], float32

        # the context norm can be moved out to avoid repeated computation
        if self.config.use_flash_query:
            hidden_states = self.norm_query_context(hidden_states)

        pred = self.query(query_points, hidden_states).squeeze(-1).float()  # [B, N]
        gt = data["query_gt"].float()  # [B, N], in [-1, 1]

        # main loss
        loss_mse = F.mse_loss(pred, gt, reduction="mean")
        loss += loss_mse

        loss_l1 = F.l1_loss(pred, gt, reduction="mean")
        loss += loss_l1

        # kl loss
        loss_kl = posterior.kl().mean()
        loss += self.config.kl_weight * loss_kl

        # metrics
        with torch.no_grad():
            output["scalar"] = {}  # for wandb logging
            output["scalar"]["loss_mse"] = loss_mse.detach()
            output["scalar"]["loss_l1"] = loss_l1.detach()
            output["scalar"]["loss_kl"] = loss_kl.detach()
            output["scalar"]["iou_fg"] = calculate_iou(pred, gt, target_value=1)
            output["scalar"]["iou_bg"] = calculate_iou(pred, gt, target_value=0)
            output["scalar"]["precision"], output["scalar"]["recall"], output["scalar"]["f1"] = calculate_metrics(
                pred, gt, target_value=1
            )

        return output, loss

    @torch.no_grad()
    def validation_step(
        self,
        data: dict[str, torch.Tensor],
        iteration: int,
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        return self.training_step(data, iteration)

    @torch.inference_mode()
    @sync_timer("vae forward")
    def forward(
        self,
        data: dict[str, torch.Tensor],
        mode: Literal["dense", "hierarchical"] = "hierarchical",
        max_samples_per_iter: int = 512**2,
        resolution: int = 512,
        min_resolution: int = 64,  # for hierarchical
    ) -> dict[str, torch.Tensor]:
        output = {}

        # encode
        if "latent" in data:
            latent = data["latent"]
        else:
            posterior = self.encode(data)
            output["posterior"] = posterior
            latent = posterior.mode()

        output["latent"] = latent
        B = latent.shape[0]

        # decode
        hidden_states = self.decode(latent)
        output["hidden_states"] = hidden_states  # [B, N, hidden_dim] for the last cross-attention decoder

        # the context norm can be moved out to avoid repeated computation
        if self.config.use_flash_query:
            hidden_states = self.norm_query_context(hidden_states)

        # query
        def chunked_query(grid_points):
            if grid_points.shape[0] <= max_samples_per_iter:
                return self.query(grid_points.unsqueeze(0), hidden_states).squeeze(-1)  # [B, N]
            all_pred = []
            for i in range(0, grid_points.shape[0], max_samples_per_iter):
                grid_chunk = grid_points[i : i + max_samples_per_iter]
                pred_chunk = self.query(grid_chunk.unsqueeze(0), hidden_states)
                all_pred.append(pred_chunk)
            return torch.cat(all_pred, dim=1).squeeze(-1)  # [B, N]

        if mode == "dense":
            grid_points = construct_grid_points(resolution).to(latent.device)
            grid_points = grid_points.contiguous().view(-1, 3)
            grid_vals = chunked_query(grid_points).float().view(B, resolution + 1, resolution + 1, resolution + 1)

        elif mode == "hierarchical":
            assert resolution >= min_resolution, "Resolution must be greater than or equal to min_resolution"
            assert B == 1, "Only one batch is supported for hierarchical mode"

            resolutions = []
            res = resolution
            while res >= min_resolution:
                resolutions.append(res)
                res = res // 2
            resolutions.reverse()  # e.g., [64, 128, 256, 512]

            # dense-query the coarsest resolution
            res = resolutions[0]
            grid_points = construct_grid_points(res).to(latent.device)
            grid_points = grid_points.contiguous().view(-1, 3)
            grid_vals = chunked_query(grid_points).float().view(res + 1, res + 1, res + 1)

            # sparse-query finer resolutions
            dilate_kernel_3 = torch.ones(1, 1, 3, 3, 3, dtype=torch.float32, device=latent.device)
            dilate_kernel_5 = torch.ones(1, 1, 5, 5, 5, dtype=torch.float32, device=latent.device)
            for i in range(1, len(resolutions)):
                res = resolutions[i]
                # get the boundary grid mask in the coarser grid (where the grid_vals have different signs with at least one of its neighbors)
                grid_signs = grid_vals >= 0
                mask = torch.zeros_like(grid_signs)
                mask[1:, :, :] += grid_signs[1:, :, :] != grid_signs[:-1, :, :]
                mask[:-1, :, :] += grid_signs[:-1, :, :] != grid_signs[1:, :, :]
                mask[:, 1:, :] += grid_signs[:, 1:, :] != grid_signs[:, :-1, :]
                mask[:, :-1, :] += grid_signs[:, :-1, :] != grid_signs[:, 1:, :]
                mask[:, :, 1:] += grid_signs[:, :, 1:] != grid_signs[:, :, :-1]
                mask[:, :, :-1] += grid_signs[:, :, :-1] != grid_signs[:, :, 1:]
                # empirical: also add those with abs(grid_vals) < 0.95
                mask += grid_vals.abs() < 0.95
                mask = (mask > 0).float()
                # empirical: dilate the coarse mask
                if res < 512:
                    mask = mask.unsqueeze(0).unsqueeze(0)
                    mask = F.conv3d(mask, weight=dilate_kernel_3, padding=1)
                    mask = mask.squeeze(0).squeeze(0)
                # get the coarse coordinates
                cidx_x, cidx_y, cidx_z = torch.nonzero(mask, as_tuple=True)
                # fill to the fine indices
                mask_fine = torch.zeros(res + 1, res + 1, res + 1, dtype=torch.float32, device=latent.device)
                mask_fine[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
                # empirical: dilate the fine mask
                if res < 512:
                    mask_fine = mask_fine.unsqueeze(0).unsqueeze(0)
                    mask_fine = F.conv3d(mask_fine, weight=dilate_kernel_3, padding=1)
                    mask_fine = mask_fine.squeeze(0).squeeze(0)
                else:
                    mask_fine = mask_fine.unsqueeze(0).unsqueeze(0)
                    mask_fine = F.conv3d(mask_fine, weight=dilate_kernel_5, padding=2)
                    mask_fine = mask_fine.squeeze(0).squeeze(0)
                # get the fine coordinates
                fidx_x, fidx_y, fidx_z = torch.nonzero(mask_fine, as_tuple=True)
                # convert to float query points
                query_points = torch.stack([fidx_x, fidx_y, fidx_z], dim=-1)  # [N, 3]
                query_points = query_points * 2 / res - 1  # [N, 3], in [-1, 1]
                # query
                pred = chunked_query(query_points).float()
                # fill to the fine indices
                grid_vals = torch.full((res + 1, res + 1, res + 1), -100.0, dtype=torch.float32, device=latent.device)
                grid_vals[fidx_x, fidx_y, fidx_z] = pred
                # print(f"[INFO] hierarchical: resolution: {res}, valid coarse points: {len(cidx_x)}, valid fine points: {len(fidx_x)}")

            grid_vals = grid_vals.unsqueeze(0)  # [1, res+1, res+1, res+1]
            grid_vals[grid_vals <= -100.0] = float("nan")  # use nans to ignore invalid regions

        # extract mesh
        meshes = []
        for b in range(B):
            vertices, faces = extract_mesh(grid_vals[b], resolution)
            meshes.append((vertices, faces))
        output["meshes"] = meshes

        return output
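A usage sketch for the two query modes of Model.forward: decoding a latent densely at a low resolution, then hierarchically at full resolution. The latent tensor here is random noise purely to illustrate the expected shape (batch 1, latent_size x latent_dim from the config); real latents come from encode() or from the flow model.

```python
import torch

from vae.configs.part_woenc import make_config
from vae.model import Model

model = Model(make_config()).eval().cuda().bfloat16()

# shape-only illustration: [B, latent_size, latent_dim]; a real latent comes from model.encode(...)
latent = torch.randn(1, 4096, 64, device="cuda")

with torch.inference_mode():
    out_coarse = model({"latent": latent}, mode="dense", resolution=128)
    out_fine = model({"latent": latent}, mode="hierarchical", resolution=512, min_resolution=64)

vertices, faces = out_fine["meshes"][0]  # numpy arrays from marching cubes
print(vertices.shape, faces.shape)
```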
vae/modules/__init__.py
ADDED
File without changes
vae/modules/attention.py
ADDED
@@ -0,0 +1,261 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

try:
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import (  # , unpad_input # noqa
        index_first_axis,
        pad_input,
    )

    FLASH_ATTN_AVAILABLE = True
except Exception as e:
    print("[WARN] flash_attn not available, using torch/naive implementation")
    FLASH_ATTN_AVAILABLE = False


# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/bert_padding.py#L98
# flashattn 2.7.0 changes the API, we are overriding it here
def unpad_input(hidden_states, attention_mask):
    """
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def attention(q, k, v, mask_q=None, mask_kv=None, dropout=0, causal=False, window_size=(-1, -1), backend="torch"):
    # q: (B, N, H, D)
    # k: (B, M, H, D)
    # v: (B, M, H, D)
    # mask_q: (B, N)
    # mask_kv: (B, M)
    # return: (B, N, H, D)

    B, N, H, D = q.shape
    M = k.shape[1]

    if causal:
        assert N == 1 or N == M, "Causal mask only supports self-attention"

    # unmasked case (usually inference)
    # will ignore window_size except flash-attn impl. Only provide the effective window!
    if mask_q is None and mask_kv is None:
        if backend == "flash-attn" and FLASH_ATTN_AVAILABLE:
            return flash_attn_func(q, k, v, dropout, causal=causal, window_size=window_size)  # [B, N, H, D]
        elif backend == "torch":  # torch implementation
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=dropout, is_causal=causal)
            out = out.permute(0, 2, 1, 3).contiguous()
            return out
        else:  # naive implementation
            q = q.transpose(1, 2).reshape(B * H, N, D)
            k = k.transpose(1, 2).reshape(B * H, M, D)
            v = v.transpose(1, 2).reshape(B * H, M, D)
            w = torch.bmm(q, k.transpose(1, 2)) / (D**0.5)  # [B*H, N, M]
            if causal and N > 1:
                causal_mask = torch.full((N, M), float("-inf"), device=w.device, dtype=w.dtype)
                causal_mask = torch.triu(causal_mask, diagonal=1)
                w = w + causal_mask.unsqueeze(0)
            w = F.softmax(w, dim=-1)
            if dropout > 0:
                w = F.dropout(w, p=dropout)
            out = torch.bmm(w, v)  # [B*H, N, D]
            out = out.reshape(B, H, N, D).transpose(1, 2).contiguous()  # [B, N, H, D]
            return out

    # at least one of q or kv is masked (training)
    # only support flash-attn for now...
    if mask_q is None:
        mask_q = torch.ones(B, N, dtype=torch.bool, device=q.device)
    elif mask_kv is None:
        mask_kv = torch.ones(B, M, dtype=torch.bool, device=q.device)

    if FLASH_ATTN_AVAILABLE:
        # unpad (gather) input
        # mask_q: [B, N], first row has N1 1s, second row has N2 1s, ...
        # indices: [Ns,], Ns = N1 + N2 + ...
        # cu_seqlens_q: [B+1,], (0, N1, N1+N2, ...), cu=cumulative
        # max_len_q: scalar, max(N1, N2, ...)
        q, indices_q, cu_seqlens_q, max_len_q = unpad_input(q, mask_q)
        k, indices_kv, cu_seqlens_kv, max_len_kv = unpad_input(k, mask_kv)
        v = index_first_axis(v.reshape(-1, H, D), indices_kv)  # same indice as k

        # call varlen_func
        out = flash_attn_varlen_func(
            q,
            k,
            v,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_kv,
            max_seqlen_q=max_len_q,
            max_seqlen_k=max_len_kv,
            dropout_p=dropout,
            causal=causal,
            window_size=window_size,
        )

        # pad (put back) output
        out = pad_input(out, indices_q, B, N)
        return out
    else:
        raise NotImplementedError("masked attention requires flash_attn!")


class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        rnorm = torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return (x * rnorm).to(dtype=self.weight.dtype) * self.weight


class SelfAttention(nn.Module):
    def __init__(
        self,
        hidden_dim,
        num_heads,
        input_dim=None,
        output_dim=None,
        dropout=0,
        causal=False,
        qknorm=False,
        qknorm_type="LayerNorm",
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim if input_dim is not None else hidden_dim
        self.output_dim = output_dim if output_dim is not None else hidden_dim
        self.num_heads = num_heads
        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"
        self.head_dim = hidden_dim // num_heads
        self.causal = causal
        self.dropout = dropout
        self.qknorm = qknorm

        self.qkv_proj = nn.Linear(self.input_dim, 3 * self.hidden_dim)
        self.out_proj = nn.Linear(self.hidden_dim, self.output_dim)

        if self.qknorm:
            if qknorm_type == "RMSNorm":
                self.q_norm = RMSNorm(self.hidden_dim, eps=1e-6)
                self.k_norm = RMSNorm(self.hidden_dim, eps=1e-6)
            else:
                self.q_norm = nn.LayerNorm(self.hidden_dim, eps=1e-6, elementwise_affine=False)
                self.k_norm = nn.LayerNorm(self.hidden_dim, eps=1e-6, elementwise_affine=False)

    def forward(self, x, mask=None):
        # x: [B, N, C]
        # mask: [B, N]
        B, N, C = x.shape
        qkv = self.qkv_proj(x)  # [B, N, C] -> [B, N, 3 * D]
        qkv = qkv.reshape(B, N, 3, -1).permute(2, 0, 1, 3)  # [3, B, N, D]
        q, k, v = qkv.chunk(3, dim=0)  # [3, B, N, D] -> 3 * [1, B, N, D]
        q = q.squeeze(0)
        k = k.squeeze(0)
        v = v.squeeze(0)
        if self.qknorm:
            q = self.q_norm(q)
            k = self.k_norm(k)
        q = q.reshape(B, N, self.num_heads, self.head_dim)
        k = k.reshape(B, N, self.num_heads, self.head_dim)
        v = v.reshape(B, N, self.num_heads, self.head_dim)
        x = attention(q, k, v, mask_q=mask, mask_kv=mask, dropout=self.dropout, causal=self.causal)  # [B, N, H, D]
        x = self.out_proj(x.reshape(B, N, -1))
        return x


class CrossAttention(nn.Module):
    def __init__(
        self,
        hidden_dim,
        num_heads,
        input_dim=None,
        context_dim=None,
        output_dim=None,
        dropout=0,
        qknorm=False,
        qknorm_type="LayerNorm",
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim if input_dim is not None else hidden_dim
        self.context_dim = context_dim if context_dim is not None else hidden_dim
        self.output_dim = output_dim if output_dim is not None else hidden_dim
        self.num_heads = num_heads
        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"
        self.head_dim = hidden_dim // num_heads
        self.dropout = dropout
        self.qknorm = qknorm

        self.q_proj = nn.Linear(self.input_dim, self.hidden_dim)
        self.k_proj = nn.Linear(self.context_dim, self.hidden_dim)
        self.v_proj = nn.Linear(self.context_dim, self.hidden_dim)
        self.out_proj = nn.Linear(self.hidden_dim, self.output_dim)

        if self.qknorm:
            if qknorm_type == "RMSNorm":
                self.q_norm = RMSNorm(self.hidden_dim, eps=1e-6)
                self.k_norm = RMSNorm(self.hidden_dim, eps=1e-6)
            else:
                self.q_norm = nn.LayerNorm(self.hidden_dim, eps=1e-6, elementwise_affine=False)
                self.k_norm = nn.LayerNorm(self.hidden_dim, eps=1e-6, elementwise_affine=False)

    def forward(self, x, context, mask_q=None, mask_kv=None):
        # x: [B, N, C]
        # context: [B, M, C']
        # mask_q: [B, N]
        # mask_kv: [B, M]
        B, N, C = x.shape
        M = context.shape[1]
        q = self.q_proj(x)
        k = self.k_proj(context)
        v = self.v_proj(context)
        if self.qknorm:
            q = self.q_norm(q)
            k = self.k_norm(k)
        q = q.reshape(B, N, self.num_heads, self.head_dim)
        k = k.reshape(B, M, self.num_heads, self.head_dim)
        v = v.reshape(B, M, self.num_heads, self.head_dim)
        x = attention(q, k, v, mask_q=mask_q, mask_kv=mask_kv, dropout=self.dropout, causal=False)  # [B, N, H, D]
        x = self.out_proj(x.reshape(B, N, -1))
        return x
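A quick shape check for the attention helper above using its default torch backend (scaled_dot_product_attention); the tensors are random and on CPU, which the torch path supports.

```python
import torch

from vae.modules.attention import attention

B, N, M, H, D = 2, 16, 32, 4, 8
q = torch.randn(B, N, H, D)
k = torch.randn(B, M, H, D)
v = torch.randn(B, M, H, D)

out = attention(q, k, v, backend="torch")  # unmasked cross-attention
print(out.shape)  # torch.Size([2, 16, 4, 8])
```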
vae/modules/transformer.py
ADDED
@@ -0,0 +1,117 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import torch.nn as nn
from torch.utils.checkpoint import checkpoint

from vae.modules.attention import CrossAttention, SelfAttention


class FeedForward(nn.Module):
    def __init__(self, dim, mult=4):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim * mult), nn.GELU(), nn.Linear(dim * mult, dim))

    def forward(self, x):
        return self.net(x)


class AttentionBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        dim_context=None,
        qknorm=False,
        gradient_checkpointing=True,
        qknorm_type="LayerNorm",
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.dim_context = dim_context
        self.gradient_checkpointing = gradient_checkpointing

        self.norm_attn = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        if dim_context is not None:
            self.norm_context = nn.LayerNorm(dim_context, eps=1e-6, elementwise_affine=False)
            self.attn = CrossAttention(dim, num_heads, context_dim=dim_context, qknorm=qknorm, qknorm_type=qknorm_type)
        else:
            self.attn = SelfAttention(dim, num_heads, qknorm=qknorm, qknorm_type=qknorm_type)

        self.norm_ff = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        self.ff = FeedForward(dim)

    def forward(self, x, c=None, mask=None, mask_c=None):
        if self.training and self.gradient_checkpointing:
            return checkpoint(self._forward, x, c, mask, mask_c, use_reentrant=False)
        else:
            return self._forward(x, c, mask, mask_c)

    def _forward(self, x, c=None, mask=None, mask_c=None):
        # x: [B, N, C], hidden states
        # c: [B, M, C'], condition (assume normed and projected to C)
        # mask: [B, N], mask for x
        # mask_c: [B, M], mask for c
        # return: [B, N, C], updated hidden states

        if c is not None:
            x = x + self.attn(self.norm_attn(x), self.norm_context(c), mask_q=mask, mask_kv=mask_c)
        else:
            x = x + self.attn(self.norm_attn(x), mask=mask)

        x = x + self.ff(self.norm_ff(x))

        return x


# special attention block for the last cross-attn query layer
# 1. simple feed-forward (mult=1, no post ln)
# 2. no residual connection
# 3. no context ln
class FlashQueryLayer(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        dim_context,
        qknorm=False,
        gradient_checkpointing=True,
        qknorm_type="LayerNorm",
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.dim_context = dim_context
        self.gradient_checkpointing = gradient_checkpointing

        self.norm_attn = nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
        self.attn = CrossAttention(dim, num_heads, context_dim=dim_context, qknorm=qknorm, qknorm_type=qknorm_type)
        self.ff = FeedForward(dim, mult=1)

    def forward(self, x, c=None, mask=None, mask_c=None):
        if self.training and self.gradient_checkpointing:
            return checkpoint(self._forward, x, c, mask, mask_c, use_reentrant=False)
        else:
            return self._forward(x, c, mask, mask_c)

    def _forward(self, x, c, mask=None, mask_c=None):
        # x: [B, N, C], hidden states
        # c: [B, M, C'], condition (assume normed and projected to C)
        # mask: [B, N], mask for x
        # mask_c: [B, M], mask for c
        # return: [B, N, C], updated hidden states

        x = self.attn(self.norm_attn(x), c, mask_q=mask, mask_kv=mask_c)
        x = self.ff(x)

        return x
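A minimal sketch of the two block variants above: a self-attention AttentionBlock (dim_context=None) and a cross-attention one, run in eval mode so gradient checkpointing is skipped; the shapes are arbitrary.

```python
import torch

from vae.modules.transformer import AttentionBlock

x = torch.randn(1, 128, 256)  # [B, N, C] hidden states
c = torch.randn(1, 64, 512)   # [B, M, C'] context tokens

self_block = AttentionBlock(dim=256, num_heads=8).eval()
cross_block = AttentionBlock(dim=256, num_heads=8, dim_context=512).eval()

with torch.no_grad():
    y = self_block(x)      # pre-norm self-attention + feed-forward, with residuals
    z = cross_block(x, c)  # x (queries) cross-attends to c (keys/values)

print(y.shape, z.shape)  # torch.Size([1, 128, 256]) torch.Size([1, 128, 256])
```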
vae/scripts/infer.py
ADDED
@@ -0,0 +1,142 @@
"""
-----------------------------------------------------------------------------
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

NVIDIA CORPORATION and its licensors retain all intellectual property
and proprietary rights in and to this software, related documentation
and any modifications thereto. Any use, reproduction, disclosure or
distribution of this software and related documentation without an express
license agreement from NVIDIA CORPORATION is strictly prohibited.
-----------------------------------------------------------------------------
"""

import argparse
import glob
import importlib
import os
from datetime import datetime

import fpsample
import kiui
import meshiki
import numpy as np
import torch
import trimesh

from vae.model import Model
from vae.utils import box_normalize, postprocess_mesh, sphere_normalize, sync_timer

# PYTHONPATH=. python vae/scripts/infer.py
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, help="config file path", default="vae.configs.part_woenc")
parser.add_argument(
    "--ckpt_path",
    type=str,
    help="checkpoint path",
    default="pretrained/vae.pt",
)
parser.add_argument("--input", type=str, help="input directory", default="assets/meshes/")
parser.add_argument("--output_dir", type=str, help="output directory", default="output/")
parser.add_argument("--limit", type=int, help="how many samples to test", default=-1)
parser.add_argument("--num_fps_point", type=int, help="number of fps points", default=1024)
parser.add_argument("--num_fps_salient_point", type=int, help="number of fps salient points", default=1024)
parser.add_argument("--grid_res", type=int, help="grid resolution", default=512)
parser.add_argument("--seed", type=int, help="seed", default=42)
args = parser.parse_args()


TRIMESH_GLB_EXPORT = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]]).astype(np.float32)

kiui.seed_everything(args.seed)


@sync_timer("prepare_input_from_mesh")
def prepare_input_from_mesh(mesh_path, use_salient_point=True, num_fps_point=1024, num_fps_salient_point=1024):
    # load mesh, assume it's already processed to be watertight.

    mesh_name = mesh_path.split("/")[-1].split(".")[0]
    vertices, faces = meshiki.load_mesh(mesh_path)

    # vertices = sphere_normalize(vertices)
    vertices = box_normalize(vertices)

    mesh = meshiki.Mesh(vertices, faces)

    uniform_surface_points = mesh.uniform_point_sample(200000)
    uniform_surface_points = meshiki.fps(uniform_surface_points, 32768)  # hardcoded...
    salient_surface_points = mesh.salient_point_sample(16384, thresh_bihedral=15)

    # save points
    # trimesh.PointCloud(vertices=uniform_surface_points).export(os.path.join(workspace, mesh_name + "_uniform.ply"))
    # trimesh.PointCloud(vertices=salient_surface_points).export(os.path.join(workspace, mesh_name + "_salient.ply"))

    sample = {}

    sample["pointcloud"] = torch.from_numpy(uniform_surface_points)

    # fps subsample
    fps_indices = fpsample.bucket_fps_kdline_sampling(uniform_surface_points, num_fps_point, h=5, start_idx=0)
    sample["fps_indices"] = torch.from_numpy(fps_indices).long()  # [num_fps_point,]

    if use_salient_point:
        sample["pointcloud_dorases"] = torch.from_numpy(salient_surface_points)  # [N', 3]

        # fps subsample
        fps_indices_dorases = fpsample.bucket_fps_kdline_sampling(
            salient_surface_points, num_fps_salient_point, h=5, start_idx=0
        )
        sample["fps_indices_dorases"] = torch.from_numpy(fps_indices_dorases).long()  # [num_fps_point,]

    return sample


print(f"Loading checkpoint from {args.ckpt_path}")
ckpt_dict = torch.load(args.ckpt_path, weights_only=True)

# delete all keys other than model
if "model" in ckpt_dict:
    ckpt_dict = ckpt_dict["model"]

# instantiate model
print(f"Instantiating model from {args.config}")
model_config = importlib.import_module(args.config).make_config()
model = Model(model_config).eval().cuda().bfloat16()

# load weight
print(f"Loading weights from {args.ckpt_path}")
model.load_state_dict(ckpt_dict, strict=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
workspace = os.path.join(args.output_dir, "vae_" + args.config.split(".")[-1] + "_" + timestamp)
if not os.path.exists(workspace):
    os.makedirs(workspace)
else:
    os.system(f"rm {workspace}/*")
print(f"Output directory: {workspace}")

# load dataset
mesh_list = glob.glob(os.path.join(args.input, "*"))
mesh_list = mesh_list[: args.limit] if args.limit > 0 else mesh_list

for i, mesh_path in enumerate(mesh_list):
    print(f"Processing {i}/{len(mesh_list)}: {mesh_path}")

    mesh_name = mesh_path.split("/")[-1].split(".")[0]

    sample = prepare_input_from_mesh(
        mesh_path, num_fps_point=args.num_fps_point, num_fps_salient_point=args.num_fps_salient_point
    )
    for k in sample:
        sample[k] = sample[k].unsqueeze(0).cuda()

    # call vae
    with torch.inference_mode():
        output = model(sample, resolution=args.grid_res)

    latent = output["latent"]
    vertices, faces = output["meshes"][0]

    mesh = trimesh.Trimesh(vertices, faces)
    mesh = postprocess_mesh(mesh, 5e5)

    mesh.export(f"{workspace}/{mesh_name}.glb")
vae/utils.py
ADDED
@@ -0,0 +1,315 @@
1 |
+
"""
|
2 |
+
-----------------------------------------------------------------------------
|
3 |
+
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
|
5 |
+
NVIDIA CORPORATION and its licensors retain all intellectual property
|
6 |
+
and proprietary rights in and to this software, related documentation
|
7 |
+
and any modifications thereto. Any use, reproduction, disclosure or
|
8 |
+
distribution of this software and related documentation without an express
|
9 |
+
license agreement from NVIDIA CORPORATION is strictly prohibited.
|
10 |
+
-----------------------------------------------------------------------------
|
11 |
+
"""
|
12 |
+
|
13 |
+
import os
|
14 |
+
from functools import wraps
|
15 |
+
from typing import Literal
|
16 |
+
|
17 |
+
import numpy as np
|
18 |
+
import torch
|
19 |
+
import trimesh
|
20 |
+
from kiui.mesh_utils import clean_mesh, decimate_mesh
|
21 |
+
|
22 |
+
|
23 |
+
# Adapted from https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/utils.py#L38
|
24 |
+
class sync_timer:
|
25 |
+
"""
|
26 |
+
Synchronized timer to count the inference time of `nn.Module.forward` or else.
|
27 |
+
set env var TIMER=1 to enable logging!
|
28 |
+
|
29 |
+
Example as context manager:
|
30 |
+
```python
|
31 |
+
with timer('name'):
|
32 |
+
run()
|
33 |
+
```
|
34 |
+
|
35 |
+
Example as decorator:
|
36 |
+
```python
|
37 |
+
@timer('name')
|
38 |
+
def run():
|
39 |
+
pass
|
40 |
+
```
|
41 |
+
"""
|
42 |
+
|
43 |
+
def __init__(self, name=None, flag_env="TIMER"):
|
44 |
+
self.name = name
|
45 |
+
self.flag_env = flag_env
|
46 |
+
|
47 |
+
def __enter__(self):
|
48 |
+
if os.environ.get(self.flag_env, "0") == "1":
|
49 |
+
self.start = torch.cuda.Event(enable_timing=True)
|
50 |
+
self.end = torch.cuda.Event(enable_timing=True)
|
51 |
+
self.start.record()
|
52 |
+
return lambda: self.time
|
53 |
+
|
54 |
+
def __exit__(self, exc_type, exc_value, exc_tb):
|
55 |
+
if os.environ.get(self.flag_env, "0") == "1":
|
56 |
+
self.end.record()
|
57 |
+
torch.cuda.synchronize()
|
58 |
+
self.time = self.start.elapsed_time(self.end)
|
59 |
+
if self.name is not None:
|
60 |
+
print(f"{self.name} takes {self.time} ms")
|
61 |
+
|
62 |
+
def __call__(self, func):
|
63 |
+
@wraps(func)
|
64 |
+
def wrapper(*args, **kwargs):
|
65 |
+
with self:
|
66 |
+
result = func(*args, **kwargs)
|
67 |
+
return result
|
68 |
+
|
69 |
+
return wrapper
|
70 |
+
|
71 |
+
|
72 |
+
@torch.no_grad()
|
73 |
+
def calculate_iou(pred: torch.Tensor, gt: torch.Tensor, target_value: int, thresh: float = 0) -> torch.Tensor:
|
74 |
+
"""Calculate the Intersection over Union (IoU) between two volumes.
|
75 |
+
|
76 |
+
Args:
|
77 |
+
pred (torch.Tensor): [*] continuous value between 0 and 1
|
78 |
+
gt (torch.Tensor): [*] discrete value of 0 or 1
|
79 |
+
target_value (int): The value to be considered as the target class
|
80 |
+
|
81 |
+
Returns:
|
82 |
+
torch.Tensor: IoU value
|
83 |
+
"""
|
84 |
+
# Ensure volumes have the same shape
|
85 |
+
assert pred.shape == gt.shape, "Volumes must have the same shape"
|
86 |
+
|
87 |
+
# binarize
|
88 |
+
pred_binary = pred > thresh
|
89 |
+
gt = gt > thresh
|
90 |
+
|
91 |
+
# Convert the volumes to boolean tensors for logical operations
|
92 |
+
intersection = torch.logical_and(pred_binary == target_value, gt == target_value).sum().float()
|
93 |
+
union = torch.logical_or(pred_binary == target_value, gt == target_value).sum().float()
|
94 |
+
|
95 |
+
# Compute IoU
|
96 |
+
iou = intersection / union if union != 0 else torch.tensor(0.0)
|
97 |
+
return iou
|
98 |
+
|
99 |
+
|
100 |
+
@torch.no_grad()
|
101 |
+
def calculate_metrics(
|
102 |
+
pred: torch.Tensor, gt: torch.Tensor, target_value: int = 1, thresh: float = 0.5
|
103 |
+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
104 |
+
"""Calculate Precision, Recall, and F1 between two volumes.
|
105 |
+
|
106 |
+
Args:
|
107 |
+
pred (torch.Tensor): [*] continuous value between 0 and 1
|
108 |
+
gt (torch.Tensor): [*] discrete value of 0 or 1
|
109 |
+
target_value (int): The value to be considered as the target class
|
110 |
+
|
111 |
+
Returns:
|
112 |
+
tuple: Precision, Recall, F1 values
|
113 |
+
"""
|
114 |
+
assert pred.shape == gt.shape, f"Pred {pred.shape} and gt {gt.shape} must have the same shape"
|
115 |
+
|
116 |
+
# Binarize prediction
|
117 |
+
pred_binary = pred > thresh
|
118 |
+
gt = gt > thresh
|
119 |
+
|
120 |
+
# True Positive (TP): pred == target_value and gt == target_value
|
121 |
+
true_positive = torch.logical_and(pred_binary == target_value, gt == target_value).sum().float()
|
122 |
+
|
123 |
+
# False Positive (FP): pred == target_value and gt != target_value
|
124 |
+
false_positive = torch.logical_and(pred_binary == target_value, gt != target_value).sum().float()
|
125 |
+
|
126 |
+
# False Negative (FN): pred != target_value and gt == target_value
|
127 |
+
false_negative = torch.logical_and(pred_binary != target_value, gt == target_value).sum().float()
|
128 |
+
|
129 |
+
# Precision: TP / (TP + FP), best to detect False Positives
|
130 |
+
precision = (
|
131 |
+
true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else torch.tensor(0.0)
|
132 |
+
)
|
133 |
+
|
134 |
+
# Recall: TP / (TP + FN), best to detect False Negatives
|
135 |
+
recall = (
|
136 |
+
true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else torch.tensor(0.0)
|
137 |
+
)
|
138 |
+
|
139 |
+
# f1: 2 / (1 / precision + 1 / recall)
|
140 |
+
f1 = 2 / (1 / precision + 1 / recall) if (precision != 0 and recall != 0) else torch.tensor(0.0)
|
141 |
+
|
142 |
+
return precision, recall, f1
|
143 |
+
|
144 |
+
# Adapted from https://github.com/Stability-AI/stablediffusion/blob/main/ldm/modules/distributions/distributions.py#L24
class DiagonalGaussianDistribution:
    """VAE latent"""

    def __init__(self, mean, logvar, deterministic=False):
        # mean, logvar: [B, L, D] x 2
        self.mean, self.logvar = mean, logvar
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean, device=self.mean.device, dtype=self.mean.dtype)

    def sample(self, weight: float = 1.0):
        sample = weight * torch.randn(self.mean.shape, device=self.mean.device, dtype=self.mean.dtype)
        x = self.mean + self.std * sample
        return x

    def kl(self, other=None, dims=[1, 2]):
        if self.deterministic:
            return torch.Tensor([0.0])
        else:
            if other is None:
                return 0.5 * torch.mean(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=dims)
            else:
                return 0.5 * torch.mean(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var
                    - 1.0
                    - self.logvar
                    + other.logvar,
                    dim=dims,
                )

    def nll(self, sample, dims=[1, 2]):
        if self.deterministic:
            return torch.Tensor([0.0])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.mean(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims)

    def mode(self):
        return self.mean

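
# Usage sketch (illustrative, assuming an encoder that predicts per-token mean and logvar):
#
#   posterior = DiagonalGaussianDistribution(mean, logvar)  # mean, logvar: [B, L, D]
#   z = posterior.sample()                                  # reparameterized latent for training
#   kl_loss = posterior.kl().mean()                         # KL against a standard Gaussian prior
#   z_det = posterior.mode()                                 # deterministic latent for inference
#
# DummyLatent below mimics the same interface for a non-variational bottleneck, where kl()
# degenerates to a plain L2 penalty on the latent.
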
class DummyLatent:
    def __init__(self, mean):
        self.mean = mean

    def sample(self, weight=0):
        # simply perturb the mean
        if weight > 0:
            noise = torch.randn_like(self.mean) * weight
        else:
            noise = 0
        return self.mean + noise

    def mode(self):
        return self.mean

    def kl(self):
        # just an l2 penalty
        return 0.5 * torch.mean(torch.pow(self.mean, 2))


def construct_grid_points(
    resolution: int,
    indexing: str = "ij",
):
    """Generate dense grid points in [-1, 1]^3.

    Args:
        resolution (int): resolution of the grid
        indexing (str, optional): indexing of the grid. Defaults to "ij".

    Returns:
        torch.Tensor: grid points (resolution + 1, resolution + 1, resolution + 1, 3), inside bbox.
    """
    x = np.linspace(-1, 1, resolution + 1, dtype=np.float32)
    y = np.linspace(-1, 1, resolution + 1, dtype=np.float32)
    z = np.linspace(-1, 1, resolution + 1, dtype=np.float32)
    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
    xyzs = np.stack((xs, ys, zs), axis=-1)
    xyzs = torch.from_numpy(xyzs).float()
    return xyzs

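
# Usage sketch (illustrative): the grid is usually flattened and queried by the occupancy decoder,
# then the responses are reshaped into a dense volume for mesh extraction; query_decoder is a
# placeholder for whatever model predicts occupancy/TSDF at query points:
#
#   points = construct_grid_points(resolution).view(-1, 3).to(device)   # [(res+1)^3, 3]
#   grid_vals = query_decoder(points).view(resolution + 1, resolution + 1, resolution + 1)
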
_diso_session = None  # lazy session for reuse


@sync_timer("extract_mesh")
def extract_mesh(
    grid_vals: torch.Tensor,
    resolution: int,
    isosurface_level: float = 0,
    backend: Literal["mcubes", "diso"] = "mcubes",
):
    """Extract mesh from grid occupancy.

    Args:
        grid_vals (torch.Tensor): [resolution + 1, resolution + 1, resolution + 1], assumed to be a TSDF in [-1, 1] (inner is positive)
        resolution (int): Grid resolution.
        isosurface_level (float, optional): Iso-surface level. Defaults to 0.
        backend (Literal["mcubes", "diso"], optional): Backend for mesh extraction. Defaults to "mcubes"; "diso" runs on GPU and is faster.

    Returns:
        vertices (np.ndarray): [N, 3], float32, in [-1, 1]
        faces (np.ndarray): [M, 3], int32
    """

    grid_vals = grid_vals.view(resolution + 1, resolution + 1, resolution + 1)

    if backend == "mcubes":
        try:
            import mcubes
        except ImportError:
            os.system("pip install pymcubes")
            import mcubes
        grid_vals = grid_vals.float().cpu().numpy()
        verts, faces = mcubes.marching_cubes(grid_vals, isosurface_level)
        verts = 2 * verts / resolution - 1.0  # normalize to [-1, 1]
    elif backend == "diso":
        try:
            import diso
        except ImportError:
            os.system("pip install diso")
            import diso
        global _diso_session
        if _diso_session is None:
            _diso_session = diso.DiffDMC(dtype=torch.float32).cuda()

        grid_vals = -grid_vals.float().cuda()  # diso assumes inner is NEGATIVE!
        verts, faces = _diso_session(grid_vals, deform=None, normalize=True)  # verts in [0, 1]
        verts = verts.cpu().numpy() * 2 - 1.0  # normalize to [-1, 1]
        faces = faces.cpu().numpy()

    return verts, faces

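
# Usage sketch (illustrative; the resolution value is a placeholder): the extracted arrays are
# typically wrapped into a trimesh object and cleaned up with postprocess_mesh below:
#
#   verts, faces = extract_mesh(grid_vals, resolution=512, backend="mcubes")
#   mesh = trimesh.Trimesh(verts, faces)
#   mesh = postprocess_mesh(mesh, decimate_target=100000)
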
@sync_timer("postprocess_mesh")
def postprocess_mesh(mesh: trimesh.Trimesh, decimate_target=100000):
    vertices = mesh.vertices
    triangles = mesh.faces

    if vertices.shape[0] > 0 and triangles.shape[0] > 0:
        vertices, triangles = clean_mesh(vertices, triangles, remesh=False, min_f=25, min_d=5)
        if triangles.shape[0] > decimate_target:
            vertices, triangles = decimate_mesh(vertices, triangles, decimate_target, optimalplacement=False)
            if vertices.shape[0] > 0 and triangles.shape[0] > 0:
                vertices, triangles = clean_mesh(vertices, triangles, remesh=False, min_f=25, min_d=5)

    mesh.vertices = vertices
    mesh.faces = triangles

    return mesh

def sphere_normalize(vertices):
    bmin = vertices.min(axis=0)
    bmax = vertices.max(axis=0)
    bcenter = (bmax + bmin) / 2
    radius = np.linalg.norm(vertices - bcenter, axis=-1).max()
    vertices = (vertices - bcenter) / radius  # to [-1, 1]
    return vertices

def box_normalize(vertices, bound=0.95):
    bmin = vertices.min(axis=0)
    bmax = vertices.max(axis=0)
    bcenter = (bmax + bmin) / 2
    vertices = bound * (vertices - bcenter) / (bmax - bmin).max()
    return vertices
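
# Usage sketch (illustrative): both normalizers recenter the vertices; sphere_normalize scales by
# the maximum radius so the shape fits a unit sphere, while box_normalize scales by the longest
# bounding-box edge so the shape fits inside [-bound, bound]^3:
#
#   mesh.vertices = box_normalize(mesh.vertices, bound=0.95)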