jammmmm mboss committed on
Commit
77d8010
·
1 Parent(s): e96dd77

Update to latest inference code

Browse files

Co-authored-by: Mark Boss <[email protected]>

.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.gif filter=lfs diff=lfs merge=lfs -text
37
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.gif filter=lfs diff=lfs merge=lfs -text
37
  *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv*/
127
+ env/
128
+ venv*/
129
+ ENV/
130
+ env.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
162
+ .vs/
163
+ .idea/
164
+ .vscode/
165
+
166
+ stabilityai/
167
+ output/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_language_version:
2
+ python: python3
3
+
4
+ repos:
5
+ - repo: https://github.com/pre-commit/pre-commit-hooks
6
+ rev: v4.4.0
7
+ hooks:
8
+ - id: trailing-whitespace
9
+ - id: check-ast
10
+ - id: check-merge-conflict
11
+ - id: check-yaml
12
+ - id: end-of-file-fixer
13
+ - id: trailing-whitespace
14
+ args: [--markdown-linebreak-ext=md]
15
+
16
+ - repo: https://github.com/astral-sh/ruff-pre-commit
17
+ # Ruff version.
18
+ rev: v0.3.5
19
+ hooks:
20
+ # Run the linter.
21
+ - id: ruff
22
+ args: [ --fix ]
23
+ # Run the formatter.
24
+ - id: ruff-format
README.md CHANGED
@@ -4,9 +4,9 @@ emoji: 🎮
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.31.4
8
  python_version: 3.10.13
9
- app_file: app.py
10
  pinned: false
11
  models:
12
  - stabilityai/stable-fast-3d
@@ -14,5 +14,3 @@ license: other
14
  license_name: stabilityai-ai-community
15
  license_link: LICENSE.md
16
  ---
17
-
18
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.41.0
8
  python_version: 3.10.13
9
+ app_file: gradio_app.py
10
  pinned: false
11
  models:
12
  - stabilityai/stable-fast-3d
 
14
  license_name: stabilityai-ai-community
15
  license_link: LICENSE.md
16
  ---
 
 
__init__.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import logging
3
+ import os
4
+ import sys
5
+ from contextlib import nullcontext
6
+
7
+ import comfy.model_management
8
+ import folder_paths
9
+ import numpy as np
10
+ import torch
11
+ import trimesh
12
+ from PIL import Image
13
+ from trimesh.exchange import gltf
14
+
15
+ sys.path.append(os.path.dirname(__file__))
16
+ from sf3d.system import SF3D
17
+ from sf3d.utils import resize_foreground
18
+
19
+ SF3D_CATEGORY = "StableFast3D"
20
+ SF3D_MODEL_NAME = "stabilityai/stable-fast-3d"
21
+
22
+
23
class StableFast3DLoader:
    """ComfyUI node that loads the Stable Fast 3D model onto the active torch device."""

    CATEGORY = SF3D_CATEGORY
    FUNCTION = "load"
    RETURN_NAMES = ("sf3d_model",)
    RETURN_TYPES = ("SF3D_MODEL",)

    @classmethod
    def INPUT_TYPES(cls):
        # No configurable inputs: model id, config, and weight names are fixed constants.
        return {"required": {}}

    def load(self):
        """Fetch the pretrained SF3D weights and return the model in eval mode."""
        target_device = comfy.model_management.get_torch_device()
        sf3d_model = SF3D.from_pretrained(
            SF3D_MODEL_NAME,
            config_name="config.yaml",
            weight_name="model.safetensors",
        )
        sf3d_model.to(target_device)
        sf3d_model.eval()
        return (sf3d_model,)
44
+
45
+
46
class StableFast3DPreview:
    """ComfyUI output node that serializes meshes to base64 GLB blobs for in-browser preview."""

    CATEGORY = SF3D_CATEGORY
    FUNCTION = "preview"
    OUTPUT_NODE = True
    RETURN_TYPES = ()

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"mesh": ("MESH",)}}

    def preview(self, mesh):
        """Export each mesh as a GLB scene and hand the encoded blobs to the UI."""
        encoded = [
            base64.b64encode(
                gltf.export_glb(trimesh.Scene(single_mesh), include_normals=True)
            ).decode("utf-8")
            for single_mesh in mesh
        ]
        return {"ui": {"glbs": encoded}}
64
+
65
+
66
class StableFast3DSampler:
    """ComfyUI node that runs SF3D on a single image and returns the generated mesh."""

    CATEGORY = SF3D_CATEGORY
    FUNCTION = "predict"
    RETURN_NAMES = ("mesh",)
    RETURN_TYPES = ("MESH",)

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": ("SF3D_MODEL",),
                "image": ("IMAGE",),
                "foreground_ratio": (
                    "FLOAT",
                    {"default": 0.85, "min": 0.0, "max": 1.0, "step": 0.01},
                ),
                "texture_resolution": (
                    "INT",
                    {"default": 1024, "min": 512, "max": 2048, "step": 256},
                ),
            },
            "optional": {
                "mask": ("MASK",),
                "remesh": (["none", "triangle", "quad"],),
                "vertex_count": (
                    "INT",
                    {"default": -1, "min": -1, "max": 20000, "step": 1},
                ),
            },
        }

    def predict(
        s,
        model,
        image,
        mask,
        foreground_ratio,
        texture_resolution,
        remesh="none",
        vertex_count=-1,
    ):
        """Generate a textured mesh from one input image.

        Args:
            model: Loaded SF3D model (from StableFast3DLoader).
            image: ComfyUI IMAGE tensor; only batch size 1 is supported.
            mask: Optional MASK tensor used as the alpha channel.
            foreground_ratio: Foreground-to-image size ratio for recentering.
            texture_resolution: Bake resolution of the texture atlas.
            remesh: Remeshing mode ("none", "triangle", or "quad").
            vertex_count: Target vertex count; -1 disables reduction.

        Returns:
            A one-element tuple containing a list with the generated mesh.

        Raises:
            ValueError: If more than one image is given or no subject is found.
        """
        if image.shape[0] != 1:
            raise ValueError("Only one image can be processed at a time")

        # ComfyUI images are float tensors in [0, 1]; convert to an 8-bit PIL image.
        pil_image = Image.fromarray(
            torch.clamp(torch.round(255.0 * image[0]), 0, 255)
            .type(torch.uint8)
            .cpu()
            .numpy()
        )

        if mask is not None:
            print("Using Mask")
            mask_np = np.clip(255.0 * mask[0].detach().cpu().numpy(), 0, 255).astype(
                np.uint8
            )
            mask_pil = Image.fromarray(mask_np, mode="L")
            pil_image.putalpha(mask_pil)
        else:
            if image.shape[3] != 4:
                print("No mask or alpha channel detected, Converting to RGBA")
                pil_image = pil_image.convert("RGBA")

        pil_image = resize_foreground(pil_image, foreground_ratio)
        # Fix: removed stray debug `print(remesh)` left over from development.
        # bfloat16 autocast only on CUDA; other devices run in full precision.
        with torch.no_grad():
            with torch.autocast(
                device_type="cuda", dtype=torch.bfloat16
            ) if "cuda" in comfy.model_management.get_torch_device().type else nullcontext():
                mesh, glob_dict = model.run_image(
                    pil_image,
                    bake_resolution=texture_resolution,
                    remesh=remesh,
                    vertex_count=vertex_count,
                )

        if mesh.vertices.shape[0] == 0:
            raise ValueError("No subject detected in the image")

        return ([mesh],)
146
+
147
+
148
class StableFast3DSave:
    """ComfyUI output node that writes meshes as GLB files into the output directory."""

    CATEGORY = SF3D_CATEGORY
    FUNCTION = "save"
    OUTPUT_NODE = True
    RETURN_TYPES = ()

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "mesh": ("MESH",),
                "filename_prefix": ("STRING", {"default": "SF3D"}),
            }
        }

    def __init__(self):
        # Marks this node as an output node for ComfyUI's save-path resolution.
        self.type = "output"

    def save(self, mesh, filename_prefix):
        """Export each mesh as GLB, save it to disk, and return base64 blobs for the UI.

        Args:
            mesh: Iterable of trimesh meshes to export.
            filename_prefix: Prefix passed to ComfyUI's save-path helper.

        Returns:
            A UI dict mapping "glbs" to base64-encoded GLB payloads.
        """
        output_dir = folder_paths.get_output_directory()
        glbs = []
        for idx, m in enumerate(mesh):
            scene = trimesh.Scene(m)
            glb_data = gltf.export_glb(scene, include_normals=True)
            logging.info(f"Generated GLB model with {len(glb_data)} bytes")

            full_output_folder, filename, counter, subfolder, filename_prefix = (
                folder_paths.get_save_image_path(filename_prefix, output_dir)
            )
            filename = filename.replace("%batch_num%", str(idx))
            # Fix: the substituted `filename` was previously unused and the
            # output path contained a garbled literal; use the resolved name.
            out_path = os.path.join(full_output_folder, f"{filename}_{counter:05}_.glb")
            with open(out_path, "wb") as f:
                f.write(glb_data)
            glbs.append(base64.b64encode(glb_data).decode("utf-8"))
        return {"ui": {"glbs": glbs}}
183
+
184
+
185
# Human-readable labels shown in the ComfyUI node picker.
NODE_DISPLAY_NAME_MAPPINGS = {
    "StableFast3DLoader": "Stable Fast 3D Loader",
    "StableFast3DPreview": "Stable Fast 3D Preview",
    "StableFast3DSampler": "Stable Fast 3D Sampler",
    "StableFast3DSave": "Stable Fast 3D Save",
}

# Registry mapping node type names to their implementing classes.
NODE_CLASS_MAPPINGS = {
    "StableFast3DLoader": StableFast3DLoader,
    "StableFast3DPreview": StableFast3DPreview,
    "StableFast3DSampler": StableFast3DSampler,
    "StableFast3DSave": StableFast3DSave,
}

# Directory with frontend assets served by ComfyUI for these nodes.
WEB_DIRECTORY = "./comfyui"

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]
demo_files/scatterplot.jpg CHANGED
demo_files/workflows/sf3d_example.json ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 10,
3
+ "last_link_id": 12,
4
+ "nodes": [
5
+ {
6
+ "id": 8,
7
+ "type": "StableFast3DSampler",
8
+ "pos": [
9
+ 756.9950672198843,
10
+ 9.735666739723854
11
+ ],
12
+ "size": {
13
+ "0": 315,
14
+ "1": 166
15
+ },
16
+ "flags": {},
17
+ "order": 3,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "model",
22
+ "type": "SF3D_MODEL",
23
+ "link": 8
24
+ },
25
+ {
26
+ "name": "image",
27
+ "type": "IMAGE",
28
+ "link": 10,
29
+ "slot_index": 1
30
+ },
31
+ {
32
+ "name": "mask",
33
+ "type": "MASK",
34
+ "link": 11
35
+ },
36
+ {
37
+ "name": "remesh",
38
+ "type": "none",
39
+ "link": null,
40
+ "slot_index": 3
41
+ }
42
+ ],
43
+ "outputs": [
44
+ {
45
+ "name": "mesh",
46
+ "type": "MESH",
47
+ "links": [
48
+ 9
49
+ ],
50
+ "shape": 3,
51
+ "slot_index": 0
52
+ }
53
+ ],
54
+ "properties": {
55
+ "Node name for S&R": "StableFast3DSampler"
56
+ },
57
+ "widgets_values": [
58
+ 0.85,
59
+ 1024,
60
+ "triangle"
61
+ ]
62
+ },
63
+ {
64
+ "id": 9,
65
+ "type": "StableFast3DSave",
66
+ "pos": [
67
+ 1116,
68
+ 8
69
+ ],
70
+ "size": [
71
+ 600,
72
+ 512
73
+ ],
74
+ "flags": {},
75
+ "order": 4,
76
+ "mode": 0,
77
+ "inputs": [
78
+ {
79
+ "name": "mesh",
80
+ "type": "MESH",
81
+ "link": 9
82
+ }
83
+ ],
84
+ "properties": {
85
+ "Node name for S&R": "StableFast3DSave"
86
+ },
87
+ "widgets_values": [
88
+ "SF3D",
89
+ null
90
+ ]
91
+ },
92
+ {
93
+ "id": 6,
94
+ "type": "InvertMask",
95
+ "pos": [
96
+ 485,
97
+ 132
98
+ ],
99
+ "size": {
100
+ "0": 210,
101
+ "1": 26
102
+ },
103
+ "flags": {},
104
+ "order": 2,
105
+ "mode": 0,
106
+ "inputs": [
107
+ {
108
+ "name": "mask",
109
+ "type": "MASK",
110
+ "link": 6
111
+ }
112
+ ],
113
+ "outputs": [
114
+ {
115
+ "name": "MASK",
116
+ "type": "MASK",
117
+ "links": [
118
+ 11
119
+ ],
120
+ "shape": 3,
121
+ "slot_index": 0
122
+ }
123
+ ],
124
+ "properties": {
125
+ "Node name for S&R": "InvertMask"
126
+ }
127
+ },
128
+ {
129
+ "id": 1,
130
+ "type": "LoadImage",
131
+ "pos": [
132
+ 105,
133
+ 26
134
+ ],
135
+ "size": {
136
+ "0": 315,
137
+ "1": 314
138
+ },
139
+ "flags": {},
140
+ "order": 0,
141
+ "mode": 0,
142
+ "outputs": [
143
+ {
144
+ "name": "IMAGE",
145
+ "type": "IMAGE",
146
+ "links": [
147
+ 10
148
+ ],
149
+ "shape": 3,
150
+ "slot_index": 0
151
+ },
152
+ {
153
+ "name": "MASK",
154
+ "type": "MASK",
155
+ "links": [
156
+ 6
157
+ ],
158
+ "shape": 3,
159
+ "slot_index": 1
160
+ }
161
+ ],
162
+ "properties": {
163
+ "Node name for S&R": "LoadImage"
164
+ },
165
+ "widgets_values": [
166
+ "axe (1).png",
167
+ "image"
168
+ ]
169
+ },
170
+ {
171
+ "id": 7,
172
+ "type": "StableFast3DLoader",
173
+ "pos": [
174
+ 478,
175
+ -27
176
+ ],
177
+ "size": {
178
+ "0": 210,
179
+ "1": 26
180
+ },
181
+ "flags": {},
182
+ "order": 1,
183
+ "mode": 0,
184
+ "outputs": [
185
+ {
186
+ "name": "sf3d_model",
187
+ "type": "SF3D_MODEL",
188
+ "links": [
189
+ 8
190
+ ],
191
+ "shape": 3,
192
+ "slot_index": 0
193
+ }
194
+ ],
195
+ "properties": {
196
+ "Node name for S&R": "StableFast3DLoader"
197
+ }
198
+ }
199
+ ],
200
+ "links": [
201
+ [
202
+ 6,
203
+ 1,
204
+ 1,
205
+ 6,
206
+ 0,
207
+ "MASK"
208
+ ],
209
+ [
210
+ 8,
211
+ 7,
212
+ 0,
213
+ 8,
214
+ 0,
215
+ "SF3D_MODEL"
216
+ ],
217
+ [
218
+ 9,
219
+ 8,
220
+ 0,
221
+ 9,
222
+ 0,
223
+ "MESH"
224
+ ],
225
+ [
226
+ 10,
227
+ 1,
228
+ 0,
229
+ 8,
230
+ 1,
231
+ "IMAGE"
232
+ ],
233
+ [
234
+ 11,
235
+ 6,
236
+ 0,
237
+ 8,
238
+ 2,
239
+ "MASK"
240
+ ]
241
+ ],
242
+ "groups": [],
243
+ "config": {},
244
+ "extra": {
245
+ "ds": {
246
+ "scale": 0.6209213230591552,
247
+ "offset": [
248
+ 80.89139921077967,
249
+ 610.3296066172098
250
+ ]
251
+ }
252
+ },
253
+ "version": 0.4
254
+ }
app.py → gradio_app.py RENAMED
@@ -1,6 +1,7 @@
1
  import os
2
  import tempfile
3
  import time
 
4
  from functools import lru_cache
5
  from typing import Any
6
 
@@ -11,9 +12,13 @@ import torch
11
  from gradio_litmodel3d import LitModel3D
12
  from PIL import Image
13
 
 
 
14
  import sf3d.utils as sf3d_utils
15
  from sf3d.system import SF3D
16
 
 
 
17
  rembg_session = rembg.new_session()
18
 
19
  COND_WIDTH = 512
@@ -28,32 +33,48 @@ intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
28
  COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
29
  )
30
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  model = SF3D.from_pretrained(
33
  "stabilityai/stable-fast-3d",
34
  config_name="config.yaml",
35
  weight_name="model.safetensors",
36
  )
37
- model.eval().cuda()
 
38
 
39
  example_files = [
40
  os.path.join("demo_files/examples", f) for f in os.listdir("demo_files/examples")
41
  ]
42
 
43
 
44
- def run_model(input_image):
45
  start = time.time()
46
  with torch.no_grad():
47
- with torch.autocast(device_type="cuda", dtype=torch.float16):
 
 
48
  model_batch = create_batch(input_image)
49
- model_batch = {k: v.cuda() for k, v in model_batch.items()}
50
- trimesh_mesh, _glob_dict = model.generate_mesh(model_batch, 1024)
 
 
51
  trimesh_mesh = trimesh_mesh[0]
52
 
53
  # Create new tmp file
54
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")
55
 
56
  trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
 
57
 
58
  print("Generation took:", time.time() - start, "s")
59
 
@@ -104,61 +125,6 @@ def remove_background(input_image: Image) -> Image:
104
  return rembg.remove(input_image, session=rembg_session)
105
 
106
 
107
- def resize_foreground(
108
- image: Image,
109
- ratio: float,
110
- ) -> Image:
111
- image = np.array(image)
112
- assert image.shape[-1] == 4
113
- alpha = np.where(image[..., 3] > 0)
114
- y1, y2, x1, x2 = (
115
- alpha[0].min(),
116
- alpha[0].max(),
117
- alpha[1].min(),
118
- alpha[1].max(),
119
- )
120
- # crop the foreground
121
- fg = image[y1:y2, x1:x2]
122
- # pad to square
123
- size = max(fg.shape[0], fg.shape[1])
124
- ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
125
- ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
126
- new_image = np.pad(
127
- fg,
128
- ((ph0, ph1), (pw0, pw1), (0, 0)),
129
- mode="constant",
130
- constant_values=((0, 0), (0, 0), (0, 0)),
131
- )
132
-
133
- # compute padding according to the ratio
134
- new_size = int(new_image.shape[0] / ratio)
135
- # pad to size, double side
136
- ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
137
- ph1, pw1 = new_size - size - ph0, new_size - size - pw0
138
- new_image = np.pad(
139
- new_image,
140
- ((ph0, ph1), (pw0, pw1), (0, 0)),
141
- mode="constant",
142
- constant_values=((0, 0), (0, 0), (0, 0)),
143
- )
144
- new_image = Image.fromarray(new_image, mode="RGBA").resize(
145
- (COND_WIDTH, COND_HEIGHT)
146
- )
147
- return new_image
148
-
149
-
150
- def square_crop(input_image: Image) -> Image:
151
- # Perform a center square crop
152
- min_size = min(input_image.size)
153
- left = (input_image.size[0] - min_size) // 2
154
- top = (input_image.size[1] - min_size) // 2
155
- right = (input_image.size[0] + min_size) // 2
156
- bottom = (input_image.size[1] + min_size) // 2
157
- return input_image.crop((left, top, right, bottom)).resize(
158
- (COND_WIDTH, COND_HEIGHT)
159
- )
160
-
161
-
162
  def show_mask_img(input_image: Image) -> Image:
163
  img_numpy = np.array(input_image)
164
  alpha = img_numpy[:, :, 3] / 255.0
@@ -167,9 +133,27 @@ def show_mask_img(input_image: Image) -> Image:
167
  return Image.fromarray(new_img.astype(np.uint8), mode="RGB")
168
 
169
 
170
- def run_button(run_btn, input_image, background_state, foreground_ratio):
 
 
 
 
 
 
 
 
171
  if run_btn == "Run":
172
- glb_file: str = run_model(background_state)
 
 
 
 
 
 
 
 
 
 
173
 
174
  return (
175
  gr.update(),
@@ -182,12 +166,13 @@ def run_button(run_btn, input_image, background_state, foreground_ratio):
182
  elif run_btn == "Remove Background":
183
  rem_removed = remove_background(input_image)
184
 
185
- sqr_crop = square_crop(rem_removed)
186
- fr_res = resize_foreground(sqr_crop, foreground_ratio)
 
187
 
188
  return (
189
  gr.update(value="Run", visible=True),
190
- sqr_crop,
191
  fr_res,
192
  gr.update(value=show_mask_img(fr_res), visible=True),
193
  gr.update(value=None, visible=False),
@@ -210,11 +195,12 @@ def requires_bg_remove(image, fr):
210
 
211
  if min_alpha == 0:
212
  print("Already has alpha")
213
- sqr_crop = square_crop(image)
214
- fr_res = resize_foreground(sqr_crop, fr)
 
215
  return (
216
  gr.update(value="Run", visible=True),
217
- sqr_crop,
218
  fr_res,
219
  gr.update(value=show_mask_img(fr_res), visible=True),
220
  gr.update(visible=False),
@@ -231,7 +217,9 @@ def requires_bg_remove(image, fr):
231
 
232
 
233
  def update_foreground_ratio(img_proc, fr):
234
- foreground_res = resize_foreground(img_proc, fr)
 
 
235
  return (
236
  foreground_res,
237
  gr.update(value=show_mask_img(foreground_res)),
@@ -250,7 +238,8 @@ with gr.Blocks() as demo:
250
  **Tips**
251
  1. If the image already has an alpha channel, you can skip the background removal step.
252
  2. You can adjust the foreground ratio to control the size of the foreground object. This can influence the shape
253
- 3. You can upload your own HDR environment map to light the 3D model.
 
254
  """)
255
  with gr.Row(variant="panel"):
256
  with gr.Column():
@@ -280,6 +269,30 @@ with gr.Blocks() as demo:
280
  outputs=[background_remove_state, preview_removal],
281
  )
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  run_btn = gr.Button("Run", variant="primary", visible=False)
284
 
285
  with gr.Column():
@@ -341,6 +354,9 @@ with gr.Blocks() as demo:
341
  input_img,
342
  background_remove_state,
343
  foreground_ratio,
 
 
 
344
  ],
345
  outputs=[
346
  run_btn,
@@ -352,4 +368,4 @@ with gr.Blocks() as demo:
352
  ],
353
  )
354
 
355
- demo.launch()
 
1
  import os
2
  import tempfile
3
  import time
4
+ from contextlib import nullcontext
5
  from functools import lru_cache
6
  from typing import Any
7
 
 
12
  from gradio_litmodel3d import LitModel3D
13
  from PIL import Image
14
 
15
+ os.system("USE_CUDA=1 pip install -vv --no-build-isolation ./texture_baker ./uv_unwrapper")
16
+
17
  import sf3d.utils as sf3d_utils
18
  from sf3d.system import SF3D
19
 
20
+ os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.environ.get("TMPDIR", "/tmp"), "gradio")
21
+
22
  rembg_session = rembg.new_session()
23
 
24
  COND_WIDTH = 512
 
33
  COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
34
  )
35
 
36
+ generated_files = []
37
+
38
+ # Delete previous gradio temp dir folder
39
+ if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
40
+ print(f"Deleting {os.environ['GRADIO_TEMP_DIR']}")
41
+ import shutil
42
+
43
+ shutil.rmtree(os.environ["GRADIO_TEMP_DIR"])
44
+
45
+ device = sf3d_utils.get_device()
46
 
47
  model = SF3D.from_pretrained(
48
  "stabilityai/stable-fast-3d",
49
  config_name="config.yaml",
50
  weight_name="model.safetensors",
51
  )
52
+ model.eval()
53
+ model = model.to(device)
54
 
55
  example_files = [
56
  os.path.join("demo_files/examples", f) for f in os.listdir("demo_files/examples")
57
  ]
58
 
59
 
60
+ def run_model(input_image, remesh_option, vertex_count, texture_size):
61
  start = time.time()
62
  with torch.no_grad():
63
+ with torch.autocast(
64
+ device_type=device, dtype=torch.bfloat16
65
+ ) if "cuda" in device else nullcontext():
66
  model_batch = create_batch(input_image)
67
+ model_batch = {k: v.to(device) for k, v in model_batch.items()}
68
+ trimesh_mesh, _glob_dict = model.generate_mesh(
69
+ model_batch, texture_size, remesh_option, vertex_count
70
+ )
71
  trimesh_mesh = trimesh_mesh[0]
72
 
73
  # Create new tmp file
74
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")
75
 
76
  trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
77
+ generated_files.append(tmp_file.name)
78
 
79
  print("Generation took:", time.time() - start, "s")
80
 
 
125
  return rembg.remove(input_image, session=rembg_session)
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def show_mask_img(input_image: Image) -> Image:
129
  img_numpy = np.array(input_image)
130
  alpha = img_numpy[:, :, 3] / 255.0
 
133
  return Image.fromarray(new_img.astype(np.uint8), mode="RGB")
134
 
135
 
136
+ def run_button(
137
+ run_btn,
138
+ input_image,
139
+ background_state,
140
+ foreground_ratio,
141
+ remesh_option,
142
+ vertex_count,
143
+ texture_size,
144
+ ):
145
  if run_btn == "Run":
146
+ if torch.cuda.is_available():
147
+ torch.cuda.reset_peak_memory_stats()
148
+ glb_file: str = run_model(
149
+ background_state, remesh_option.lower(), vertex_count, texture_size
150
+ )
151
+ if torch.cuda.is_available():
152
+ print("Peak Memory:", torch.cuda.max_memory_allocated() / 1024 / 1024, "MB")
153
+ elif torch.backends.mps.is_available():
154
+ print(
155
+ "Peak Memory:", torch.mps.driver_allocated_memory() / 1024 / 1024, "MB"
156
+ )
157
 
158
  return (
159
  gr.update(),
 
166
  elif run_btn == "Remove Background":
167
  rem_removed = remove_background(input_image)
168
 
169
+ fr_res = sf3d_utils.resize_foreground(
170
+ rem_removed, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
171
+ )
172
 
173
  return (
174
  gr.update(value="Run", visible=True),
175
+ rem_removed,
176
  fr_res,
177
  gr.update(value=show_mask_img(fr_res), visible=True),
178
  gr.update(value=None, visible=False),
 
195
 
196
  if min_alpha == 0:
197
  print("Already has alpha")
198
+ fr_res = sf3d_utils.resize_foreground(
199
+ image, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
200
+ )
201
  return (
202
  gr.update(value="Run", visible=True),
203
+ image,
204
  fr_res,
205
  gr.update(value=show_mask_img(fr_res), visible=True),
206
  gr.update(visible=False),
 
217
 
218
 
219
  def update_foreground_ratio(img_proc, fr):
220
+ foreground_res = sf3d_utils.resize_foreground(
221
+ img_proc, fr, out_size=(COND_WIDTH, COND_HEIGHT)
222
+ )
223
  return (
224
  foreground_res,
225
  gr.update(value=show_mask_img(foreground_res)),
 
238
  **Tips**
239
  1. If the image already has an alpha channel, you can skip the background removal step.
240
  2. You can adjust the foreground ratio to control the size of the foreground object. This can influence the shape
241
+ 3. You can select the remeshing option to control the mesh topology. This can introduce artifacts in the mesh on thin surfaces and should be turned off in such cases.
242
+ 4. You can upload your own HDR environment map to light the 3D model.
243
  """)
244
  with gr.Row(variant="panel"):
245
  with gr.Column():
 
269
  outputs=[background_remove_state, preview_removal],
270
  )
271
 
272
+ remesh_option = gr.Radio(
273
+ choices=["None", "Triangle", "Quad"],
274
+ label="Remeshing",
275
+ value="None",
276
+ visible=True,
277
+ )
278
+
279
+ vertex_count_slider = gr.Slider(
280
+ label="Target Vertex Count",
281
+ minimum=-1,
282
+ maximum=20000,
283
+ value=-1,
284
+ visible=True,
285
+ )
286
+
287
+ texture_size = gr.Slider(
288
+ label="Texture Size",
289
+ minimum=512,
290
+ maximum=2048,
291
+ value=1024,
292
+ step=256,
293
+ visible=True,
294
+ )
295
+
296
  run_btn = gr.Button("Run", variant="primary", visible=False)
297
 
298
  with gr.Column():
 
354
  input_img,
355
  background_remove_state,
356
  foreground_ratio,
357
+ remesh_option,
358
+ vertex_count_slider,
359
+ texture_size,
360
  ],
361
  outputs=[
362
  run_btn,
 
368
  ],
369
  )
370
 
371
+ demo.queue().launch(share=False)
requirements.txt CHANGED
@@ -1,13 +1,21 @@
1
- torch==2.1.2
2
- torchvision==0.16.2
 
 
3
  einops==0.7.0
4
  jaxtyping==0.2.31
5
  omegaconf==2.3.0
6
  transformers==4.42.3
7
- slangtorch==1.2.2
8
  open_clip_torch==2.24.0
9
  trimesh==4.4.1
10
  numpy==1.26.4
11
  huggingface-hub==0.23.4
12
- rembg[gpu]==2.0.57
 
 
 
 
13
  gradio-litmodel3d==0.0.1
 
 
 
 
1
+ wheel
2
+ setuptools==69.5.1
3
+ torch==2.5.1
4
+ torchvision==0.20.1
5
  einops==0.7.0
6
  jaxtyping==0.2.31
7
  omegaconf==2.3.0
8
  transformers==4.42.3
 
9
  open_clip_torch==2.24.0
10
  trimesh==4.4.1
11
  numpy==1.26.4
12
  huggingface-hub==0.23.4
13
+ rembg[gpu]==2.0.57; sys_platform != 'darwin'
14
+ rembg==2.0.57; sys_platform == 'darwin'
15
+ pynanoinstantmeshes==0.0.3
16
+ gpytoolbox==0.2.0
17
+ gradio==4.41.0
18
  gradio-litmodel3d==0.0.1
19
+ # (HF hack) These are installed at runtime in gradio_app.py
20
+ # ./texture_baker/
21
+ # ./uv_unwrapper/
ruff.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [lint]
2
+ ignore = ["F722"]
3
+ extend-select = ["I"]
run.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
from contextlib import nullcontext

import rembg
import torch
from PIL import Image
from tqdm import tqdm

from sf3d.system import SF3D
from sf3d.utils import get_device, remove_background, resize_foreground

if __name__ == "__main__":
    # CLI entry point: preprocess input image(s), run SF3D inference in
    # batches, and export the resulting mesh(es) as GLB files.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "image", type=str, nargs="+", help="Path to input image(s) or folder."
    )
    parser.add_argument(
        "--device",
        default=get_device(),
        type=str,
        help=f"Device to use. If no CUDA/MPS-compatible device is found, the baking will fail. Default: '{get_device()}'",
    )
    parser.add_argument(
        "--pretrained-model",
        default="stabilityai/stable-fast-3d",
        type=str,
        help="Path to the pretrained model. Could be either a huggingface model id is or a local path. Default: 'stabilityai/stable-fast-3d'",
    )
    parser.add_argument(
        "--foreground-ratio",
        default=0.85,
        type=float,
        help="Ratio of the foreground size to the image size. Only used when --no-remove-bg is not specified. Default: 0.85",
    )
    parser.add_argument(
        "--output-dir",
        default="output/",
        type=str,
        help="Output directory to save the results. Default: 'output/'",
    )
    parser.add_argument(
        "--texture-resolution",
        default=1024,
        type=int,
        help="Texture atlas resolution. Default: 1024",
    )
    parser.add_argument(
        "--remesh_option",
        choices=["none", "triangle", "quad"],
        default="none",
        help="Remeshing option",
    )
    parser.add_argument(
        "--target_vertex_count",
        type=int,
        help="Target vertex count. -1 does not perform a reduction.",
        default=-1,
    )
    parser.add_argument(
        "--batch_size", default=1, type=int, help="Batch size for inference"
    )
    args = parser.parse_args()

    # Accept only device strings containing cuda, mps, or cpu.
    devices = ["cuda", "mps", "cpu"]
    if not any(args.device in device for device in devices):
        raise ValueError("Invalid device. Use cuda, mps or cpu")

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Fall back to CPU when neither CUDA nor MPS is actually available.
    device = args.device
    if not (torch.cuda.is_available() or torch.backends.mps.is_available()):
        device = "cpu"

    print("Device used: ", device)

    model = SF3D.from_pretrained(
        args.pretrained_model,
        config_name="config.yaml",
        weight_name="model.safetensors",
    )
    model.to(device)
    model.eval()

    rembg_session = rembg.new_session()
    images = []
    idx = 0
    for image_path in args.image:

        def handle_image(image_path, idx):
            # Strip the background, recenter the foreground, save a copy of
            # the preprocessed input under output_dir/<idx>/, and queue it.
            image = remove_background(
                Image.open(image_path).convert("RGBA"), rembg_session
            )
            image = resize_foreground(image, args.foreground_ratio)
            os.makedirs(os.path.join(output_dir, str(idx)), exist_ok=True)
            image.save(os.path.join(output_dir, str(idx), "input.png"))
            images.append(image)

        if os.path.isdir(image_path):
            # Expand a directory argument into its contained image files.
            image_paths = [
                os.path.join(image_path, f)
                for f in os.listdir(image_path)
                if f.endswith((".png", ".jpg", ".jpeg"))
            ]
            for image_path in image_paths:
                handle_image(image_path, idx)
                idx += 1
        else:
            handle_image(image_path, idx)
            idx += 1

    for i in tqdm(range(0, len(images), args.batch_size)):
        image = images[i : i + args.batch_size]
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
        # bfloat16 autocast only on CUDA; other devices run in full precision.
        with torch.no_grad():
            with torch.autocast(
                device_type=device, dtype=torch.bfloat16
            ) if "cuda" in device else nullcontext():
                mesh, glob_dict = model.run_image(
                    image,
                    bake_resolution=args.texture_resolution,
                    remesh=args.remesh_option,
                    vertex_count=args.target_vertex_count,
                )
        if torch.cuda.is_available():
            print("Peak Memory:", torch.cuda.max_memory_allocated() / 1024 / 1024, "MB")
        elif torch.backends.mps.is_available():
            print(
                "Peak Memory:", torch.mps.driver_allocated_memory() / 1024 / 1024, "MB"
            )

        if len(image) == 1:
            # Single-image batch: run_image returns one mesh object here —
            # NOTE(review): presumed from this branch; confirm against SF3D.run_image.
            out_mesh_path = os.path.join(output_dir, str(i), "mesh.glb")
            mesh.export(out_mesh_path, include_normals=True)
        else:
            for j in range(len(mesh)):
                out_mesh_path = os.path.join(output_dir, str(i + j), "mesh.glb")
                mesh[j].export(out_mesh_path, include_normals=True)
sf3d/models/image_estimator/clip_based_estimator.py CHANGED
@@ -95,7 +95,7 @@ class ClipBasedHeadEstimator(BaseModule):
95
  # Run the model
96
  # Resize cond_image to 224
97
  cond_image = nn.functional.interpolate(
98
- cond_image.flatten(0, 1).permute(0, 3, 1, 2),
99
  size=(224, 224),
100
  mode="bilinear",
101
  align_corners=False,
 
95
  # Run the model
96
  # Resize cond_image to 224
97
  cond_image = nn.functional.interpolate(
98
+ cond_image.flatten(0, 1).permute(0, 3, 1, 2).contiguous(),
99
  size=(224, 224),
100
  mode="bilinear",
101
  align_corners=False,
sf3d/models/mesh.py CHANGED
@@ -1,15 +1,30 @@
1
  from __future__ import annotations
2
 
 
3
  from typing import Any, Dict, Optional
4
 
 
 
 
5
  import torch
6
  import torch.nn.functional as F
 
7
  from jaxtyping import Float, Integer
8
  from torch import Tensor
9
 
10
- from sf3d.box_uv_unwrap import box_projection_uv_unwrap
11
  from sf3d.models.utils import dot
12
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  class Mesh:
15
  def __init__(
@@ -25,6 +40,8 @@ class Mesh:
25
  for k, v in kwargs.items():
26
  self.add_extra(k, v)
27
 
 
 
28
  def add_extra(self, k, v) -> None:
29
  self.extras[k] = v
30
 
@@ -131,12 +148,112 @@ class Mesh:
131
 
132
  return tangents
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  @torch.no_grad()
135
  def unwrap_uv(
136
  self,
137
  island_padding: float = 0.02,
138
  ) -> Mesh:
139
- uv, indices = box_projection_uv_unwrap(
140
  self.v_pos, self.v_nrm, self.t_pos_idx, island_padding
141
  )
142
 
 
1
  from __future__ import annotations
2
 
3
+ import math
4
  from typing import Any, Dict, Optional
5
 
6
+ import gpytoolbox
7
+ import numpy as np
8
+ import pynanoinstantmeshes
9
  import torch
10
  import torch.nn.functional as F
11
+ import trimesh
12
  from jaxtyping import Float, Integer
13
  from torch import Tensor
14
 
 
15
  from sf3d.models.utils import dot
16
 
17
+ try:
18
+ from uv_unwrapper import Unwrapper
19
+ except ImportError:
20
+ import logging
21
+
22
+ logging.warning(
23
+ "Could not import uv_unwrapper. Please install it via `pip install uv_unwrapper/`"
24
+ )
25
+ # Exit early to avoid further errors
26
+ raise ImportError("uv_unwrapper not found")
27
+
28
 
29
  class Mesh:
30
  def __init__(
 
40
  for k, v in kwargs.items():
41
  self.add_extra(k, v)
42
 
43
+ self.unwrapper = Unwrapper()
44
+
45
  def add_extra(self, k, v) -> None:
46
  self.extras[k] = v
47
 
 
148
 
149
  return tangents
150
 
151
+ def quad_remesh(
152
+ self,
153
+ quad_vertex_count: int = -1,
154
+ quad_rosy: int = 4,
155
+ quad_crease_angle: float = -1.0,
156
+ quad_smooth_iter: int = 2,
157
+ quad_align_to_boundaries: bool = False,
158
+ ) -> Mesh:
159
+ if quad_vertex_count < 0:
160
+ quad_vertex_count = self.v_pos.shape[0]
161
+ v_pos = self.v_pos.detach().cpu().numpy().astype(np.float32)
162
+ t_pos_idx = self.t_pos_idx.detach().cpu().numpy().astype(np.uint32)
163
+
164
+ new_vert, new_faces = pynanoinstantmeshes.remesh(
165
+ v_pos,
166
+ t_pos_idx,
167
+ quad_vertex_count // 4,
168
+ rosy=quad_rosy,
169
+ posy=4,
170
+ creaseAngle=quad_crease_angle,
171
+ align_to_boundaries=quad_align_to_boundaries,
172
+ smooth_iter=quad_smooth_iter,
173
+ deterministic=False,
174
+ )
175
+
176
+ # Briefly load in trimesh
177
+ mesh = trimesh.Trimesh(vertices=new_vert, faces=new_faces.astype(np.int32))
178
+
179
+ v_pos = torch.from_numpy(mesh.vertices).to(self.v_pos).contiguous()
180
+ t_pos_idx = torch.from_numpy(mesh.faces).to(self.t_pos_idx).contiguous()
181
+
182
+ # Create new mesh
183
+ return Mesh(v_pos, t_pos_idx)
184
+
185
+ def triangle_remesh(
186
+ self,
187
+ triangle_average_edge_length_multiplier: Optional[float] = None,
188
+ triangle_remesh_steps: int = 10,
189
+ triangle_vertex_count=-1,
190
+ ):
191
+ if triangle_vertex_count > 0:
192
+ reduction = triangle_vertex_count / self.v_pos.shape[0]
193
+ print("Triangle reduction:", reduction)
194
+ v_pos = self.v_pos.detach().cpu().numpy().astype(np.float32)
195
+ t_pos_idx = self.t_pos_idx.detach().cpu().numpy().astype(np.int32)
196
+ if reduction > 1.0:
197
+ subdivide_iters = int(math.ceil(math.log(reduction) / math.log(2)))
198
+ print("Subdivide iters:", subdivide_iters)
199
+ v_pos, t_pos_idx = gpytoolbox.subdivide(
200
+ v_pos,
201
+ t_pos_idx,
202
+ iters=subdivide_iters,
203
+ )
204
+ reduction = triangle_vertex_count / v_pos.shape[0]
205
+
206
+ # Simplify
207
+ points_out, faces_out, _, _ = gpytoolbox.decimate(
208
+ v_pos,
209
+ t_pos_idx,
210
+ face_ratio=reduction,
211
+ )
212
+
213
+ # Convert back to torch
214
+ self.v_pos = torch.from_numpy(points_out).to(self.v_pos)
215
+ self.t_pos_idx = torch.from_numpy(faces_out).to(self.t_pos_idx)
216
+ self._edges = None
217
+ triangle_average_edge_length_multiplier = None
218
+
219
+ edges = self.edges
220
+ if triangle_average_edge_length_multiplier is None:
221
+ h = None
222
+ else:
223
+ h = float(
224
+ torch.linalg.norm(
225
+ self.v_pos[edges[:, 0]] - self.v_pos[edges[:, 1]], dim=1
226
+ )
227
+ .mean()
228
+ .item()
229
+ * triangle_average_edge_length_multiplier
230
+ )
231
+
232
+ # Convert to numpy
233
+ v_pos = self.v_pos.detach().cpu().numpy().astype(np.float64)
234
+ t_pos_idx = self.t_pos_idx.detach().cpu().numpy().astype(np.int32)
235
+
236
+ # Remesh
237
+ v_remesh, f_remesh = gpytoolbox.remesh_botsch(
238
+ v_pos,
239
+ t_pos_idx,
240
+ triangle_remesh_steps,
241
+ h,
242
+ )
243
+
244
+ # Convert back to torch
245
+ v_pos = torch.from_numpy(v_remesh).to(self.v_pos).contiguous()
246
+ t_pos_idx = torch.from_numpy(f_remesh).to(self.t_pos_idx).contiguous()
247
+
248
+ # Create new mesh
249
+ return Mesh(v_pos, t_pos_idx)
250
+
251
  @torch.no_grad()
252
  def unwrap_uv(
253
  self,
254
  island_padding: float = 0.02,
255
  ) -> Mesh:
256
+ uv, indices = self.unwrapper(
257
  self.v_pos, self.v_nrm, self.t_pos_idx, island_padding
258
  )
259
 
sf3d/models/network.py CHANGED
@@ -7,10 +7,23 @@ import torch.nn.functional as F
7
  from einops import rearrange
8
  from jaxtyping import Float
9
  from torch import Tensor
 
10
  from torch.autograd import Function
11
- from torch.cuda.amp import custom_bwd, custom_fwd
12
 
13
  from sf3d.models.utils import BaseModule, normalize
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  class PixelShuffleUpsampleNetwork(BaseModule):
@@ -65,13 +78,18 @@ class _TruncExp(Function): # pylint: disable=abstract-method
65
  # Implementation from torch-ngp:
66
  # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
67
  @staticmethod
68
- @custom_fwd(cast_inputs=torch.float32)
 
 
 
 
 
69
  def forward(ctx, x): # pylint: disable=arguments-differ
70
  ctx.save_for_backward(x)
71
  return torch.exp(x)
72
 
73
  @staticmethod
74
- @custom_bwd
75
  def backward(ctx, g): # pylint: disable=arguments-differ
76
  x = ctx.saved_tensors[0]
77
  return g * torch.exp(torch.clamp(x, max=15))
 
7
  from einops import rearrange
8
  from jaxtyping import Float
9
  from torch import Tensor
10
+ from torch.amp import custom_bwd, custom_fwd
11
  from torch.autograd import Function
 
12
 
13
  from sf3d.models.utils import BaseModule, normalize
14
+ from sf3d.utils import get_device
15
+
16
+
17
+ def conditional_decorator(decorator_with_args, condition, *args, **kwargs):
18
+ def wrapper(fn):
19
+ if condition:
20
+ if len(kwargs) == 0:
21
+ return decorator_with_args
22
+ return decorator_with_args(*args, **kwargs)(fn)
23
+ else:
24
+ return fn
25
+
26
+ return wrapper
27
 
28
 
29
  class PixelShuffleUpsampleNetwork(BaseModule):
 
78
  # Implementation from torch-ngp:
79
  # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
80
  @staticmethod
81
+ @conditional_decorator(
82
+ custom_fwd,
83
+ "cuda" in get_device(),
84
+ cast_inputs=torch.float32,
85
+ device_type="cuda",
86
+ )
87
  def forward(ctx, x): # pylint: disable=arguments-differ
88
  ctx.save_for_backward(x)
89
  return torch.exp(x)
90
 
91
  @staticmethod
92
+ @conditional_decorator(custom_bwd, "cuda" in get_device())
93
  def backward(ctx, g): # pylint: disable=arguments-differ
94
  x = ctx.saved_tensors[0]
95
  return g * torch.exp(torch.clamp(x, max=15))
sf3d/models/utils.py CHANGED
@@ -1,6 +1,5 @@
1
  import dataclasses
2
  import importlib
3
- import math
4
  from dataclasses import dataclass
5
  from typing import Any, List, Optional, Tuple, Union
6
 
@@ -9,7 +8,7 @@ import PIL
9
  import torch
10
  import torch.nn as nn
11
  import torch.nn.functional as F
12
- from jaxtyping import Bool, Float, Int, Num
13
  from omegaconf import DictConfig, OmegaConf
14
  from torch import Tensor
15
 
@@ -77,61 +76,6 @@ def normalize(x, dim=-1, eps=None):
77
  return F.normalize(x, dim=dim, p=2, eps=eps)
78
 
79
 
80
- def tri_winding(tri: Float[Tensor, "*B 3 2"]) -> Float[Tensor, "*B 3 3"]:
81
- # One pad for determinant
82
- tri_sq = F.pad(tri, (0, 1), "constant", 1.0)
83
- det_tri = torch.det(tri_sq)
84
- tri_rev = torch.cat(
85
- (tri_sq[..., 0:1, :], tri_sq[..., 2:3, :], tri_sq[..., 1:2, :]), -2
86
- )
87
- tri_sq[det_tri < 0] = tri_rev[det_tri < 0]
88
- return tri_sq
89
-
90
-
91
- def triangle_intersection_2d(
92
- t1: Float[Tensor, "*B 3 2"],
93
- t2: Float[Tensor, "*B 3 2"],
94
- eps=1e-12,
95
- ) -> Float[Tensor, "*B"]: # noqa: F821
96
- """Returns True if triangles collide, False otherwise"""
97
-
98
- def chk_edge(x: Float[Tensor, "*B 3 3"]) -> Bool[Tensor, "*B"]: # noqa: F821
99
- logdetx = torch.logdet(x.double())
100
- if eps is None:
101
- return ~torch.isfinite(logdetx)
102
- return ~(torch.isfinite(logdetx) & (logdetx > math.log(eps)))
103
-
104
- t1s = tri_winding(t1)
105
- t2s = tri_winding(t2)
106
-
107
- # Assume the triangles do not collide in the begging
108
- ret = torch.zeros(t1.shape[0], dtype=torch.bool, device=t1.device)
109
- for i in range(3):
110
- edge = torch.roll(t1s, i, dims=1)[:, :2, :]
111
- # Check if all points of triangle 2 lay on the external side of edge E.
112
- # If this is the case the triangle do not collide
113
- upd = (
114
- chk_edge(torch.cat((edge, t2s[:, 0:1]), 1))
115
- & chk_edge(torch.cat((edge, t2s[:, 1:2]), 1))
116
- & chk_edge(torch.cat((edge, t2s[:, 2:3]), 1))
117
- )
118
- # Here no collision is still True due to inversion
119
- ret = ret | upd
120
-
121
- for i in range(3):
122
- edge = torch.roll(t2s, i, dims=1)[:, :2, :]
123
-
124
- upd = (
125
- chk_edge(torch.cat((edge, t1s[:, 0:1]), 1))
126
- & chk_edge(torch.cat((edge, t1s[:, 1:2]), 1))
127
- & chk_edge(torch.cat((edge, t1s[:, 2:3]), 1))
128
- )
129
- # Here no collision is still True due to inversion
130
- ret = ret | upd
131
-
132
- return ~ret # Do the inversion
133
-
134
-
135
  ValidScale = Union[Tuple[float, float], Num[Tensor, "2 D"]]
136
 
137
 
 
1
  import dataclasses
2
  import importlib
 
3
  from dataclasses import dataclass
4
  from typing import Any, List, Optional, Tuple, Union
5
 
 
8
  import torch
9
  import torch.nn as nn
10
  import torch.nn.functional as F
11
+ from jaxtyping import Float, Int, Num
12
  from omegaconf import DictConfig, OmegaConf
13
  from torch import Tensor
14
 
 
76
  return F.normalize(x, dim=dim, p=2, eps=eps)
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  ValidScale = Union[Tuple[float, float], Num[Tensor, "2 D"]]
80
 
81
 
sf3d/system.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
 
2
  from dataclasses import dataclass, field
3
- from typing import Any, List, Optional, Tuple
4
 
5
  import numpy as np
6
  import torch
@@ -21,15 +22,23 @@ from sf3d.models.utils import (
21
  ImageProcessor,
22
  convert_data,
23
  dilate_fill,
24
- dot,
25
  find_class,
26
  float32_to_uint8_np,
27
  normalize,
28
  scale_tensor,
29
  )
30
- from sf3d.utils import create_intrinsic_from_fov_deg, default_cond_c2w
31
 
32
- from .texture_baker import TextureBaker
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  class SF3D(BaseModule):
@@ -206,6 +215,7 @@ class SF3D(BaseModule):
206
  batch["c2w_cond"] = batch["c2w_cond"].unsqueeze(1)
207
  batch["intrinsic_cond"] = batch["intrinsic_cond"].unsqueeze(1)
208
  batch["intrinsic_normed_cond"] = batch["intrinsic_normed_cond"].unsqueeze(1)
 
209
  batch_size, n_input_views = batch["rgb_cond"].shape[:2]
210
 
211
  camera_embeds: Optional[Float[Tensor, "B Nv Cc"]]
@@ -234,10 +244,54 @@ class SF3D(BaseModule):
234
 
235
  def run_image(
236
  self,
237
- image: Image,
238
  bake_resolution: int,
 
 
239
  estimate_illumination: bool = False,
240
- ) -> Tuple[trimesh.Trimesh, dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  if image.mode != "RGBA":
242
  raise ValueError("Image must be in RGBA mode")
243
  img_cond = (
@@ -258,30 +312,14 @@ class SF3D(BaseModule):
258
  mask_cond,
259
  )
260
 
261
- c2w_cond = default_cond_c2w(self.cfg.default_distance).to(self.device)
262
- intrinsic, intrinsic_normed_cond = create_intrinsic_from_fov_deg(
263
- self.cfg.default_fovy_deg,
264
- self.cfg.cond_image_size,
265
- self.cfg.cond_image_size,
266
- )
267
-
268
- batch = {
269
- "rgb_cond": rgb_cond,
270
- "mask_cond": mask_cond,
271
- "c2w_cond": c2w_cond.unsqueeze(0),
272
- "intrinsic_cond": intrinsic.to(self.device).unsqueeze(0),
273
- "intrinsic_normed_cond": intrinsic_normed_cond.to(self.device).unsqueeze(0),
274
- }
275
-
276
- meshes, global_dict = self.generate_mesh(
277
- batch, bake_resolution, estimate_illumination
278
- )
279
- return meshes[0], global_dict
280
 
281
  def generate_mesh(
282
  self,
283
  batch,
284
  bake_resolution: int,
 
 
285
  estimate_illumination: bool = False,
286
  ) -> Tuple[List[trimesh.Trimesh], dict[str, Any]]:
287
  batch["rgb_cond"] = self.image_processor(
@@ -300,8 +338,11 @@ class SF3D(BaseModule):
300
  if self.global_estimator is not None and estimate_illumination:
301
  global_dict.update(self.global_estimator(non_postprocessed_codes))
302
 
 
303
  with torch.no_grad():
304
- with torch.autocast(device_type="cuda", enabled=False):
 
 
305
  meshes = self.triplane_to_meshes(scene_codes)
306
 
307
  rets = []
@@ -311,6 +352,17 @@ class SF3D(BaseModule):
311
  rets.append(trimesh.Trimesh())
312
  continue
313
 
 
 
 
 
 
 
 
 
 
 
 
314
  mesh.unwrap_uv()
315
 
316
  # Build textures
@@ -323,7 +375,6 @@ class SF3D(BaseModule):
323
  mesh.v_pos,
324
  rast,
325
  mesh.t_pos_idx,
326
- mesh.v_tex,
327
  )
328
  gb_pos = pos_bake[bake_mask]
329
 
@@ -336,7 +387,6 @@ class SF3D(BaseModule):
336
  mesh.v_nrm,
337
  rast,
338
  mesh.t_pos_idx,
339
- mesh.v_tex,
340
  )
341
  gb_nrm = F.normalize(nrm[bake_mask], dim=-1)
342
  decoded["normal"] = gb_nrm
@@ -377,29 +427,28 @@ class SF3D(BaseModule):
377
  mesh.v_tng,
378
  rast,
379
  mesh.t_pos_idx,
380
- mesh.v_tex,
381
  )
382
  gb_tng = tng[bake_mask]
383
  gb_tng = F.normalize(gb_tng, dim=-1)
384
  gb_btng = F.normalize(
385
- torch.cross(gb_tng, gb_nrm, dim=-1), dim=-1
386
  )
387
  normal = F.normalize(mat_out["normal"], dim=-1)
388
 
389
- bump = torch.cat(
390
- # Check if we have to flip some things
391
- (
392
- dot(normal, gb_tng),
393
- dot(normal, gb_btng),
394
- dot(normal, gb_nrm).clip(
395
- 0.3, 1
396
- ), # Never go below 0.3. This would indicate a flipped (or close to one) normal
397
- ),
398
- -1,
 
399
  )
400
- bump = (bump * 0.5 + 0.5).clamp(0, 1)
401
 
402
- f[bake_mask] = bump.view(-1, 3)
403
  mat_out["bump"] = f
404
  else:
405
  f[bake_mask] = v.view(-1, v.shape[-1])
@@ -410,12 +459,13 @@ class SF3D(BaseModule):
410
  return arr
411
  return (
412
  dilate_fill(
413
- arr.permute(2, 0, 1)[None, ...],
414
  bake_mask.unsqueeze(0).unsqueeze(0),
415
  iterations=bake_resolution // 150,
416
  )
417
  .squeeze(0)
418
  .permute(1, 2, 0)
 
419
  )
420
 
421
  verts_np = convert_data(mesh.v_pos)
 
1
  import os
2
+ from contextlib import nullcontext
3
  from dataclasses import dataclass, field
4
+ from typing import Any, List, Literal, Optional, Tuple, Union
5
 
6
  import numpy as np
7
  import torch
 
22
  ImageProcessor,
23
  convert_data,
24
  dilate_fill,
 
25
  find_class,
26
  float32_to_uint8_np,
27
  normalize,
28
  scale_tensor,
29
  )
30
+ from sf3d.utils import create_intrinsic_from_fov_deg, default_cond_c2w, get_device
31
 
32
+ try:
33
+ from texture_baker import TextureBaker
34
+ except ImportError:
35
+ import logging
36
+
37
+ logging.warning(
38
+ "Could not import texture_baker. Please install it via `pip install texture-baker/`"
39
+ )
40
+ # Exit early to avoid further errors
41
+ raise ImportError("texture_baker not found")
42
 
43
 
44
  class SF3D(BaseModule):
 
215
  batch["c2w_cond"] = batch["c2w_cond"].unsqueeze(1)
216
  batch["intrinsic_cond"] = batch["intrinsic_cond"].unsqueeze(1)
217
  batch["intrinsic_normed_cond"] = batch["intrinsic_normed_cond"].unsqueeze(1)
218
+
219
  batch_size, n_input_views = batch["rgb_cond"].shape[:2]
220
 
221
  camera_embeds: Optional[Float[Tensor, "B Nv Cc"]]
 
244
 
245
  def run_image(
246
  self,
247
+ image: Union[Image.Image, List[Image.Image]],
248
  bake_resolution: int,
249
+ remesh: Literal["none", "triangle", "quad"] = "none",
250
+ vertex_count: int = -1,
251
  estimate_illumination: bool = False,
252
+ ) -> Tuple[Union[trimesh.Trimesh, List[trimesh.Trimesh]], dict[str, Any]]:
253
+ if isinstance(image, list):
254
+ rgb_cond = []
255
+ mask_cond = []
256
+ for img in image:
257
+ mask, rgb = self.prepare_image(img)
258
+ mask_cond.append(mask)
259
+ rgb_cond.append(rgb)
260
+ rgb_cond = torch.stack(rgb_cond, 0)
261
+ mask_cond = torch.stack(mask_cond, 0)
262
+ batch_size = rgb_cond.shape[0]
263
+ else:
264
+ mask_cond, rgb_cond = self.prepare_image(image)
265
+ batch_size = 1
266
+
267
+ c2w_cond = default_cond_c2w(self.cfg.default_distance).to(self.device)
268
+ intrinsic, intrinsic_normed_cond = create_intrinsic_from_fov_deg(
269
+ self.cfg.default_fovy_deg,
270
+ self.cfg.cond_image_size,
271
+ self.cfg.cond_image_size,
272
+ )
273
+
274
+ batch = {
275
+ "rgb_cond": rgb_cond,
276
+ "mask_cond": mask_cond,
277
+ "c2w_cond": c2w_cond.view(1, 1, 4, 4).repeat(batch_size, 1, 1, 1),
278
+ "intrinsic_cond": intrinsic.to(self.device)
279
+ .view(1, 1, 3, 3)
280
+ .repeat(batch_size, 1, 1, 1),
281
+ "intrinsic_normed_cond": intrinsic_normed_cond.to(self.device)
282
+ .view(1, 1, 3, 3)
283
+ .repeat(batch_size, 1, 1, 1),
284
+ }
285
+
286
+ meshes, global_dict = self.generate_mesh(
287
+ batch, bake_resolution, remesh, vertex_count, estimate_illumination
288
+ )
289
+ if batch_size == 1:
290
+ return meshes[0], global_dict
291
+ else:
292
+ return meshes, global_dict
293
+
294
+ def prepare_image(self, image):
295
  if image.mode != "RGBA":
296
  raise ValueError("Image must be in RGBA mode")
297
  img_cond = (
 
312
  mask_cond,
313
  )
314
 
315
+ return mask_cond, rgb_cond
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  def generate_mesh(
318
  self,
319
  batch,
320
  bake_resolution: int,
321
+ remesh: Literal["none", "triangle", "quad"] = "none",
322
+ vertex_count: int = -1,
323
  estimate_illumination: bool = False,
324
  ) -> Tuple[List[trimesh.Trimesh], dict[str, Any]]:
325
  batch["rgb_cond"] = self.image_processor(
 
338
  if self.global_estimator is not None and estimate_illumination:
339
  global_dict.update(self.global_estimator(non_postprocessed_codes))
340
 
341
+ device = get_device()
342
  with torch.no_grad():
343
+ with torch.autocast(
344
+ device_type=device, enabled=False
345
+ ) if "cuda" in device else nullcontext():
346
  meshes = self.triplane_to_meshes(scene_codes)
347
 
348
  rets = []
 
352
  rets.append(trimesh.Trimesh())
353
  continue
354
 
355
+ if remesh == "triangle":
356
+ mesh = mesh.triangle_remesh(triangle_vertex_count=vertex_count)
357
+ elif remesh == "quad":
358
+ mesh = mesh.quad_remesh(quad_vertex_count=vertex_count)
359
+ else:
360
+ if vertex_count > 0:
361
+ print(
362
+ "Warning: vertex_count is ignored when remesh is none"
363
+ )
364
+
365
+ print("After Remesh", mesh.v_pos.shape[0], mesh.t_pos_idx.shape[0])
366
  mesh.unwrap_uv()
367
 
368
  # Build textures
 
375
  mesh.v_pos,
376
  rast,
377
  mesh.t_pos_idx,
 
378
  )
379
  gb_pos = pos_bake[bake_mask]
380
 
 
387
  mesh.v_nrm,
388
  rast,
389
  mesh.t_pos_idx,
 
390
  )
391
  gb_nrm = F.normalize(nrm[bake_mask], dim=-1)
392
  decoded["normal"] = gb_nrm
 
427
  mesh.v_tng,
428
  rast,
429
  mesh.t_pos_idx,
 
430
  )
431
  gb_tng = tng[bake_mask]
432
  gb_tng = F.normalize(gb_tng, dim=-1)
433
  gb_btng = F.normalize(
434
+ torch.cross(gb_nrm, gb_tng, dim=-1), dim=-1
435
  )
436
  normal = F.normalize(mat_out["normal"], dim=-1)
437
 
438
+ # Create tangent space matrix and transform normal
439
+ tangent_matrix = torch.stack(
440
+ [gb_tng, gb_btng, gb_nrm], dim=-1
441
+ )
442
+ normal_tangent = torch.bmm(
443
+ tangent_matrix.transpose(1, 2), normal.unsqueeze(-1)
444
+ ).squeeze(-1)
445
+
446
+ # Convert from [-1,1] to [0,1] range for storage
447
+ normal_tangent = (normal_tangent * 0.5 + 0.5).clamp(
448
+ 0, 1
449
  )
 
450
 
451
+ f[bake_mask] = normal_tangent.view(-1, 3)
452
  mat_out["bump"] = f
453
  else:
454
  f[bake_mask] = v.view(-1, v.shape[-1])
 
459
  return arr
460
  return (
461
  dilate_fill(
462
+ arr.permute(2, 0, 1)[None, ...].contiguous(),
463
  bake_mask.unsqueeze(0).unsqueeze(0),
464
  iterations=bake_resolution // 150,
465
  )
466
  .squeeze(0)
467
  .permute(1, 2, 0)
468
+ .contiguous()
469
  )
470
 
471
  verts_np = convert_data(mesh.v_pos)
sf3d/utils.py CHANGED
@@ -1,13 +1,27 @@
1
- from typing import Any
 
2
 
3
  import numpy as np
4
  import rembg
5
  import torch
 
6
  from PIL import Image
7
 
8
  import sf3d.models.utils as sf3d_utils
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def create_intrinsic_from_fov_deg(fov_deg: float, cond_height: int, cond_width: int):
12
  intrinsic = sf3d_utils.get_intrinsic_from_fov(
13
  np.deg2rad(fov_deg),
@@ -50,42 +64,42 @@ def remove_background(
50
  return image
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def resize_foreground(
54
- image: Image,
55
  ratio: float,
 
56
  ) -> Image:
57
- image = np.array(image)
58
- assert image.shape[-1] == 4
59
- alpha = np.where(image[..., 3] > 0)
60
- y1, y2, x1, x2 = (
61
- alpha[0].min(),
62
- alpha[0].max(),
63
- alpha[1].min(),
64
- alpha[1].max(),
65
- )
66
- # crop the foreground
67
- fg = image[y1:y2, x1:x2]
68
- # pad to square
69
- size = max(fg.shape[0], fg.shape[1])
70
- ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
71
- ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
72
- new_image = np.pad(
73
- fg,
74
- ((ph0, ph1), (pw0, pw1), (0, 0)),
75
- mode="constant",
76
- constant_values=((0, 0), (0, 0), (0, 0)),
77
- )
78
 
79
- # compute padding according to the ratio
80
- new_size = int(new_image.shape[0] / ratio)
81
- # pad to size, double side
82
- ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
83
- ph1, pw1 = new_size - size - ph0, new_size - size - pw0
84
- new_image = np.pad(
85
- new_image,
86
- ((ph0, ph1), (pw0, pw1), (0, 0)),
87
- mode="constant",
88
- constant_values=((0, 0), (0, 0), (0, 0)),
89
  )
90
- new_image = Image.fromarray(new_image, mode="RGBA")
 
 
91
  return new_image
 
1
+ import os
2
+ from typing import Any, Union
3
 
4
  import numpy as np
5
  import rembg
6
  import torch
7
+ import torchvision.transforms.functional as torchvision_F
8
  from PIL import Image
9
 
10
  import sf3d.models.utils as sf3d_utils
11
 
12
 
13
+ def get_device():
14
+ if os.environ.get("SF3D_USE_CPU", "0") == "1":
15
+ return "cpu"
16
+
17
+ device = "cpu"
18
+ if torch.cuda.is_available():
19
+ device = "cuda"
20
+ elif torch.backends.mps.is_available():
21
+ device = "mps"
22
+ return device
23
+
24
+
25
  def create_intrinsic_from_fov_deg(fov_deg: float, cond_height: int, cond_width: int):
26
  intrinsic = sf3d_utils.get_intrinsic_from_fov(
27
  np.deg2rad(fov_deg),
 
64
  return image
65
 
66
 
67
+ def get_1d_bounds(arr):
68
+ nz = np.flatnonzero(arr)
69
+ return nz[0], nz[-1]
70
+
71
+
72
+ def get_bbox_from_mask(mask, thr=0.5):
73
+ masks_for_box = (mask > thr).astype(np.float32)
74
+ assert masks_for_box.sum() > 0, "Empty mask!"
75
+ x0, x1 = get_1d_bounds(masks_for_box.sum(axis=-2))
76
+ y0, y1 = get_1d_bounds(masks_for_box.sum(axis=-1))
77
+ return x0, y0, x1, y1
78
+
79
+
80
  def resize_foreground(
81
+ image: Union[Image.Image, np.ndarray],
82
  ratio: float,
83
+ out_size=None,
84
  ) -> Image:
85
+ if isinstance(image, np.ndarray):
86
+ image = Image.fromarray(image, mode="RGBA")
87
+ assert image.mode == "RGBA"
88
+ # Get bounding box
89
+ mask_np = np.array(image)[:, :, -1]
90
+ x1, y1, x2, y2 = get_bbox_from_mask(mask_np, thr=0.5)
91
+ h, w = y2 - y1, x2 - x1
92
+ yc, xc = (y1 + y2) / 2, (x1 + x2) / 2
93
+ scale = max(h, w) / ratio
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ new_image = torchvision_F.crop(
96
+ image,
97
+ top=int(yc - scale / 2),
98
+ left=int(xc - scale / 2),
99
+ height=int(scale),
100
+ width=int(scale),
 
 
 
 
101
  )
102
+ if out_size is not None:
103
+ new_image = new_image.resize(out_size)
104
+
105
  return new_image
texture_baker/README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Texture baker
2
+
3
+ Small texture baker which rasterizes barycentric coordinates to a tensor.
4
+ It also implements an interpolation module which can be used to bake attributes to textures then.
5
+
6
+ ## Usage
7
+
8
+ The baker can quickly bake vertex attributes to the a texture atlas based on the UV coordinates.
9
+ It supports baking on the CPU and GPU.
10
+
11
+ ```python
12
+ from texture_baker import TextureBaker
13
+
14
+ mesh = ...
15
+ uv = mesh.uv # num_vertex, 2
16
+ triangle_idx = mesh.faces # num_faces, 3
17
+ vertices = mesh.vertices # num_vertex, 3
18
+
19
+ tb = TextureBaker()
20
+ # First get the barycentric coordinates
21
+ rast = tb.rasterize(
22
+ uv=uv, face_indices=triangle_idx, bake_resolution=1024
23
+ )
24
+ # Then interpolate vertex attributes
25
+ position_bake = tb.interpolate(attr=vertices, rast=rast, face_indices=triangle_idx)
26
+ ```
texture_baker/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ numpy
texture_baker/setup.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import platform
4
+
5
+ import torch
6
+ from setuptools import find_packages, setup
7
+ from torch.utils.cpp_extension import (
8
+ CUDA_HOME,
9
+ BuildExtension,
10
+ CppExtension,
11
+ CUDAExtension,
12
+ )
13
+
14
+ library_name = "texture_baker"
15
+
16
+
17
+ def get_extensions():
18
+ debug_mode = os.getenv("DEBUG", "0") == "1"
19
+ use_cuda = os.getenv("USE_CUDA", "1" if torch.cuda.is_available() else "0") == "1"
20
+ use_metal = (
21
+ os.getenv("USE_METAL", "1" if torch.backends.mps.is_available() else "0") == "1"
22
+ )
23
+ use_native_arch = os.getenv("USE_NATIVE_ARCH", "1") == "1"
24
+ if debug_mode:
25
+ print("Compiling in debug mode")
26
+
27
+ use_cuda = use_cuda and CUDA_HOME is not None
28
+ extension = CUDAExtension if use_cuda else CppExtension
29
+
30
+ extra_link_args = []
31
+ extra_compile_args = {
32
+ "cxx": [
33
+ "-O3" if not debug_mode else "-O0",
34
+ "-fdiagnostics-color=always",
35
+ "-fopenmp",
36
+ ] + ["-march=native"] if use_native_arch else [],
37
+ "nvcc": [
38
+ "-O3" if not debug_mode else "-O0",
39
+ ],
40
+ }
41
+ if debug_mode:
42
+ extra_compile_args["cxx"].append("-g")
43
+ if platform.system() == "Windows":
44
+ extra_compile_args["cxx"].append("/Z7")
45
+ extra_compile_args["cxx"].append("/Od")
46
+ extra_link_args.extend(["/DEBUG"])
47
+ extra_compile_args["cxx"].append("-UNDEBUG")
48
+ extra_compile_args["nvcc"].append("-UNDEBUG")
49
+ extra_compile_args["nvcc"].append("-g")
50
+ extra_link_args.extend(["-O0", "-g"])
51
+
52
+ define_macros = []
53
+ extensions = []
54
+ libraries = []
55
+
56
+ this_dir = os.path.dirname(os.path.curdir)
57
+ sources = glob.glob(
58
+ os.path.join(this_dir, library_name, "csrc", "**", "*.cpp"), recursive=True
59
+ )
60
+
61
+ if len(sources) == 0:
62
+ print("No source files found for extension, skipping extension compilation")
63
+ return None
64
+
65
+ if use_cuda:
66
+ define_macros += [
67
+ ("THRUST_IGNORE_CUB_VERSION_CHECK", None),
68
+ ]
69
+ sources += glob.glob(
70
+ os.path.join(this_dir, library_name, "csrc", "**", "*.cu"), recursive=True
71
+ )
72
+ libraries += ["cudart", "c10_cuda"]
73
+
74
+ if use_metal:
75
+ define_macros += [
76
+ ("WITH_MPS", None),
77
+ ]
78
+ sources += glob.glob(
79
+ os.path.join(this_dir, library_name, "csrc", "**", "*.mm"), recursive=True
80
+ )
81
+ extra_compile_args.update({"cxx": ["-O3", "-arch", "arm64"]})
82
+ extra_link_args += ["-arch", "arm64"]
83
+
84
+ extensions.append(
85
+ extension(
86
+ name=f"{library_name}._C",
87
+ sources=sources,
88
+ define_macros=define_macros,
89
+ extra_compile_args=extra_compile_args,
90
+ extra_link_args=extra_link_args,
91
+ libraries=libraries
92
+ + [
93
+ "c10",
94
+ "torch",
95
+ "torch_cpu",
96
+ "torch_python",
97
+ ],
98
+ )
99
+ )
100
+
101
+ for ext in extensions:
102
+ ext.libraries = ["cudart_static" if x == "cudart" else x for x in ext.libraries]
103
+
104
+ print(extensions)
105
+
106
+ return extensions
107
+
108
+
109
+ setup(
110
+ name=library_name,
111
+ version="0.0.1",
112
+ packages=find_packages(where="."),
113
+ package_dir={"": "."},
114
+ ext_modules=get_extensions(),
115
+ install_requires=[],
116
+ package_data={
117
+ library_name: [os.path.join("csrc", "*.h"), os.path.join("csrc", "*.metal")],
118
+ },
119
+ description="Small texture baker which rasterizes barycentric coordinates to a tensor.",
120
+ long_description=open("README.md").read(),
121
+ long_description_content_type="text/markdown",
122
+ url="https://github.com/Stability-AI/texture_baker",
123
+ cmdclass={"build_ext": BuildExtension},
124
+ )
texture_baker/texture_baker/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import torch # noqa: F401
2
+
3
+ from . import _C # noqa: F401
4
+ from .baker import TextureBaker # noqa: F401
texture_baker/texture_baker/baker.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch import Tensor
4
+
5
+
6
+ class TextureBaker(nn.Module):
7
+ def __init__(self):
8
+ super().__init__()
9
+
10
+ def rasterize(
11
+ self,
12
+ uv: Tensor,
13
+ face_indices: Tensor,
14
+ bake_resolution: int,
15
+ ) -> Tensor:
16
+ """
17
+ Rasterize the UV coordinates to a barycentric coordinates
18
+ & Triangle idxs texture map
19
+
20
+ Args:
21
+ uv (Tensor, num_vertices 2, float): UV coordinates of the mesh
22
+ face_indices (Tensor, num_faces 3, int): Face indices of the mesh
23
+ bake_resolution (int): Resolution of the bake
24
+
25
+ Returns:
26
+ Tensor, bake_resolution bake_resolution 4, float: Rasterized map
27
+ """
28
+ return torch.ops.texture_baker_cpp.rasterize(
29
+ uv, face_indices.to(torch.int32), bake_resolution
30
+ )
31
+
32
+ def get_mask(self, rast: Tensor) -> Tensor:
33
+ """
34
+ Get the occupancy mask from the rasterized map
35
+
36
+ Args:
37
+ rast (Tensor, bake_resolution bake_resolution 4, float): Rasterized map
38
+
39
+ Returns:
40
+ Tensor, bake_resolution bake_resolution, bool: Mask
41
+ """
42
+ return rast[..., -1] >= 0
43
+
44
+ def interpolate(
45
+ self,
46
+ attr: Tensor,
47
+ rast: Tensor,
48
+ face_indices: Tensor,
49
+ ) -> Tensor:
50
+ """
51
+ Interpolate the attributes using the rasterized map
52
+
53
+ Args:
54
+ attr (Tensor, num_vertices 3, float): Attributes of the mesh
55
+ rast (Tensor, bake_resolution bake_resolution 4, float): Rasterized map
56
+ face_indices (Tensor, num_faces 3, int): Face indices of the mesh
57
+ uv (Tensor, num_vertices 2, float): UV coordinates of the mesh
58
+
59
+ Returns:
60
+ Tensor, bake_resolution bake_resolution 3, float: Interpolated attributes
61
+ """
62
+ return torch.ops.texture_baker_cpp.interpolate(
63
+ attr, face_indices.to(torch.int32), rast
64
+ )
65
+
66
+ def forward(
67
+ self,
68
+ attr: Tensor,
69
+ uv: Tensor,
70
+ face_indices: Tensor,
71
+ bake_resolution: int,
72
+ ) -> Tensor:
73
+ """
74
+ Bake the texture
75
+
76
+ Args:
77
+ attr (Tensor, num_vertices 3, float): Attributes of the mesh
78
+ uv (Tensor, num_vertices 2, float): UV coordinates of the mesh
79
+ face_indices (Tensor, num_faces 3, int): Face indices of the mesh
80
+ bake_resolution (int): Resolution of the bake
81
+
82
+ Returns:
83
+ Tensor, bake_resolution bake_resolution 3, float: Baked texture
84
+ """
85
+ rast = self.rasterize(uv, face_indices, bake_resolution)
86
+ return self.interpolate(attr, rast, face_indices, uv)
texture_baker/texture_baker/csrc/baker.cpp ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+ #include <ATen/Context.h>
3
+ #include <chrono>
4
+ #include <cmath>
5
+ #include <omp.h>
6
+ #include <torch/extension.h>
7
+ #ifndef __ARM_ARCH_ISA_A64
8
+ #include <immintrin.h>
9
+ #endif
10
+
11
+ #include "baker.h"
12
+
13
+ // #define TIMING
14
+ #define BINS 8
15
+
16
+ namespace texture_baker_cpp {
17
+ // Calculate the centroid of a triangle
18
+ tb_float2 triangle_centroid(const tb_float2 &v0, const tb_float2 &v1,
19
+ const tb_float2 &v2) {
20
+ return {(v0.x + v1.x + v2.x) * 0.3333f, (v0.y + v1.y + v2.y) * 0.3333f};
21
+ }
22
+
23
// Evaluate candidate SAH split planes for `node` using BINS equal-width bins
// along both axes, and return the cost of the cheapest plane.
// Outputs: best_axis (0 = x, 1 = y) and best_pos (bin boundary index, 1-based).
// `centroidBounds` is the AABB of the node's triangle centroids, used to
// define the binning range. Returns FLT_MAX if no valid split exists
// (e.g. all centroids coincide on both axes).
float BVH::find_best_split_plane(const BVHNode &node, int &best_axis,
                                 int &best_pos, AABB &centroidBounds) {
  float best_cost = std::numeric_limits<float>::max();

  for (int axis = 0; axis < 2; ++axis) // We use 2 as we have only x and y
  {
    float boundsMin = centroidBounds.min[axis];
    float boundsMax = centroidBounds.max[axis];
    if (boundsMin == boundsMax) {
      // Degenerate axis: every centroid projects to the same coordinate.
      continue;
    }

    // Populate the bins
    float scale = BINS / (boundsMax - boundsMin);
    float leftCountArea[BINS - 1], rightCountArea[BINS - 1];
    int leftSum = 0, rightSum = 0;

#ifndef __ARM_ARCH_ISA_A64
#ifndef _MSC_VER
    if (__builtin_cpu_supports("sse"))
#elif (defined(_M_AMD64) || defined(_M_X64))
    // SSE supported on Windows
    if constexpr (true)
#endif
    {
      // SSE path: track per-bin min/max of all three vertices in one vector.
      // Lane layout from _mm_set_ps(a, b, c, d) is [d, c, b, a], so the
      // vertex x lands in lane 3 and y in lane 2 (see stores below).
      __m128 min4[BINS], max4[BINS];
      unsigned int count[BINS];
      for (unsigned int i = 0; i < BINS; i++)
        min4[i] = _mm_set_ps1(1e30f), max4[i] = _mm_set_ps1(-1e30f),
        count[i] = 0;
      for (int i = node.start; i < node.end; i++) {
        int tri_idx = triangle_indices[i];
        const Triangle &triangle = triangles[tri_idx];

        int binIdx = std::min(
            BINS - 1, (int)((triangle.centroid[axis] - boundsMin) * scale));
        count[binIdx]++;
        __m128 v0 = _mm_set_ps(triangle.v0.x, triangle.v0.y, 0.0f, 0.0f);
        __m128 v1 = _mm_set_ps(triangle.v1.x, triangle.v1.y, 0.0f, 0.0f);
        __m128 v2 = _mm_set_ps(triangle.v2.x, triangle.v2.y, 0.0f, 0.0f);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v0);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v0);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v1);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v1);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v2);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v2);
      }
      // gather data for the 7 planes between the 8 bins:
      // prefix scan from the left and suffix scan from the right.
      __m128 leftMin4 = _mm_set_ps1(1e30f), rightMin4 = leftMin4;
      __m128 leftMax4 = _mm_set_ps1(-1e30f), rightMax4 = leftMax4;
      for (int i = 0; i < BINS - 1; i++) {
        leftSum += count[i];
        rightSum += count[BINS - 1 - i];
        leftMin4 = _mm_min_ps(leftMin4, min4[i]);
        rightMin4 = _mm_min_ps(rightMin4, min4[BINS - 2 - i]);
        leftMax4 = _mm_max_ps(leftMax4, max4[i]);
        rightMax4 = _mm_max_ps(rightMax4, max4[BINS - 2 - i]);
        float le[4], re[4];
        _mm_store_ps(le, _mm_sub_ps(leftMax4, leftMin4));
        _mm_store_ps(re, _mm_sub_ps(rightMax4, rightMin4));
        // SSE order goes from back to front: le[3] = x extent, le[2] = y extent.
        leftCountArea[i] = leftSum * (le[2] * le[3]); // 2D area calculation
        rightCountArea[BINS - 2 - i] =
            rightSum * (re[2] * re[3]); // 2D area calculation
      }
    }
#else
    if constexpr (false) {
    }
#endif
    else {
      // Scalar fallback: same binning with AABB bookkeeping.
      struct Bin {
        AABB bounds;
        int triCount = 0;
      } bins[BINS];

      for (int i = node.start; i < node.end; i++) {
        int tri_idx = triangle_indices[i];
        const Triangle &triangle = triangles[tri_idx];

        int binIdx = std::min(
            BINS - 1, (int)((triangle.centroid[axis] - boundsMin) * scale));
        bins[binIdx].triCount++;
        bins[binIdx].bounds.grow(triangle.v0);
        bins[binIdx].bounds.grow(triangle.v1);
        bins[binIdx].bounds.grow(triangle.v2);
      }

      // Gather data for the planes between the bins
      AABB leftBox, rightBox;

      for (int i = 0; i < BINS - 1; i++) {
        leftSum += bins[i].triCount;
        leftBox.grow(bins[i].bounds);
        leftCountArea[i] = leftSum * leftBox.area();

        rightSum += bins[BINS - 1 - i].triCount;
        rightBox.grow(bins[BINS - 1 - i].bounds);
        rightCountArea[BINS - 2 - i] = rightSum * rightBox.area();
      }
    }

    // Calculate SAH cost for the planes: cost = leftCount*leftArea +
    // rightCount*rightArea; keep the global minimum across both axes.
    scale = (boundsMax - boundsMin) / BINS;
    for (int i = 0; i < BINS - 1; i++) {
      float planeCost = leftCountArea[i] + rightCountArea[i];
      if (planeCost < best_cost) {
        best_axis = axis;
        best_pos = i + 1;
        best_cost = planeCost;
      }
    }
  }

  return best_cost;
}
139
+
140
// Recompute `node.bbox` (vertex bounds) and `centroidBounds` (centroid
// bounds) over the node's triangle range [node.start, node.end).
void BVH::update_node_bounds(BVHNode &node, AABB &centroidBounds) {
#ifndef __ARM_ARCH_ISA_A64
#ifndef _MSC_VER
  if (__builtin_cpu_supports("sse"))
#elif (defined(_M_AMD64) || defined(_M_X64))
  // SSE supported on Windows
  if constexpr (true)
#endif
  {
    // SSE path: process two triangles per iteration, packing
    // (tri1.x, tri1.y, tri2.x, tri2.y) into one vector.
    __m128 min4 = _mm_set_ps1(1e30f), max4 = _mm_set_ps1(-1e30f);
    __m128 cmin4 = _mm_set_ps1(1e30f), cmax4 = _mm_set_ps1(-1e30f);

    for (int i = node.start; i < node.end; i += 2) {
      int tri_idx1 = triangle_indices[i];
      const Triangle &leafTri1 = triangles[tri_idx1];
      // Check if the second actually exists in the node
      __m128 v0, v1, v2, centroid;
      if (i + 1 < node.end) {
        int tri_idx2 = triangle_indices[i + 1];
        const Triangle leafTri2 = triangles[tri_idx2];

        v0 = _mm_set_ps(leafTri1.v0.x, leafTri1.v0.y, leafTri2.v0.x,
                        leafTri2.v0.y);
        v1 = _mm_set_ps(leafTri1.v1.x, leafTri1.v1.y, leafTri2.v1.x,
                        leafTri2.v1.y);
        v2 = _mm_set_ps(leafTri1.v2.x, leafTri1.v2.y, leafTri2.v2.x,
                        leafTri2.v2.y);
        centroid = _mm_set_ps(leafTri1.centroid.x, leafTri1.centroid.y,
                              leafTri2.centroid.x, leafTri2.centroid.y);
      } else {
        // Otherwise do some duplicated work (duplicate the lone triangle
        // into both lane pairs so the min/max result is unchanged).
        v0 = _mm_set_ps(leafTri1.v0.x, leafTri1.v0.y, leafTri1.v0.x,
                        leafTri1.v0.y);
        v1 = _mm_set_ps(leafTri1.v1.x, leafTri1.v1.y, leafTri1.v1.x,
                        leafTri1.v1.y);
        v2 = _mm_set_ps(leafTri1.v2.x, leafTri1.v2.y, leafTri1.v2.x,
                        leafTri1.v2.y);
        centroid = _mm_set_ps(leafTri1.centroid.x, leafTri1.centroid.y,
                              leafTri1.centroid.x, leafTri1.centroid.y);
      }

      min4 = _mm_min_ps(min4, v0);
      max4 = _mm_max_ps(max4, v0);
      min4 = _mm_min_ps(min4, v1);
      max4 = _mm_max_ps(max4, v1);
      min4 = _mm_min_ps(min4, v2);
      max4 = _mm_max_ps(max4, v2);
      cmin4 = _mm_min_ps(cmin4, centroid);
      cmax4 = _mm_max_ps(cmax4, centroid);
    }

    float min_values[4], max_values[4], cmin_values[4], cmax_values[4];
    _mm_store_ps(min_values, min4);
    _mm_store_ps(max_values, max4);
    _mm_store_ps(cmin_values, cmin4);
    _mm_store_ps(cmax_values, cmax4);

    // _mm_set_ps stores back-to-front: lanes 3/1 hold x, lanes 2/0 hold y.
    node.bbox.min.x = std::min(min_values[3], min_values[1]);
    node.bbox.min.y = std::min(min_values[2], min_values[0]);
    node.bbox.max.x = std::max(max_values[3], max_values[1]);
    node.bbox.max.y = std::max(max_values[2], max_values[0]);

    centroidBounds.min.x = std::min(cmin_values[3], cmin_values[1]);
    centroidBounds.min.y = std::min(cmin_values[2], cmin_values[0]);
    centroidBounds.max.x = std::max(cmax_values[3], cmax_values[1]);
    centroidBounds.max.y = std::max(cmax_values[2], cmax_values[0]);
  }
#else
  if constexpr (false) {
  }
#endif
  // NOTE(review): unlike find_best_split_plane, there is no `else` here, so
  // on SSE-capable hosts this scalar block runs as well and overwrites the
  // SIMD results (same values, wasted work). Presumably deliberate to keep
  // configurations where the `if` above is not emitted compiling — confirm.
  {
    node.bbox.invalidate();
    centroidBounds.invalidate();

    // Calculate the bounding box for the node
    for (int i = node.start; i < node.end; ++i) {
      int tri_idx = triangle_indices[i];
      const Triangle &tri = triangles[tri_idx];
      node.bbox.grow(tri.v0);
      node.bbox.grow(tri.v1);
      node.bbox.grow(tri.v2);
      centroidBounds.grow(tri.centroid);
    }
  }
}
226
+
227
// Build the 2D BVH over `num_indices` triangles defined by `vertices`
// (UV positions) and `indices` (per-face vertex triples). Uses breadth-first
// construction with binned-SAH splitting; leaves are created when splitting
// would not reduce cost.
void BVH::build(const tb_float2 *vertices, const tb_int3 *indices,
                const int64_t &num_indices) {
#ifdef TIMING
  auto start = std::chrono::high_resolution_clock::now();
#endif
  // Create triangles (each remembers its original face index for lookups).
  for (size_t i = 0; i < num_indices; ++i) {
    tb_int3 idx = indices[i];
    triangles.push_back(
        {vertices[idx.x], vertices[idx.y], vertices[idx.z], static_cast<int>(i),
         triangle_centroid(vertices[idx.x], vertices[idx.y], vertices[idx.z])});
  }

  // Initialize triangle_indices (the array the builder partitions in place).
  triangle_indices.resize(triangles.size());
  std::iota(triangle_indices.begin(), triangle_indices.end(), 0);

  // Build BVH nodes
  // Reserve extra capacity to fix windows specific crashes.
  // A binary tree over N leaves needs at most 2N-1 nodes, so this also
  // guarantees push_back below never reallocates (see NOTE further down).
  nodes.reserve(triangles.size() * 2 + 1);
  nodes.push_back({}); // Create the root node
  root = 0;

  // Define a struct for queue entries
  struct QueueEntry {
    int node_idx; // index into `nodes`
    int start;    // first triangle (inclusive) in triangle_indices
    int end;      // last triangle (exclusive)
  };

  // Queue for breadth-first traversal
  std::queue<QueueEntry> node_queue;
  node_queue.push({root, 0, (int)triangles.size()});

  // Process each node in the queue
  while (!node_queue.empty()) {
    QueueEntry current = node_queue.front();
    node_queue.pop();

    int node_idx = current.node_idx;
    int start = current.start;
    int end = current.end;

    BVHNode &node = nodes[node_idx];
    node.start = start;
    node.end = end;

    // Calculate the bounding box for the node
    AABB centroidBounds;
    update_node_bounds(node, centroidBounds);

    // Determine the best split using SAH
    int best_axis, best_pos;

    float splitCost =
        find_best_split_plane(node, best_axis, best_pos, centroidBounds);
    float nosplitCost = node.calculate_node_cost();

    // Stop condition: if the best cost is greater than or equal to the parent's
    // cost
    if (splitCost >= nosplitCost) {
      // Leaf node
      node.left = node.right = -1;
      continue;
    }

    float scale =
        BINS / (centroidBounds.max[best_axis] - centroidBounds.min[best_axis]);
    int i = node.start;
    int j = node.end - 1;

    // Sort the triangle_indices in the range [start, end) based on the best
    // axis (Hoare-style partition around the chosen bin boundary)
    while (i <= j) {
      // use the exact calculation we used for binning to prevent rare
      // inaccuracies
      int tri_idx = triangle_indices[i];
      tb_float2 tcentr = triangles[tri_idx].centroid;
      int binIdx = std::min(
          BINS - 1,
          (int)((tcentr[best_axis] - centroidBounds.min[best_axis]) * scale));
      if (binIdx < best_pos)
        i++;
      else
        std::swap(triangle_indices[i], triangle_indices[j--]);
    }
    int leftCount = i - node.start;
    if (leftCount == 0 || leftCount == node.num_triangles()) {
      // Degenerate split (everything fell on one side): make a leaf.
      node.left = node.right = -1;
      continue;
    }

    int mid = i;

    // Create and set left child
    node.left = nodes.size();
    nodes.push_back({});
    node_queue.push({node.left, start, mid});

    // Create and set right child
    // NOTE(review): `node` is a reference, so this assignment copies
    // nodes[node_idx] onto itself — a no-op. It cannot "re-seat" the
    // reference after push_back; the reserve() above is what actually keeps
    // the reference valid (no reallocation). Confirm before relying on it.
    node = nodes[node_idx]; // Update the node - Potentially stale reference
    node.right = nodes.size();
    nodes.push_back({});
    node_queue.push({node.right, mid, end});
  }
#ifdef TIMING
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = end - start;
  std::cout << "BVH build time: " << elapsed.count() << "s" << std::endl;
#endif
}
339
+
340
// Restrict `val` to the closed interval [minVal, maxVal].
float clamp(float val, float minVal, float maxVal) {
  const float lower_bounded = std::max(val, minVal);
  return std::min(lower_bounded, maxVal);
}
344
+
345
+ // Function to check if a point (xy) is inside a triangle defined by vertices
346
+ // v1, v2, v3
347
+ bool barycentric_coordinates(tb_float2 xy, tb_float2 v1, tb_float2 v2,
348
+ tb_float2 v3, float &u, float &v, float &w) {
349
+ // Vectors from v1 to v2, v3 and xy
350
+ tb_float2 v1v2 = {v2.x - v1.x, v2.y - v1.y};
351
+ tb_float2 v1v3 = {v3.x - v1.x, v3.y - v1.y};
352
+ tb_float2 xyv1 = {xy.x - v1.x, xy.y - v1.y};
353
+
354
+ // Dot products of the vectors
355
+ float d00 = v1v2.x * v1v2.x + v1v2.y * v1v2.y;
356
+ float d01 = v1v2.x * v1v3.x + v1v2.y * v1v3.y;
357
+ float d11 = v1v3.x * v1v3.x + v1v3.y * v1v3.y;
358
+ float d20 = xyv1.x * v1v2.x + xyv1.y * v1v2.y;
359
+ float d21 = xyv1.x * v1v3.x + xyv1.y * v1v3.y;
360
+
361
+ // Calculate the barycentric coordinates
362
+ float denom = d00 * d11 - d01 * d01;
363
+ v = (d11 * d20 - d01 * d21) / denom;
364
+ w = (d00 * d21 - d01 * d20) / denom;
365
+ u = 1.0f - v - w;
366
+
367
+ // Check if the point is inside the triangle
368
+ return (v >= 0.0f) && (w >= 0.0f) && (v + w <= 1.0f);
369
+ }
370
+
371
+ bool BVH::intersect(const tb_float2 &point, float &u, float &v, float &w,
372
+ int &index) const {
373
+ const int max_stack_size = 64;
374
+ int node_stack[max_stack_size];
375
+ int stack_size = 0;
376
+
377
+ node_stack[stack_size++] = root;
378
+
379
+ while (stack_size > 0) {
380
+ int node_idx = node_stack[--stack_size];
381
+ const BVHNode &node = nodes[node_idx];
382
+
383
+ if (node.is_leaf()) {
384
+ for (int i = node.start; i < node.end; ++i) {
385
+ const Triangle &tri = triangles[triangle_indices[i]];
386
+ if (barycentric_coordinates(point, tri.v0, tri.v1, tri.v2, u, v, w)) {
387
+ index = tri.index;
388
+ return true;
389
+ }
390
+ }
391
+ } else {
392
+ if (nodes[node.right].bbox.overlaps(point)) {
393
+ if (stack_size < max_stack_size) {
394
+ node_stack[stack_size++] = node.right;
395
+ } else {
396
+ // Handle stack overflow
397
+ throw std::runtime_error("Node stack overflow");
398
+ }
399
+ }
400
+ if (nodes[node.left].bbox.overlaps(point)) {
401
+ if (stack_size < max_stack_size) {
402
+ node_stack[stack_size++] = node.left;
403
+ } else {
404
+ // Handle stack overflow
405
+ throw std::runtime_error("Node stack overflow");
406
+ }
407
+ }
408
+ }
409
+ }
410
+
411
+ return false;
412
+ }
413
+
414
+ torch::Tensor rasterize_cpu(torch::Tensor uv, torch::Tensor indices,
415
+ int64_t bake_resolution) {
416
+ int width = bake_resolution;
417
+ int height = bake_resolution;
418
+ int num_pixels = width * height;
419
+ torch::Tensor rast_result = torch::empty(
420
+ {bake_resolution, bake_resolution, 4},
421
+ torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU));
422
+
423
+ float *rast_result_ptr = rast_result.contiguous().data_ptr<float>();
424
+ const tb_float2 *vertices = (tb_float2 *)uv.data_ptr<float>();
425
+ const tb_int3 *tris = (tb_int3 *)indices.data_ptr<int>();
426
+
427
+ BVH bvh;
428
+ bvh.build(vertices, tris, indices.size(0));
429
+
430
+ #ifdef TIMING
431
+ auto start = std::chrono::high_resolution_clock::now();
432
+ #endif
433
+
434
+ #pragma omp parallel for
435
+ for (int idx = 0; idx < num_pixels; ++idx) {
436
+ int x = idx / height;
437
+ int y = idx % height;
438
+ int idx_ = idx * 4; // Note: *4 because we're storing float4 per pixel
439
+
440
+ tb_float2 pixel_coord = {float(y) / height, float(x) / width};
441
+ pixel_coord.x = clamp(pixel_coord.x, 0.0f, 1.0f);
442
+ pixel_coord.y = 1.0f - clamp(pixel_coord.y, 0.0f, 1.0f);
443
+
444
+ float u, v, w;
445
+ int triangle_idx;
446
+ if (bvh.intersect(pixel_coord, u, v, w, triangle_idx)) {
447
+ rast_result_ptr[idx_ + 0] = u;
448
+ rast_result_ptr[idx_ + 1] = v;
449
+ rast_result_ptr[idx_ + 2] = w;
450
+ rast_result_ptr[idx_ + 3] = static_cast<float>(triangle_idx);
451
+ } else {
452
+ rast_result_ptr[idx_ + 0] = 0.0f;
453
+ rast_result_ptr[idx_ + 1] = 0.0f;
454
+ rast_result_ptr[idx_ + 2] = 0.0f;
455
+ rast_result_ptr[idx_ + 3] = -1.0f;
456
+ }
457
+ }
458
+
459
+ #ifdef TIMING
460
+ auto end = std::chrono::high_resolution_clock::now();
461
+ std::chrono::duration<double> elapsed = end - start;
462
+ std::cout << "Rasterization time: " << elapsed.count() << "s" << std::endl;
463
+ #endif
464
+ return rast_result;
465
+ }
466
+
467
+ torch::Tensor interpolate_cpu(torch::Tensor attr, torch::Tensor indices,
468
+ torch::Tensor rast) {
469
+ #ifdef TIMING
470
+ auto start = std::chrono::high_resolution_clock::now();
471
+ #endif
472
+ int height = rast.size(0);
473
+ int width = rast.size(1);
474
+ torch::Tensor pos_bake = torch::empty(
475
+ {height, width, 3},
476
+ torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU));
477
+
478
+ const float *attr_ptr = attr.contiguous().data_ptr<float>();
479
+ const int *indices_ptr = indices.contiguous().data_ptr<int>();
480
+ const float *rast_ptr = rast.contiguous().data_ptr<float>();
481
+ float *output_ptr = pos_bake.contiguous().data_ptr<float>();
482
+
483
+ int num_pixels = width * height;
484
+
485
+ #pragma omp parallel for
486
+ for (int idx = 0; idx < num_pixels; ++idx) {
487
+ int idx_ = idx * 4; // Index into the float4 array (4 floats per pixel)
488
+ tb_float3 barycentric = {
489
+ rast_ptr[idx_ + 0],
490
+ rast_ptr[idx_ + 1],
491
+ rast_ptr[idx_ + 2],
492
+ };
493
+ int triangle_idx = static_cast<int>(rast_ptr[idx_ + 3]);
494
+
495
+ if (triangle_idx < 0) {
496
+ output_ptr[idx * 3 + 0] = 0.0f;
497
+ output_ptr[idx * 3 + 1] = 0.0f;
498
+ output_ptr[idx * 3 + 2] = 0.0f;
499
+ continue;
500
+ }
501
+
502
+ tb_int3 triangle = {indices_ptr[3 * triangle_idx + 0],
503
+ indices_ptr[3 * triangle_idx + 1],
504
+ indices_ptr[3 * triangle_idx + 2]};
505
+ tb_float3 v1 = {attr_ptr[3 * triangle.x + 0], attr_ptr[3 * triangle.x + 1],
506
+ attr_ptr[3 * triangle.x + 2]};
507
+ tb_float3 v2 = {attr_ptr[3 * triangle.y + 0], attr_ptr[3 * triangle.y + 1],
508
+ attr_ptr[3 * triangle.y + 2]};
509
+ tb_float3 v3 = {attr_ptr[3 * triangle.z + 0], attr_ptr[3 * triangle.z + 1],
510
+ attr_ptr[3 * triangle.z + 2]};
511
+
512
+ tb_float3 interpolated;
513
+ interpolated.x =
514
+ v1.x * barycentric.x + v2.x * barycentric.y + v3.x * barycentric.z;
515
+ interpolated.y =
516
+ v1.y * barycentric.x + v2.y * barycentric.y + v3.y * barycentric.z;
517
+ interpolated.z =
518
+ v1.z * barycentric.x + v2.z * barycentric.y + v3.z * barycentric.z;
519
+
520
+ output_ptr[idx * 3 + 0] = interpolated.x;
521
+ output_ptr[idx * 3 + 1] = interpolated.y;
522
+ output_ptr[idx * 3 + 2] = interpolated.z;
523
+ }
524
+
525
+ #ifdef TIMING
526
+ auto end = std::chrono::high_resolution_clock::now();
527
+ std::chrono::duration<double> elapsed = end - start;
528
+ std::cout << "Interpolation time: " << elapsed.count() << "s" << std::endl;
529
+ #endif
530
+ return pos_bake;
531
+ }
532
+
533
// Registers _C as a Python extension module.
// The module body is intentionally empty: it exists only so the shared
// library can be imported from Python; the actual ops are exposed through
// torch.ops via the registrations below.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {}

// Defines the operators (schema strings parsed by the PyTorch dispatcher).
TORCH_LIBRARY(texture_baker_cpp, m) {
  m.def("rasterize(Tensor uv, Tensor indices, int bake_resolution) -> Tensor");
  m.def("interpolate(Tensor attr, Tensor indices, Tensor rast) -> Tensor");
}

// Registers CPP implementations for the CPU dispatch key.
TORCH_LIBRARY_IMPL(texture_baker_cpp, CPU, m) {
  m.impl("rasterize", &rasterize_cpu);
  m.impl("interpolate", &interpolate_cpu);
}
547
+
548
+ } // namespace texture_baker_cpp
texture_baker/texture_baker/csrc/baker.h ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #if defined(__NVCC__) || defined(__HIPCC__) || defined(__METAL__)
4
+ #define CUDA_ENABLED
5
+ #ifndef __METAL__
6
+ #define CUDA_HOST_DEVICE __host__ __device__
7
+ #define CUDA_DEVICE __device__
8
+ #define METAL_CONSTANT_MEM
9
+ #define METAL_THREAD_MEM
10
+ #else
11
+ #define tb_float2 float2
12
+ #define CUDA_HOST_DEVICE
13
+ #define CUDA_DEVICE
14
+ #define METAL_CONSTANT_MEM constant
15
+ #define METAL_THREAD_MEM thread
16
+ #endif
17
+ #else
18
+ #define CUDA_HOST_DEVICE
19
+ #define CUDA_DEVICE
20
+ #define METAL_CONSTANT_MEM
21
+ #define METAL_THREAD_MEM
22
+ #include <cfloat>
23
+ #include <limits>
24
+ #include <vector>
25
+ #endif
26
+
27
+ namespace texture_baker_cpp {
28
+ // Structure to represent a 2D point or vector
29
#ifndef __METAL__
// Small POD vector types shared between the CPU, CUDA and Metal backends.
// Each union overlays named components (x, y, ...) with an indexable array;
// operator[] bounds-checks and throws on an out-of-range index.
// (Under Metal, tb_float2 is aliased to the builtin float2 instead.)
union alignas(8) tb_float2 {
  struct {
    float x, y;
  };

  float data[2];

  float &operator[](size_t idx) {
    if (idx > 1)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const float &operator[](size_t idx) const {
    if (idx > 1)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Exact component-wise equality (no epsilon).
  bool operator==(const tb_float2 &rhs) const {
    return x == rhs.x && y == rhs.y;
  }
};

union alignas(4) tb_float3 {
  struct {
    float x, y, z;
  };

  float data[3];

  float &operator[](size_t idx) {
    if (idx > 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const float &operator[](size_t idx) const {
    if (idx > 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }
};

union alignas(16) tb_float4 {
  struct {
    float x, y, z, w;
  };

  float data[4];

  float &operator[](size_t idx) {
    if (idx > 3)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const float &operator[](size_t idx) const {
    if (idx > 3)
      throw std::runtime_error("bad index");
    return data[idx];
  }
};
#endif

// Integer vertex-index triple for one face (compiled for Metal as well,
// hence the guarded operator[]).
union alignas(4) tb_int3 {
  struct {
    int x, y, z;
  };

  int data[3];
#ifndef __METAL__
  int &operator[](size_t idx) {
    if (idx > 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }
#endif
};
109
+
110
+ // BVH structure to accelerate point-triangle intersection
111
+ struct alignas(16) AABB {
112
+ // Init bounding boxes with max/min
113
+ tb_float2 min = {FLT_MAX, FLT_MAX};
114
+ tb_float2 max = {FLT_MIN, FLT_MIN};
115
+
116
+ #ifndef CUDA_ENABLED
117
+ // grow the AABB to include a point
118
+ void grow(const tb_float2 &p) {
119
+ min.x = std::min(min.x, p.x);
120
+ min.y = std::min(min.y, p.y);
121
+ max.x = std::max(max.x, p.x);
122
+ max.y = std::max(max.y, p.y);
123
+ }
124
+
125
+ void grow(const AABB &b) {
126
+ if (b.min.x != FLT_MAX) {
127
+ grow(b.min);
128
+ grow(b.max);
129
+ }
130
+ }
131
+ #endif
132
+
133
+ // Check if two AABBs overlap
134
+ bool overlaps(const METAL_THREAD_MEM AABB &other) const {
135
+ return min.x <= other.max.x && max.x >= other.min.x &&
136
+ min.y <= other.max.y && max.y >= other.min.y;
137
+ }
138
+
139
+ bool overlaps(const METAL_THREAD_MEM tb_float2 &point) const {
140
+ return point.x >= min.x && point.x <= max.x && point.y >= min.y &&
141
+ point.y <= max.y;
142
+ }
143
+
144
+ #if defined(__NVCC__)
145
+ CUDA_DEVICE bool overlaps(const float2 &point) const {
146
+ return point.x >= min.x && point.x <= max.x && point.y >= min.y &&
147
+ point.y <= max.y;
148
+ }
149
+ #endif
150
+
151
+ // Initialize AABB to an invalid state
152
+ void invalidate() {
153
+ min = {FLT_MAX, FLT_MAX};
154
+ max = {FLT_MIN, FLT_MIN};
155
+ }
156
+
157
+ // Calculate the area of the AABB
158
+ float area() const {
159
+ tb_float2 extent = {max.x - min.x, max.y - min.y};
160
+ return extent.x * extent.y;
161
+ }
162
+ };
163
+
164
// One BVH node. Leaves own the triangle range [start, end) in the BVH's
// triangle_indices array; interior nodes point at two children.
struct BVHNode {
  AABB bbox;       // bounds of all triangle vertices in this node
  int start, end;  // triangle range (end exclusive)
  int left, right; // child node indices, or -1/-1 for a leaf

  int num_triangles() const { return end - start; }

  CUDA_HOST_DEVICE bool is_leaf() const { return left == -1 && right == -1; }

  // SAH cost of keeping this node unsplit: triangle count times surface area.
  float calculate_node_cost() {
    float area = bbox.area();
    return num_triangles() * area;
  }
};
178
+
179
// A UV-space triangle as stored in the BVH.
struct Triangle {
  tb_float2 v0, v1, v2; // vertex UV positions
  int index;            // original face index (reported on intersection hits)
  tb_float2 centroid;   // precomputed centroid, used for SAH binning
};
184
+
185
#ifndef __METAL__
// Host-side bounding-volume hierarchy over 2D triangles. Built once per
// mesh, then queried per texel with intersect(). Not available in the Metal
// translation unit (std::vector is host-only).
struct BVH {
  std::vector<BVHNode> nodes;        // node pool; nodes[root] is the root
  std::vector<Triangle> triangles;   // all triangles, in input order
  std::vector<int> triangle_indices; // permutation partitioned by the builder
  int root;

  // Construct the tree from vertex UVs and per-face vertex index triples.
  void build(const tb_float2 *vertices, const tb_int3 *indices,
             const int64_t &num_indices);
  // Locate the triangle containing `point`; fills barycentrics and the face
  // index on success.
  bool intersect(const tb_float2 &point, float &u, float &v, float &w,
                 int &index) const;

  // Internal build helpers.
  void update_node_bounds(BVHNode &node, AABB &centroidBounds);
  float find_best_split_plane(const BVHNode &node, int &best_axis,
                              int &best_pos, AABB &centroidBounds);
};
#endif
202
+
203
+ } // namespace texture_baker_cpp
texture_baker/texture_baker/csrc/baker_kernel.cu ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+ #include <ATen/Context.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+ #include <torch/extension.h>
5
+
6
+ #include "baker.h"
7
+
8
+ // #define TIMING
9
+
10
+ #define STRINGIFY(x) #x
11
+ #define STR(x) STRINGIFY(x)
12
+ #define FILE_LINE __FILE__ ":" STR(__LINE__)
13
+ #define CUDA_CHECK_THROW(x) \
14
+ do { \
15
+ cudaError_t _result = x; \
16
+ if (_result != cudaSuccess) \
17
+ throw std::runtime_error(std::string(FILE_LINE " check failed " #x " failed: ") + cudaGetErrorString(_result)); \
18
+ } while(0)
19
+
20
+ namespace texture_baker_cpp
21
+ {
22
+
23
// Component-wise float3 addition (CUDA defines no builtin operator+ for
// its vector types).
__device__ float3 operator+(const float3 &a, const float3 &b)
{
    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
27
+
28
// xy: 2D test position
// v1: vertex position 1
// v2: vertex position 2
// v3: vertex position 3
//
// Device twin of the host barycentric_coordinates() in baker.cpp: same math,
// CUDA builtin float2 for the query point.
__forceinline__ __device__ bool barycentric_coordinates(const float2 &xy, const tb_float2 &v1, const tb_float2 &v2, const tb_float2 &v3, float &u, float &v, float &w)
{
    // Return true if the point (xy) is inside the triangle defined by the vertices v1, v2, v3.
    // If the point is inside the triangle, the barycentric coordinates are stored in u, v, and w.
    float2 v1v2 = make_float2(v2.x - v1.x, v2.y - v1.y);
    float2 v1v3 = make_float2(v3.x - v1.x, v3.y - v1.y);
    float2 xyv1 = make_float2(xy.x - v1.x, xy.y - v1.y);

    // Dot products for the 2x2 barycentric solve.
    float d00 = v1v2.x * v1v2.x + v1v2.y * v1v2.y;
    float d01 = v1v2.x * v1v3.x + v1v2.y * v1v3.y;
    float d11 = v1v3.x * v1v3.x + v1v3.y * v1v3.y;
    float d20 = xyv1.x * v1v2.x + xyv1.y * v1v2.y;
    float d21 = xyv1.x * v1v3.x + xyv1.y * v1v3.y;

    // Cramer's rule; denom is zero for degenerate (zero-area) triangles.
    float denom = d00 * d11 - d01 * d01;
    v = (d11 * d20 - d01 * d21) / denom;
    w = (d00 * d21 - d01 * d20) / denom;
    u = 1.0f - v - w;

    return (v >= 0.0f) && (w >= 0.0f) && (v + w <= 1.0f);
}
54
+
55
// One thread per texel: blend the three vertex attributes of the covering
// triangle with the barycentric weights stored in `rast`; misses
// (triangle index < 0) are written as black.
__global__ void kernel_interpolate(const float3* __restrict__ attr, const int3* __restrict__ indices, const float4* __restrict__ rast, float3* __restrict__ output, int width, int height)
{
    // Interpolate the attr into output based on the rast result (barycentric coordinates, + triangle idx)
    //int idx = x * width + y;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int x = idx / width;
    int y = idx % width;

    // NOTE(review): x (the row, idx / width) is tested against width and y
    // against height; correct only because the bake is always square —
    // confirm before supporting non-square maps.
    if (x >= width || y >= height)
        return;

    float4 barycentric = rast[idx];
    int triangle_idx = int(barycentric.w);

    if (triangle_idx < 0)
    {
        // Texel not covered by any triangle.
        output[idx] = make_float3(0.0f, 0.0f, 0.0f);
        return;
    }

    float3 v1 = attr[indices[triangle_idx].x];
    float3 v2 = attr[indices[triangle_idx].y];
    float3 v3 = attr[indices[triangle_idx].z];

    // Barycentric blend of the three vertex attributes.
    output[idx] = make_float3(v1.x * barycentric.x, v1.y * barycentric.x, v1.z * barycentric.x)
                + make_float3(v2.x * barycentric.y, v2.y * barycentric.y, v2.z * barycentric.y)
                + make_float3(v3.x * barycentric.z, v3.y * barycentric.z, v3.z * barycentric.z);
}
83
+
84
// Device-side BVH traversal (mirrors BVH::intersect in baker.cpp): find the
// first triangle containing `point` via an explicit per-thread DFS stack.
// On a hit, (u, v, w) receive the barycentric coordinates and `index` the
// original face index.
__device__ bool bvh_intersect(
    const BVHNode* __restrict__ nodes,
    const Triangle* __restrict__ triangles,
    const int* __restrict__ triangle_indices,
    const int root,
    const float2 &point,
    float &u, float &v, float &w,
    int &index)
{
    constexpr int max_stack_size = 64;
    int node_stack[max_stack_size];
    int stack_size = 0;

    node_stack[stack_size++] = root;

    while (stack_size > 0)
    {
        int node_idx = node_stack[--stack_size];
        const BVHNode &node = nodes[node_idx];

        if (node.is_leaf())
        {
            // Leaf: test each triangle in the node's range.
            for (int i = node.start; i < node.end; ++i)
            {
                const Triangle &tri = triangles[triangle_indices[i]];
                if (barycentric_coordinates(point, tri.v0, tri.v1, tri.v2, u, v, w))
                {
                    index = tri.index;
                    return true;
                }
            }
        }
        else
        {
            // Push right first so the left child is visited first, matching
            // the host traversal order.
            if (nodes[node.right].bbox.overlaps(point))
            {
                if (stack_size < max_stack_size)
                {
                    node_stack[stack_size++] = node.right;
                }
                else
                {
                    // Handle stack overflow
                    // Make sure NDEBUG is not defined (see setup.py)
                    assert(0 && "Node stack overflow");
                }
            }
            if (nodes[node.left].bbox.overlaps(point))
            {
                if (stack_size < max_stack_size)
                {
                    node_stack[stack_size++] = node.left;
                }
                else
                {
                    // Handle stack overflow
                    // Make sure NDEBUG is not defined (see setup.py)
                    assert(0 && "Node stack overflow");
                }
            }
        }
    }

    return false;
}
149
+
150
// One thread per texel: map the texel to UV space, query the BVH, and store
// (u, v, w, triangle_index) — or (0, 0, 0, -1) on a miss. Device twin of the
// inner loop of rasterize_cpu.
__global__ void kernel_bake_uv(
    float2* __restrict__ uv,
    int3* __restrict__ indices,
    float4* __restrict__ output,
    const BVHNode* __restrict__ nodes,
    const Triangle* __restrict__ triangles,
    const int* __restrict__ triangle_indices,
    const int root,
    const int width,
    const int height,
    const int num_indices)
{
    //int idx = x * width + y;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int x = idx / width;
    int y = idx % width;

    // NOTE(review): column y is tested against width and row x against
    // height; correct only because the bake is always square — confirm
    // before supporting non-square maps.
    if (y >= width || x >= height)
        return;

    // We index x,y but the original coords are HW. So swap them.
    // The vertical axis is flipped so row 0 sits at v = 1.
    float2 pixel_coord = make_float2(float(y) / height, float(x) / width);
    pixel_coord.x = fminf(fmaxf(pixel_coord.x, 0.0f), 1.0f);
    pixel_coord.y = 1.0f - fminf(fmaxf(pixel_coord.y, 0.0f), 1.0f);

    float u, v, w;
    int triangle_idx;
    bool hit = bvh_intersect(nodes, triangles, triangle_indices, root, pixel_coord, u, v, w, triangle_idx);

    if (hit)
    {
        output[idx] = make_float4(u, v, w, float(triangle_idx));
        return;
    }

    output[idx] = make_float4(0.0f, 0.0f, 0.0f, -1.0f);
}
187
+
188
// CUDA implementation of `rasterize`: build a 2-D BVH over the UV triangles
// on the host, upload it, and launch one thread per output pixel that writes
// (u, v, w, triangle_index), with triangle_index = -1 meaning "no coverage".
//
// Returns a (bake_resolution, bake_resolution, 4) float32 CUDA tensor.
torch::Tensor rasterize_gpu(
    torch::Tensor uv,
    torch::Tensor indices,
    int64_t bake_resolution)
{
#ifdef TIMING
    auto start = std::chrono::high_resolution_clock::now();
#endif
    constexpr int block_size = 16 * 16;
    // FIX: round up instead of truncating so resolutions that are not a
    // multiple of the block size are still fully covered; the kernel
    // bounds-checks the excess threads.
    const int num_pixels = bake_resolution * bake_resolution;
    const int grid_size = (num_pixels + block_size - 1) / block_size;
    dim3 block_dims(block_size, 1, 1);
    dim3 grid_dims(grid_size, 1, 1);

    const int num_indices = indices.size(0);

    const int width = bake_resolution;
    const int height = bake_resolution;

    // Step 1: create an empty tensor to store the output.
    torch::Tensor rast_result = torch::empty({bake_resolution, bake_resolution, 4}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));

    // FIX: keep the contiguous tensors alive past the async kernel launch;
    // calling data_ptr() on the temporary returned by .contiguous() would
    // dangle for non-contiguous inputs.
    auto uv_contig = uv.contiguous();
    auto indices_contig = indices.contiguous();

    // The BVH is built on the CPU from host copies of the inputs.
    auto vertices_cpu = uv_contig.cpu();
    auto indices_cpu = indices_contig.cpu();

    const tb_float2 *vertices_cpu_ptr = (tb_float2 *)vertices_cpu.data_ptr<float>();
    const tb_int3 *tris_cpu_ptr = (tb_int3 *)indices_cpu.data_ptr<int>();

    BVH bvh;
    bvh.build(vertices_cpu_ptr, tris_cpu_ptr, indices.size(0));

    BVHNode *nodes_gpu = nullptr;
    Triangle *triangles_gpu = nullptr;
    int *triangle_indices_gpu = nullptr;
    const int bvh_root = bvh.root;
    cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

    // Stream-ordered upload of the BVH. The sources are pageable host memory,
    // so cudaMemcpyAsync stages them before returning and `bvh` may safely be
    // destroyed at function exit.
    CUDA_CHECK_THROW(cudaMallocAsync(&nodes_gpu, sizeof(BVHNode) * bvh.nodes.size(), cuda_stream));
    CUDA_CHECK_THROW(cudaMallocAsync(&triangles_gpu, sizeof(Triangle) * bvh.triangles.size(), cuda_stream));
    CUDA_CHECK_THROW(cudaMallocAsync(&triangle_indices_gpu, sizeof(int) * bvh.triangle_indices.size(), cuda_stream));

    CUDA_CHECK_THROW(cudaMemcpyAsync(nodes_gpu, bvh.nodes.data(), sizeof(BVHNode) * bvh.nodes.size(), cudaMemcpyHostToDevice, cuda_stream));
    CUDA_CHECK_THROW(cudaMemcpyAsync(triangles_gpu, bvh.triangles.data(), sizeof(Triangle) * bvh.triangles.size(), cudaMemcpyHostToDevice, cuda_stream));
    CUDA_CHECK_THROW(cudaMemcpyAsync(triangle_indices_gpu, bvh.triangle_indices.data(), sizeof(int) * bvh.triangle_indices.size(), cudaMemcpyHostToDevice, cuda_stream));

    kernel_bake_uv<<<grid_dims, block_dims, 0, cuda_stream>>>(
        (float2 *)uv_contig.data_ptr<float>(),
        (int3 *)indices_contig.data_ptr<int>(),
        (float4 *)rast_result.data_ptr<float>(),
        nodes_gpu,
        triangles_gpu,
        triangle_indices_gpu,
        bvh_root,
        width,
        height,
        num_indices);

    // Frees are stream-ordered, so they only execute after the kernel.
    CUDA_CHECK_THROW(cudaFreeAsync(nodes_gpu, cuda_stream));
    CUDA_CHECK_THROW(cudaFreeAsync(triangles_gpu, cuda_stream));
    CUDA_CHECK_THROW(cudaFreeAsync(triangle_indices_gpu, cuda_stream));

#ifdef TIMING
    CUDA_CHECK_THROW(cudaStreamSynchronize(cuda_stream));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Rasterization time (CUDA): " << elapsed.count() << "s" << std::endl;
#endif
    return rast_result;
}
256
+
257
// CUDA implementation of `interpolate`: one thread per rasterized pixel,
// blending the three vertex attributes of the hit triangle with the
// barycentric weights stored in `rast` (w component = triangle index).
//
// Returns a (rast.size(0), rast.size(1), 3) float32 CUDA tensor.
torch::Tensor interpolate_gpu(
    torch::Tensor attr,
    torch::Tensor indices,
    torch::Tensor rast)
{
#ifdef TIMING
    auto start = std::chrono::high_resolution_clock::now();
#endif
    constexpr int block_size = 16 * 16;
    const int width = rast.size(0);
    const int height = rast.size(1);
    // FIX: cover width * height pixels (the original used size(0)^2, which
    // under-covers non-square rasters) and round up so totals that are not a
    // multiple of the block size still get a full launch.
    const int grid_size = (width * height + block_size - 1) / block_size;
    dim3 block_dims(block_size, 1, 1);
    dim3 grid_dims(grid_size, 1, 1);

    // Step 1: create an empty tensor to store the output.
    torch::Tensor pos_bake = torch::empty({rast.size(0), rast.size(1), 3}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA));

    cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

    // FIX: keep the contiguous tensors alive past the async launch; data_ptr()
    // on a .contiguous() temporary would dangle for non-contiguous inputs.
    auto attr_contig = attr.contiguous();
    auto indices_contig = indices.contiguous();
    auto rast_contig = rast.contiguous();

    kernel_interpolate<<<grid_dims, block_dims, 0, cuda_stream>>>(
        (float3 *)attr_contig.data_ptr<float>(),
        (int3 *)indices_contig.data_ptr<int>(),
        (float4 *)rast_contig.data_ptr<float>(),
        (float3 *)pos_bake.data_ptr<float>(),
        width,
        height);
#ifdef TIMING
    CUDA_CHECK_THROW(cudaStreamSynchronize(cuda_stream));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Interpolation time (CUDA): " << elapsed.count() << "s" << std::endl;
#endif
    return pos_bake;
}
293
+
294
// Register the CUDA backends for the ops declared by the texture_baker_cpp
// library (CPU declarations live elsewhere).
TORCH_LIBRARY_IMPL(texture_baker_cpp, CUDA, m)
{
    m.impl("interpolate", &interpolate_gpu);
    m.impl("rasterize", &rasterize_gpu);
}

} // namespace texture_baker_cpp
texture_baker/texture_baker/csrc/baker_kernel.metal ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <metal_stdlib>
2
+ using namespace metal;
3
+
4
+ // This header is inlined manually
5
+ //#include "baker.h"
6
+
7
+ // Use the texture_baker_cpp so it can use the classes from baker.h
8
+ using namespace texture_baker_cpp;
9
+
10
+ // Utility function to compute barycentric coordinates
11
// Compute the barycentric coordinates (u, v, w) of point `xy` with respect to
// the triangle (v1, v2, v3). Returns true iff the point lies inside the
// triangle (boundary included).
bool barycentric_coordinates(float2 xy, float2 v1, float2 v2, float2 v3, thread float &u, thread float &v, thread float &w) {
    const float2 e0 = v2 - v1;
    const float2 e1 = v3 - v1;
    const float2 p = xy - v1;

    const float d00 = dot(e0, e0);
    const float d01 = dot(e0, e1);
    const float d11 = dot(e1, e1);
    const float d20 = dot(p, e0);
    const float d21 = dot(p, e1);

    const float denom = d00 * d11 - d01 * d01;
    v = (d11 * d20 - d01 * d21) / denom;
    w = (d00 * d21 - d01 * d20) / denom;
    u = 1.0f - v - w;

    // u >= 0 is implied by v + w <= 1, so only three checks are needed.
    return (v >= 0.0f) && (w >= 0.0f) && (v + w <= 1.0f);
}
29
+
30
+ // Kernel function for interpolation
31
+ kernel void kernel_interpolate(constant packed_float3 *attr [[buffer(0)]],
32
+ constant packed_int3 *indices [[buffer(1)]],
33
+ constant packed_float4 *rast [[buffer(2)]],
34
+ device packed_float3 *output [[buffer(3)]],
35
+ constant int &width [[buffer(4)]],
36
+ constant int &height [[buffer(5)]],
37
+ uint3 blockIdx [[threadgroup_position_in_grid]],
38
+ uint3 threadIdx [[thread_position_in_threadgroup]],
39
+ uint3 blockDim [[threads_per_threadgroup]])
40
+ {
41
+ // Calculate global position using threadgroup and thread positions
42
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
43
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
44
+
45
+ if (x >= width || y >= height) return;
46
+
47
+ int idx = y * width + x;
48
+ float4 barycentric = rast[idx];
49
+ int triangle_idx = int(barycentric.w);
50
+
51
+ if (triangle_idx < 0) {
52
+ output[idx] = float3(0.0f, 0.0f, 0.0f);
53
+ return;
54
+ }
55
+
56
+ float3 v1 = attr[indices[triangle_idx].x];
57
+ float3 v2 = attr[indices[triangle_idx].y];
58
+ float3 v3 = attr[indices[triangle_idx].z];
59
+
60
+ output[idx] = v1 * barycentric.x + v2 * barycentric.y + v3 * barycentric.z;
61
+ }
62
+
63
// Point query against the BVH: iteratively walk the tree with an explicit
// stack and return the barycentric coordinates and index of the first
// triangle containing `point`. Returns false when no triangle covers it (or
// on traversal-stack overflow, since Metal has no assert).
bool bvh_intersect(
    constant BVHNode* nodes,
    constant Triangle* triangles,
    constant int* triangle_indices,
    const thread int root,
    const thread float2 &point,
    thread float &u, thread float &v, thread float &w,
    thread int &index)
{
    const int max_stack_size = 64;
    thread int node_stack[max_stack_size];
    int stack_size = 0;
    node_stack[stack_size++] = root;

    while (stack_size > 0)
    {
        const int current = node_stack[--stack_size];
        const BVHNode node = nodes[current];

        if (node.is_leaf())
        {
            // Test every triangle referenced by this leaf.
            for (int i = node.start; i < node.end; ++i)
            {
                constant Triangle &tri = triangles[triangle_indices[i]];
                if (barycentric_coordinates(point, tri.v0, tri.v1, tri.v2, u, v, w))
                {
                    index = tri.index;
                    return true;
                }
            }
            continue;
        }

        // Push the right child first so the left child is popped first.
        BVHNode child = nodes[node.right];
        if (child.bbox.overlaps(point))
        {
            if (stack_size >= max_stack_size)
            {
                // Stack overflow — bail out rather than corrupt memory.
                return false;
            }
            node_stack[stack_size++] = node.right;
        }
        child = nodes[node.left];
        if (child.bbox.overlaps(point))
        {
            if (stack_size >= max_stack_size)
            {
                return false;
            }
            node_stack[stack_size++] = node.left;
        }
    }

    return false;
}
129
+
130
+
131
+ // Kernel function for baking UV
132
+ kernel void kernel_bake_uv(constant packed_float2 *uv [[buffer(0)]],
133
+ constant packed_int3 *indices [[buffer(1)]],
134
+ device packed_float4 *output [[buffer(2)]],
135
+ constant BVHNode *nodes [[buffer(3)]],
136
+ constant Triangle *triangles [[buffer(4)]],
137
+ constant int *triangle_indices [[buffer(5)]],
138
+ constant int &root [[buffer(6)]],
139
+ constant int &width [[buffer(7)]],
140
+ constant int &height [[buffer(8)]],
141
+ constant int &num_indices [[buffer(9)]],
142
+ uint3 blockIdx [[threadgroup_position_in_grid]],
143
+ uint3 threadIdx [[thread_position_in_threadgroup]],
144
+ uint3 blockDim [[threads_per_threadgroup]])
145
+ {
146
+ // Calculate global position using threadgroup and thread positions
147
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
148
+ int y = blockIdx.y * blockDim.y + threadIdx.y;
149
+
150
+
151
+ if (x >= width || y >= height) return;
152
+
153
+ int idx = x * width + y;
154
+
155
+ // Swap original coordinates
156
+ float2 pixel_coord = float2(float(y) / float(height), float(x) / float(width));
157
+ pixel_coord = clamp(pixel_coord, 0.0f, 1.0f);
158
+ pixel_coord.y = 1.0f - pixel_coord.y;
159
+
160
+ float u, v, w;
161
+ int triangle_idx;
162
+ bool hit = bvh_intersect(nodes, triangles, triangle_indices, root, pixel_coord, u, v, w, triangle_idx);
163
+
164
+ if (hit) {
165
+ output[idx] = float4(u, v, w, float(triangle_idx));
166
+ return;
167
+ }
168
+
169
+ output[idx] = float4(0.0f, 0.0f, 0.0f, -1.0f);
170
+ }
texture_baker/texture_baker/csrc/baker_kernel.mm ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/extension.h>
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/Context.h>
4
+ #include "baker.h"
5
+
6
+ #import <Foundation/Foundation.h>
7
+ #import <Metal/Metal.h>
8
+ #include <filesystem>
9
+
10
// Helper function to retrieve the `MTLBuffer` from a `torch::Tensor`.
// MPS tensors keep an id<MTLBuffer> as their raw storage pointer; bit_cast
// reinterprets it without triggering ObjC retain/release semantics.
static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor& tensor) {
    return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
}
14
+
15
// Helper function to create a compute pipeline state object (PSO).
// Compiles `fullSource` as a Metal library at runtime (with the __METAL__
// macro defined so shared headers can branch on the target) and looks up
// `kernel_name` in the resulting library. Raises via TORCH_CHECK on failure.
static inline id<MTLComputePipelineState> createComputePipelineState(id<MTLDevice> device, NSString* fullSource, std::string kernel_name) {
    NSError *error = nil;

    // Load the custom kernel shader.
    MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
    // Add the preprocessor macro "__METAL__"
    options.preprocessorMacros = @{@"__METAL__": @""};
    id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: fullSource options:options error:&error];
    TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);

    id<MTLFunction> customKernelFunction = [customKernelLibrary newFunctionWithName:[NSString stringWithUTF8String:kernel_name.c_str()]];
    TORCH_CHECK(customKernelFunction, "Failed to create function state object for ", kernel_name.c_str());

    id<MTLComputePipelineState> pso = [device newComputePipelineStateWithFunction:customKernelFunction error:&error];
    TORCH_CHECK(pso, error.localizedDescription.UTF8String);

    return pso;
}
34
+
35
// Locate the installed `texture_baker` Python package and return the path to
// its `csrc` directory (where the Metal shader sources live).
// Throws std::runtime_error when the package cannot be imported or located.
std::filesystem::path get_extension_path() {
    // Ensure the GIL is held before calling any Python C API function
    PyGILState_STATE gstate = PyGILState_Ensure();

    const char* module_name = "texture_baker";

    // Import the module by name. PyImport_ImportModule returns a NEW
    // reference, so it must be released on every exit path (the original
    // leaked it).
    PyObject* module = PyImport_ImportModule(module_name);
    if (!module) {
        PyGILState_Release(gstate);
        throw std::runtime_error("Could not import the module: " + std::string(module_name));
    }

    // Get the filename of the module (also returns a new reference)
    PyObject* filename_obj = PyModule_GetFilenameObject(module);
    if (filename_obj) {
        std::string path = PyUnicode_AsUTF8(filename_obj);
        Py_DECREF(filename_obj);
        Py_DECREF(module); // FIX: release the module reference
        PyGILState_Release(gstate);

        // Get the directory part of the path (removing the __init__.py)
        std::filesystem::path module_path = std::filesystem::path(path).parent_path();

        // Append the 'csrc' directory to the path
        module_path /= "csrc";

        return module_path;
    } else {
        Py_DECREF(module); // FIX: release the module reference on the error path too
        PyGILState_Release(gstate);
        throw std::runtime_error("Could not retrieve the module filename.");
    }
}
67
+
68
// Concatenate baker.h and baker_kernel.metal into one source string for
// runtime compilation (the shader cannot #include project headers itself, so
// the header is inlined ahead of the kernel source).
NSString *get_shader_sources_as_string()
{
    const std::filesystem::path csrc_path = get_extension_path();
    const std::string shader_path = (csrc_path / "baker_kernel.metal").string();
    const std::string shader_header_path = (csrc_path / "baker.h").string();
    // Load the Metal shader from the specified path
    NSError *error = nil;

    NSString* shaderHeaderSource = [
        NSString stringWithContentsOfFile:[NSString stringWithUTF8String:shader_header_path.c_str()]
        encoding:NSUTF8StringEncoding
        error:&error];
    // NOTE(review): this checks `error` rather than a nil return value, which
    // relies on the API leaving `error` untouched on success — confirm, or
    // prefer checking `shaderHeaderSource == nil`.
    if (error) {
        throw std::runtime_error("Failed to load baker.h: " + std::string(error.localizedDescription.UTF8String));
    }

    NSString* shaderSource = [
        NSString stringWithContentsOfFile:[NSString stringWithUTF8String:shader_path.c_str()]
        encoding:NSUTF8StringEncoding
        error:&error];
    if (error) {
        throw std::runtime_error("Failed to load Metal shader: " + std::string(error.localizedDescription.UTF8String));
    }

    // The header must come first so the kernel source sees its type definitions.
    NSString *fullSource = [shaderHeaderSource stringByAppendingString:shaderSource];

    return fullSource;
}
96
+
97
+ namespace texture_baker_cpp
98
+ {
99
// MPS implementation of `rasterize`: build a 2-D BVH over the UV triangles on
// the CPU, then dispatch a Metal kernel that writes (u, v, w, triangle_index)
// per output pixel, with triangle_index = -1 meaning "no coverage".
// Returns a (bake_resolution, bake_resolution, 4) float32 MPS tensor.
torch::Tensor rasterize_gpu(
    torch::Tensor uv,
    torch::Tensor indices,
    int64_t bake_resolution)
{
    TORCH_CHECK(uv.device().is_mps(), "uv must be a MPS tensor");
    TORCH_CHECK(uv.is_contiguous(), "uv must be contiguous");
    TORCH_CHECK(indices.is_contiguous(), "indices must be contiguous");

    // FIX: the uv dtype error message previously printed indices' dtype.
    TORCH_CHECK(uv.scalar_type() == torch::kFloat32, "Unsupported data type: ", uv.scalar_type());
    TORCH_CHECK(indices.scalar_type() == torch::kInt32, "Unsupported data type: ", indices.scalar_type());

    torch::Tensor rast_result = torch::empty({bake_resolution, bake_resolution, 4}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kMPS)).contiguous();

    @autoreleasepool {
        // The BVH is built on the CPU from host copies of the inputs.
        auto vertices_cpu = uv.contiguous().cpu();
        auto indices_cpu = indices.contiguous().cpu();

        const tb_float2 *vertices_cpu_ptr = (tb_float2*)vertices_cpu.contiguous().data_ptr<float>();
        const tb_int3 *tris_cpu_ptr = (tb_int3*)indices_cpu.contiguous().data_ptr<int>();

        BVH bvh;
        bvh.build(vertices_cpu_ptr, tris_cpu_ptr, indices.size(0));

        id<MTLDevice> device = MTLCreateSystemDefaultDevice();

        // Shader source is compiled at runtime (baker.h prepended to the
        // .metal source).
        NSString *fullSource = get_shader_sources_as_string();

        // Create a compute pipeline state object using the helper function
        id<MTLComputePipelineState> bake_uv_PSO = createComputePipelineState(device, fullSource, "kernel_bake_uv");

        // Get a reference to the command buffer for the MPS stream.
        id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference");

        // Get a reference to the dispatch queue for the MPS stream, which encodes the synchronization with the CPU.
        dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue();

        // dispatch_sync blocks until encoding is committed, so the bvh arrays
        // wrapped no-copy below remain alive long enough.
        dispatch_sync(serialQueue, ^(){
            // Start a compute pass.
            id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
            TORCH_CHECK(computeEncoder, "Failed to create compute command encoder");

            // Get Metal buffers directly from PyTorch tensors
            auto uv_buf = getMTLBufferStorage(uv.contiguous());
            auto indices_buf = getMTLBufferStorage(indices.contiguous());
            auto rast_result_buf = getMTLBufferStorage(rast_result);

            const int width = bake_resolution;
            const int height = bake_resolution;
            const int num_indices = indices.size(0);
            const int bvh_root = bvh.root;

            // Wrap the existing CPU memory in Metal buffers with shared memory.
            // NOTE(review): newBufferWithBytesNoCopy documents a page-aligned
            // pointer requirement; std::vector storage is not guaranteed to be
            // page aligned — confirm these return non-nil on all devices.
            id<MTLBuffer> nodesBuffer = [device newBufferWithBytesNoCopy:(void*)bvh.nodes.data() length:sizeof(BVHNode) * bvh.nodes.size() options:MTLResourceStorageModeShared deallocator:nil];
            id<MTLBuffer> trianglesBuffer = [device newBufferWithBytesNoCopy:(void*)bvh.triangles.data() length:sizeof(Triangle) * bvh.triangles.size() options:MTLResourceStorageModeShared deallocator:nil];
            id<MTLBuffer> triangleIndicesBuffer = [device newBufferWithBytesNoCopy:(void*)bvh.triangle_indices.data() length:sizeof(int) * bvh.triangle_indices.size() options:MTLResourceStorageModeShared deallocator:nil];

            [computeEncoder setComputePipelineState:bake_uv_PSO];
            [computeEncoder setBuffer:uv_buf offset:uv.storage_offset() * uv.element_size() atIndex:0];
            [computeEncoder setBuffer:indices_buf offset:indices.storage_offset() * indices.element_size() atIndex:1];
            [computeEncoder setBuffer:rast_result_buf offset:rast_result.storage_offset() * rast_result.element_size() atIndex:2];
            [computeEncoder setBuffer:nodesBuffer offset:0 atIndex:3];
            [computeEncoder setBuffer:trianglesBuffer offset:0 atIndex:4];
            [computeEncoder setBuffer:triangleIndicesBuffer offset:0 atIndex:5];
            [computeEncoder setBytes:&bvh_root length:sizeof(int) atIndex:6];
            [computeEncoder setBytes:&width length:sizeof(int) atIndex:7];
            [computeEncoder setBytes:&height length:sizeof(int) atIndex:8];
            [computeEncoder setBytes:&num_indices length:sizeof(int) atIndex:9];

            // Calculate a thread group size.
            // NOTE(review): assumes bake_resolution is a multiple of 16;
            // otherwise trailing rows/columns are never dispatched.
            int block_size = 16;
            MTLSize threadgroupSize = MTLSizeMake(block_size, block_size, 1); // Fixed threadgroup size
            MTLSize numThreadgroups = MTLSizeMake(bake_resolution / block_size, bake_resolution / block_size, 1);

            // Encode the compute command.
            [computeEncoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadgroupSize];
            [computeEncoder endEncoding];

            // Commit the work.
            torch::mps::commit();
        });
    }

    return rast_result;
}
185
+
186
// MPS implementation of `interpolate`: blend per-vertex attributes with the
// barycentric weights stored in `rast` (one Metal thread per pixel).
// Returns a (rast.size(0), rast.size(1), 3) float32 MPS tensor.
torch::Tensor interpolate_gpu(
    torch::Tensor attr,
    torch::Tensor indices,
    torch::Tensor rast)
{
    TORCH_CHECK(attr.is_contiguous(), "attr must be contiguous");
    TORCH_CHECK(indices.is_contiguous(), "indices must be contiguous");
    TORCH_CHECK(rast.is_contiguous(), "rast must be contiguous");

    torch::Tensor pos_bake = torch::empty({rast.size(0), rast.size(1), 3}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kMPS)).contiguous();
    // NOTE(review): csrc_path is unused below — the shader loader resolves the
    // path itself. Kept because get_extension_path() also throws if the
    // package cannot be located; confirm whether that side effect is intended.
    std::filesystem::path csrc_path = get_extension_path();

    @autoreleasepool {
        id<MTLDevice> device = MTLCreateSystemDefaultDevice();

        NSString *fullSource = get_shader_sources_as_string();
        // Create a compute pipeline state object using the helper function
        id<MTLComputePipelineState> interpolate_PSO = createComputePipelineState(device, fullSource, "kernel_interpolate");

        // Get a reference to the command buffer for the MPS stream.
        id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference");

        // Get a reference to the dispatch queue for the MPS stream, which encodes the synchronization with the CPU.
        dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue();

        dispatch_sync(serialQueue, ^(){
            // Start a compute pass.
            id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
            TORCH_CHECK(computeEncoder, "Failed to create compute command encoder");

            // Get Metal buffers directly from PyTorch tensors
            auto attr_buf = getMTLBufferStorage(attr.contiguous());
            auto indices_buf = getMTLBufferStorage(indices.contiguous());
            auto rast_buf = getMTLBufferStorage(rast.contiguous());
            auto pos_bake_buf = getMTLBufferStorage(pos_bake);

            int width = rast.size(0);
            int height = rast.size(1);

            [computeEncoder setComputePipelineState:interpolate_PSO];
            [computeEncoder setBuffer:attr_buf offset:attr.storage_offset() * attr.element_size() atIndex:0];
            [computeEncoder setBuffer:indices_buf offset:indices.storage_offset() * indices.element_size() atIndex:1];
            [computeEncoder setBuffer:rast_buf offset:rast.storage_offset() * rast.element_size() atIndex:2];
            [computeEncoder setBuffer:pos_bake_buf offset:pos_bake.storage_offset() * pos_bake.element_size() atIndex:3];
            [computeEncoder setBytes:&width length:sizeof(int) atIndex:4];
            [computeEncoder setBytes:&height length:sizeof(int) atIndex:5];

            // Calculate a thread group size.
            // NOTE(review): both threadgroup counts use rast.size(0) — this
            // assumes a square raster whose side is a multiple of 16.
            int block_size = 16;
            MTLSize threadgroupSize = MTLSizeMake(block_size, block_size, 1); // Fixed threadgroup size
            MTLSize numThreadgroups = MTLSizeMake(rast.size(0) / block_size, rast.size(0) / block_size, 1);

            // Encode the compute command.
            [computeEncoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadgroupSize];

            [computeEncoder endEncoding];

            // Commit the work.
            torch::mps::commit();
        });
    }

    return pos_bake;
}
252
+
253
// Register the MPS backends for the ops declared by the texture_baker_cpp
// library (CPU declarations live elsewhere).
TORCH_LIBRARY_IMPL(texture_baker_cpp, MPS, m)
{
    m.impl("interpolate", &interpolate_gpu);
    m.impl("rasterize", &rasterize_gpu);
}

} // namespace texture_baker_cpp
uv_unwrapper/README.md ADDED
File without changes
uv_unwrapper/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ numpy
uv_unwrapper/setup.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import glob
3
+ import os
4
+
5
+ from setuptools import find_packages, setup
6
+ from torch.utils.cpp_extension import (
7
+ BuildExtension,
8
+ CppExtension,
9
+ )
10
+
11
+ library_name = "uv_unwrapper"
12
+
13
+
14
def get_extensions():
    """Build the list of C++ extension modules for ``setup()``.

    Returns:
        A list with a single :class:`CppExtension`, or ``None`` when no C++
        sources are found (so ``setup`` skips extension compilation).
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    if debug_mode:
        print("Compiling in debug mode")

    # MPS availability is used as a proxy for "building on macOS".
    is_mac = torch.backends.mps.is_available()
    use_native_arch = not is_mac and os.getenv("USE_NATIVE_ARCH", "1") == "1"
    extension = CppExtension

    extra_link_args = []
    # FIX: the conditional must be parenthesized. The previous
    # `[...] + ["-march=native"] if use_native_arch else []` applied the
    # ternary to the whole concatenation (ternary binds looser than +), so
    # the entire flag list silently became [] when the condition was false.
    extra_compile_args = {
        "cxx": [
            "-O3" if not debug_mode else "-O0",
            "-fdiagnostics-color=always",
            ("-Xclang " if is_mac else "") + "-fopenmp",
        ]
        + (["-march=native"] if use_native_arch else []),
    }
    if debug_mode:
        extra_compile_args["cxx"].append("-g")
        # Keep assert() active in debug builds.
        extra_compile_args["cxx"].append("-UNDEBUG")
        extra_link_args.extend(["-O0", "-g"])

    define_macros = []
    extensions = []

    this_dir = os.path.dirname(os.path.curdir)
    sources = glob.glob(
        os.path.join(this_dir, library_name, "csrc", "**", "*.cpp"), recursive=True
    )

    if len(sources) == 0:
        print("No source files found for extension, skipping extension compilation")
        return None

    extensions.append(
        extension(
            name=f"{library_name}._C",
            sources=sources,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,
            # FIX: same precedence bug as above — without parentheses the base
            # library list was dropped entirely on non-mac builds.
            libraries=[
                "c10",
                "torch",
                "torch_cpu",
                "torch_python",
            ]
            + (["omp"] if is_mac else []),
        )
    )

    print(extensions)

    return extensions
+
68
+
69
# Read the long description up front with an explicit encoding so the file
# handle is closed promptly (the previous bare open() leaked it and depended
# on the platform default encoding).
with open("README.md", encoding="utf-8") as readme:
    _long_description = readme.read()

setup(
    name=library_name,
    version="0.0.1",
    packages=find_packages(),
    ext_modules=get_extensions(),
    install_requires=[],
    description="Box projection based UV unwrapper",
    long_description=_long_description,
    long_description_content_type="text/markdown",
    cmdclass={"build_ext": BuildExtension},
)
uv_unwrapper/uv_unwrapper/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import torch # noqa: F401
2
+
3
+ from . import _C # noqa: F401
4
+ from .unwrap import Unwrapper
5
+
6
+ __all__ = ["Unwrapper"]
uv_unwrapper/uv_unwrapper/csrc/bvh.cpp ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ #include "bvh.h"
4
+ #include "common.h"
5
+ #include <cstring>
6
+ #include <iostream>
7
+ #include <queue>
8
+ #include <tuple>
9
+
10
+ namespace UVUnwrapper {
11
// Build a BVH over `num_indices` 2-D triangles.
// `tri` is copied into an owned array; `actual_idx` maps each triangle back
// to its index in the caller's original ordering.
BVH::BVH(Triangle *tri, int *actual_idx, const size_t &num_indices) {
  // Copy tri to triangle
  triangle = new Triangle[num_indices];
  memcpy(triangle, tri, num_indices * sizeof(Triangle));

  // Copy actual_idx to actualIdx
  actualIdx = new int[num_indices];
  memcpy(actualIdx, actual_idx, num_indices * sizeof(int));

  triIdx = new int[num_indices];
  triCount = num_indices;

  // Worst case ~2N nodes for a binary tree over N leaves, plus slack.
  bvhNode = new BVHNode[triCount * 2 + 64];
  // Node 1 is intentionally skipped (nodesUsed starts at 2) — presumably so
  // sibling pairs stay aligned; TODO confirm against BVHNode layout.
  nodesUsed = 2;
  // NOTE(review): only the first 2*triCount nodes are zeroed, not the +64
  // slack — confirm any slack node is fully written before being read.
  memset(bvhNode, 0, triCount * 2 * sizeof(BVHNode));

  // populate triangle index array
  for (int i = 0; i < triCount; i++)
    triIdx[i] = i;

  BVHNode &root = bvhNode[0];

  root.start = 0, root.end = triCount;
  AABB centroidBounds;
  UpdateNodeBounds(0, centroidBounds);

  // subdivide recursively
  Subdivide(0, nodesUsed, centroidBounds);
}
40
+
41
+ BVH::BVH(const BVH &other)
42
+ : BVH(other.triangle, other.triIdx, other.triCount) {}
43
+
44
+ BVH::BVH(BVH &&other) noexcept // move constructor
45
+ : triIdx(std::exchange(other.triIdx, nullptr)),
46
+ actualIdx(std::exchange(other.actualIdx, nullptr)),
47
+ triangle(std::exchange(other.triangle, nullptr)),
48
+ bvhNode(std::exchange(other.bvhNode, nullptr)) {}
49
+
50
// Copy assignment via copy-and-swap: build a temporary copy and move-assign
// it into *this (gives the strong exception guarantee for free).
BVH &BVH::operator=(const BVH &other) // copy assignment
{
  return *this = BVH(other);
}
54
+
55
// Move assignment: swap all state with `other`; the previously owned
// resources are released when `other` is destroyed.
BVH &BVH::operator=(BVH &&other) noexcept // move assignment
{
  std::swap(triIdx, other.triIdx);
  std::swap(actualIdx, other.actualIdx);
  std::swap(triangle, other.triangle);
  std::swap(bvhNode, other.bvhNode);
  std::swap(triCount, other.triCount);
  std::swap(nodesUsed, other.nodesUsed);
  return *this;
}
65
+
66
+ BVH::~BVH() {
67
+ if (triIdx)
68
+ delete[] triIdx;
69
+ if (triangle)
70
+ delete[] triangle;
71
+ if (actualIdx)
72
+ delete[] actualIdx;
73
+ if (bvhNode)
74
+ delete[] bvhNode;
75
+ }
76
+
77
+ void BVH::UpdateNodeBounds(unsigned int nodeIdx, AABB &centroidBounds) {
78
+ BVHNode &node = bvhNode[nodeIdx];
79
+ #ifndef __ARM_ARCH_ISA_A64
80
+ #ifndef _MSC_VER
81
+ if (__builtin_cpu_supports("sse"))
82
+ #elif (defined(_M_AMD64) || defined(_M_X64))
83
+ // SSE supported on Windows
84
+ if constexpr (true)
85
+ #endif
86
+ {
87
+ __m128 min4 = _mm_set_ps1(FLT_MAX), max4 = _mm_set_ps1(FLT_MIN);
88
+ __m128 cmin4 = _mm_set_ps1(FLT_MAX), cmax4 = _mm_set_ps1(FLT_MIN);
89
+ for (int i = node.start; i < node.end; i += 2) {
90
+ Triangle &leafTri1 = triangle[triIdx[i]];
91
+ __m128 v0, v1, v2, centroid;
92
+ if (i + 1 < node.end) {
93
+ const Triangle leafTri2 = triangle[triIdx[i + 1]];
94
+
95
+ v0 = _mm_set_ps(leafTri1.v0.x, leafTri1.v0.y, leafTri2.v0.x,
96
+ leafTri2.v0.y);
97
+ v1 = _mm_set_ps(leafTri1.v1.x, leafTri1.v1.y, leafTri2.v1.x,
98
+ leafTri2.v1.y);
99
+ v2 = _mm_set_ps(leafTri1.v2.x, leafTri1.v2.y, leafTri2.v2.x,
100
+ leafTri2.v2.y);
101
+ centroid = _mm_set_ps(leafTri1.centroid.x, leafTri1.centroid.y,
102
+ leafTri2.centroid.x, leafTri2.centroid.y);
103
+ } else {
104
+ // Otherwise do some duplicated work
105
+ v0 = _mm_set_ps(leafTri1.v0.x, leafTri1.v0.y, leafTri1.v0.x,
106
+ leafTri1.v0.y);
107
+ v1 = _mm_set_ps(leafTri1.v1.x, leafTri1.v1.y, leafTri1.v1.x,
108
+ leafTri1.v1.y);
109
+ v2 = _mm_set_ps(leafTri1.v2.x, leafTri1.v2.y, leafTri1.v2.x,
110
+ leafTri1.v2.y);
111
+ centroid = _mm_set_ps(leafTri1.centroid.x, leafTri1.centroid.y,
112
+ leafTri1.centroid.x, leafTri1.centroid.y);
113
+ }
114
+
115
+ min4 = _mm_min_ps(min4, v0);
116
+ max4 = _mm_max_ps(max4, v0);
117
+ min4 = _mm_min_ps(min4, v1);
118
+ max4 = _mm_max_ps(max4, v1);
119
+ min4 = _mm_min_ps(min4, v2);
120
+ max4 = _mm_max_ps(max4, v2);
121
+ cmin4 = _mm_min_ps(cmin4, centroid);
122
+ cmax4 = _mm_max_ps(cmax4, centroid);
123
+ }
124
+ float min_values[4], max_values[4], cmin_values[4], cmax_values[4];
125
+ _mm_store_ps(min_values, min4);
126
+ _mm_store_ps(max_values, max4);
127
+ _mm_store_ps(cmin_values, cmin4);
128
+ _mm_store_ps(cmax_values, cmax4);
129
+
130
+ node.bbox.min.x = std::min(min_values[3], min_values[1]);
131
+ node.bbox.min.y = std::min(min_values[2], min_values[0]);
132
+ node.bbox.max.x = std::max(max_values[3], max_values[1]);
133
+ node.bbox.max.y = std::max(max_values[2], max_values[0]);
134
+
135
+ centroidBounds.min.x = std::min(cmin_values[3], cmin_values[1]);
136
+ centroidBounds.min.y = std::min(cmin_values[2], cmin_values[0]);
137
+ centroidBounds.max.x = std::max(cmax_values[3], cmax_values[1]);
138
+ centroidBounds.max.y = std::max(cmax_values[2], cmax_values[0]);
139
+ }
140
+ #else
141
+ if constexpr (false) {
142
+ }
143
+ #endif
144
+ else {
145
+ node.bbox.invalidate();
146
+ centroidBounds.invalidate();
147
+
148
+ // Calculate the bounding box for the node
149
+ for (int i = node.start; i < node.end; ++i) {
150
+ const Triangle &tri = triangle[triIdx[i]];
151
+ node.bbox.grow(tri.v0);
152
+ node.bbox.grow(tri.v1);
153
+ node.bbox.grow(tri.v2);
154
+ centroidBounds.grow(tri.centroid);
155
+ }
156
+ }
157
+ }
158
+
159
/**
 * @brief Iteratively subdivide the subtree below root_idx using binned-SAH
 * splits.
 *
 * Processes nodes breadth-first via an explicit queue (no recursion). For
 * each node, FindBestSplitPlane proposes the cheapest split; if that split
 * is not cheaper than keeping the node as a leaf, the node is finalized as
 * a leaf (left = right = -1). Otherwise triIdx[start, end) is partitioned
 * in place around the chosen bin border and two children are allocated
 * from nodePtr.
 *
 * @param root_idx index of the subtree root in bvhNode
 * @param nodePtr next free slot in bvhNode; advanced as children are created
 * @param rootCentroidBounds centroid bounds of the root's triangles
 */
void BVH::Subdivide(unsigned int root_idx, unsigned int &nodePtr,
                    AABB &rootCentroidBounds) {
  // Create a queue for the nodes to be subdivided
  std::queue<std::tuple<unsigned int, AABB>> nodeQueue;
  nodeQueue.push(std::make_tuple(root_idx, rootCentroidBounds));

  while (!nodeQueue.empty()) {
    // Get the next node to process from the queue (copies the AABB out)
    auto [node_idx, centroidBounds] = nodeQueue.front();
    nodeQueue.pop();
    BVHNode &node = bvhNode[node_idx];

    int axis, splitPos;
    float cost = FindBestSplitPlane(node, axis, splitPos, centroidBounds);

    // Splitting must beat the SAH cost of leaving this node as a leaf.
    if (cost >= node.calculate_node_cost()) {
      node.left = node.right = -1;
      continue; // Move on to the next node in the queue
    }

    // Two-pointer in-place partition of triIdx[start, end) by bin index:
    // triangles whose bin is left of splitPos go to the front.
    int i = node.start;
    int j = node.end - 1;
    // Division is safe: FindBestSplitPlane only selects an axis whose
    // centroid extent is >= 1e-8f.
    float scale = BINS / (centroidBounds.max[axis] - centroidBounds.min[axis]);
    while (i <= j) {
      int binIdx =
          std::min(BINS - 1, (int)((triangle[triIdx[i]].centroid[axis] -
                                    centroidBounds.min[axis]) *
                                   scale));
      if (binIdx < splitPos)
        i++;
      else
        std::swap(triIdx[i], triIdx[j--]);
    }

    // Degenerate partition (all triangles on one side): keep as a leaf.
    int leftCount = i - node.start;
    if (leftCount == 0 || leftCount == (int)node.num_triangles()) {
      node.left = node.right = -1;
      continue; // Move on to the next node in the queue
    }

    int mid = i;

    // Create child nodes covering [start, mid) and [mid, end)
    int leftChildIdx = nodePtr++;
    int rightChildIdx = nodePtr++;
    bvhNode[leftChildIdx].start = node.start;
    bvhNode[leftChildIdx].end = mid;
    bvhNode[rightChildIdx].start = mid;
    bvhNode[rightChildIdx].end = node.end;
    node.left = leftChildIdx;
    node.right = rightChildIdx;

    // Update the bounds for the child nodes and push them onto the queue.
    // UpdateNodeBounds overwrites centroidBounds with the child's centroid
    // bounds, which the queue entry then copies.
    UpdateNodeBounds(leftChildIdx, centroidBounds);
    nodeQueue.push(std::make_tuple(leftChildIdx, centroidBounds));

    UpdateNodeBounds(rightChildIdx, centroidBounds);
    nodeQueue.push(std::make_tuple(rightChildIdx, centroidBounds));
  }
}
221
+
222
/**
 * @brief Find the cheapest binned-SAH split plane for a node.
 *
 * Evaluates BINS - 1 candidate planes per axis (x and y only — this BVH is
 * 2D) by binning triangle vertices and sweeping partial bounds from both
 * ends. On x86 an SSE path accumulates vertex bounds with
 * _mm_min_ps/_mm_max_ps; otherwise a scalar AABB path is used.
 *
 * @param node node whose triangles are binned
 * @param best_axis out: axis of the best plane; only written when some
 * plane beats the running best (unchanged if no axis is splittable)
 * @param best_pos out: bin border index (1..BINS-1) of the best plane
 * @param centroidBounds centroid bounds used to map centroids to bins
 * @return SAH cost of the best plane, or FLT_MAX if no axis was splittable
 */
float BVH::FindBestSplitPlane(BVHNode &node, int &best_axis, int &best_pos,
                              AABB &centroidBounds) {
  float best_cost = FLT_MAX;

  for (int axis = 0; axis < 2; ++axis) // We use 2 as we have only x and y
  {
    float boundsMin = centroidBounds.min[axis];
    float boundsMax = centroidBounds.max[axis];
    // Skip axes with (near-)zero centroid extent: nothing to split, and
    // the scale below would divide by zero. Or floating point precision.
    if ((boundsMin == boundsMax) || (boundsMax - boundsMin < 1e-8f)) {
      continue;
    }

    // populate the bins
    float scale = BINS / (boundsMax - boundsMin);
    float leftCountArea[BINS - 1], rightCountArea[BINS - 1];
    int leftSum = 0, rightSum = 0;
#ifndef __ARM_ARCH_ISA_A64
#ifndef _MSC_VER
    if (__builtin_cpu_supports("sse"))
#elif (defined(_M_AMD64) || defined(_M_X64))
    // SSE supported on Windows
    if constexpr (true)
#endif
    {
      // NOTE(review): FLT_MIN is the smallest *positive* float, not the
      // most negative; this max-init is only correct if all coordinates
      // are non-negative (UV space) — confirm.
      __m128 min4[BINS], max4[BINS];
      unsigned int count[BINS];
      for (unsigned int i = 0; i < BINS; i++)
        min4[i] = _mm_set_ps1(FLT_MAX), max4[i] = _mm_set_ps1(FLT_MIN),
        count[i] = 0;
      for (int i = node.start; i < node.end; i++) {
        Triangle &tri = triangle[triIdx[i]];
        int binIdx =
            std::min(BINS - 1, (int)((tri.centroid[axis] - boundsMin) * scale));
        count[binIdx]++;

        // Lanes 3/2 hold x/y; lanes 1/0 are unused padding.
        __m128 v0 = _mm_set_ps(tri.v0.x, tri.v0.y, 0.0f, 0.0f);
        __m128 v1 = _mm_set_ps(tri.v1.x, tri.v1.y, 0.0f, 0.0f);
        __m128 v2 = _mm_set_ps(tri.v2.x, tri.v2.y, 0.0f, 0.0f);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v0);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v0);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v1);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v1);
        min4[binIdx] = _mm_min_ps(min4[binIdx], v2);
        max4[binIdx] = _mm_max_ps(max4[binIdx], v2);
      }
      // gather data for the 7 planes between the 8 bins
      __m128 leftMin4 = _mm_set_ps1(FLT_MAX), rightMin4 = leftMin4;
      __m128 leftMax4 = _mm_set_ps1(FLT_MIN), rightMax4 = leftMax4;
      for (int i = 0; i < BINS - 1; i++) {
        leftSum += count[i];
        rightSum += count[BINS - 1 - i];
        leftMin4 = _mm_min_ps(leftMin4, min4[i]);
        rightMin4 = _mm_min_ps(rightMin4, min4[BINS - 2 - i]);
        leftMax4 = _mm_max_ps(leftMax4, max4[i]);
        rightMax4 = _mm_max_ps(rightMax4, max4[BINS - 2 - i]);
        float le[4], re[4];
        _mm_store_ps(le, _mm_sub_ps(leftMax4, leftMin4));
        _mm_store_ps(re, _mm_sub_ps(rightMax4, rightMin4));
        // SSE order goes from back to front
        leftCountArea[i] = leftSum * (le[2] * le[3]); // 2D area calculation
        rightCountArea[BINS - 2 - i] =
            rightSum * (re[2] * re[3]); // 2D area calculation
      }
    }
#else
    if constexpr (false) {
    }
#endif
    else {
      // Scalar fallback: accumulate per-bin AABBs, then sweep from both
      // sides to get count*area for each candidate plane.
      struct Bin {
        AABB bounds;
        int triCount = 0;
      } bin[BINS];
      for (int i = node.start; i < node.end; i++) {
        Triangle &tri = triangle[triIdx[i]];
        int binIdx =
            std::min(BINS - 1, (int)((tri.centroid[axis] - boundsMin) * scale));
        bin[binIdx].triCount++;
        bin[binIdx].bounds.grow(tri.v0);
        bin[binIdx].bounds.grow(tri.v1);
        bin[binIdx].bounds.grow(tri.v2);
      }
      // gather data for the 7 planes between the 8 bins
      AABB leftBox, rightBox;
      for (int i = 0; i < BINS - 1; i++) {
        leftSum += bin[i].triCount;
        leftBox.grow(bin[i].bounds);
        leftCountArea[i] = leftSum * leftBox.area();
        rightSum += bin[BINS - 1 - i].triCount;
        rightBox.grow(bin[BINS - 1 - i].bounds);
        rightCountArea[BINS - 2 - i] = rightSum * rightBox.area();
      }
    }

    // calculate SAH cost for the 7 planes
    // NOTE(review): this scale value is never read again — dead store.
    scale = (boundsMax - boundsMin) / BINS;
    for (int i = 0; i < BINS - 1; i++) {
      const float planeCost = leftCountArea[i] + rightCountArea[i];
      if (planeCost < best_cost)
        best_axis = axis, best_pos = i + 1, best_cost = planeCost;
    }
  }
  return best_cost;
}
327
+
328
std::vector<int> BVH::Intersect(Triangle &tri_intersect) {
  /**
   * @brief Collect all triangles in the BVH that overlap a query triangle.
   *
   * Traverses the tree with an explicit fixed-size stack. The query
   * triangle itself (compared by vertex equality) is skipped.
   *
   * @param tri_intersect the triangle to test against the tree
   *
   * @return caller-side indices (mapped through actualIdx) of every
   * overlapping triangle; empty vector if none overlap
   *
   * @throws std::runtime_error if the traversal stack overflows
   */

  const int max_stack_size = 64;
  int node_stack[max_stack_size];
  int stack_size = 0;
  std::vector<int> intersected_triangles;

  node_stack[stack_size++] = 0; // Start with the root node (index 0)
  while (stack_size > 0) {
    int node_idx = node_stack[--stack_size];
    const BVHNode &node = bvhNode[node_idx];
    if (node.is_leaf()) {
      // Leaf: exact triangle-triangle tests against every triangle stored.
      for (int i = node.start; i < node.end; ++i) {
        const Triangle &tri = triangle[triIdx[i]];
        // Check that the triangle is not the same as the intersected triangle
        if (tri == tri_intersect)
          continue;
        if (tri_intersect.overlaps(tri)) {
          intersected_triangles.push_back(actualIdx[triIdx[i]]);
        }
      }
    } else {
      // Inner node: push children whose bbox overlaps the query.
      // Check right child first
      if (bvhNode[node.right].bbox.overlaps(tri_intersect)) {
        if (stack_size < max_stack_size) {
          node_stack[stack_size++] = node.right;
        } else {
          throw std::runtime_error("Node stack overflow");
        }
      }

      // Check left child
      if (bvhNode[node.left].bbox.overlaps(tri_intersect)) {
        if (stack_size < max_stack_size) {
          node_stack[stack_size++] = node.left;
        } else {
          throw std::runtime_error("Node stack overflow");
        }
      }
    }
  }
  return intersected_triangles; // Return all intersected triangle indices
}
379
+
380
+ } // namespace UVUnwrapper
uv_unwrapper/uv_unwrapper/csrc/bvh.h ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cfloat>
4
+ #include <cmath>
5
+ #ifndef __ARM_ARCH_ISA_A64
6
+ #include <immintrin.h>
7
+ #endif
8
+ #include <limits>
9
+ #include <vector>
10
+
11
+ #include "common.h"
12
+ #include "intersect.h"
13
+ /**
14
+ * Based on https://github.com/jbikker/bvh_article released under the unlicense.
15
+ */
16
+
17
+ // bin count for binned BVH building
18
+ #define BINS 8
19
+
20
+ namespace UVUnwrapper {
21
+ // minimalist triangle struct
22
+ struct alignas(32) Triangle {
23
+ uv_float2 v0;
24
+ uv_float2 v1;
25
+ uv_float2 v2;
26
+ uv_float2 centroid;
27
+
28
+ bool overlaps(const Triangle &other) {
29
+ // return tri_tri_overlap_test_2d(v0, v1, v2, other.v0, other.v1, other.v2);
30
+ return triangle_triangle_intersection(v0, v1, v2, other.v0, other.v1,
31
+ other.v2);
32
+ }
33
+
34
+ bool operator==(const Triangle &rhs) const {
35
+ return v0 == rhs.v0 && v1 == rhs.v1 && v2 == rhs.v2;
36
+ }
37
+ };
38
+
39
+ // minimalist AABB struct with grow functionality
40
+ struct alignas(16) AABB {
41
+ // Init bounding boxes with max/min
42
+ uv_float2 min = {FLT_MAX, FLT_MAX};
43
+ uv_float2 max = {FLT_MIN, FLT_MIN};
44
+
45
+ void grow(const uv_float2 &p) {
46
+ min.x = std::min(min.x, p.x);
47
+ min.y = std::min(min.y, p.y);
48
+ max.x = std::max(max.x, p.x);
49
+ max.y = std::max(max.y, p.y);
50
+ }
51
+
52
+ void grow(const AABB &b) {
53
+ if (b.min.x != FLT_MAX) {
54
+ grow(b.min);
55
+ grow(b.max);
56
+ }
57
+ }
58
+
59
+ bool overlaps(const Triangle &tri) {
60
+ return triangle_aabb_intersection(min, max, tri.v0, tri.v1, tri.v2);
61
+ }
62
+
63
+ float area() const {
64
+ uv_float2 extent = {max.x - min.x, max.y - min.y};
65
+ return extent.x * extent.y;
66
+ }
67
+
68
+ void invalidate() {
69
+ min = {FLT_MAX, FLT_MAX};
70
+ max = {FLT_MIN, FLT_MIN};
71
+ }
72
+ };
73
+
74
+ // 32-byte BVH node struct
75
+ struct alignas(32) BVHNode {
76
+ AABB bbox; // 16
77
+ int start = 0, end = 0; // 8
78
+ int left, right;
79
+
80
+ int num_triangles() const { return end - start; }
81
+
82
+ bool is_leaf() const { return left == -1 && right == -1; }
83
+
84
+ float calculate_node_cost() {
85
+ float area = bbox.area();
86
+ return num_triangles() * area;
87
+ }
88
+ };
89
+
90
// 2D BVH over triangles; supports querying which triangles overlap a
// given triangle (see Intersect).
class BVH {
public:
  BVH() = default;
  BVH(BVH &&other) noexcept;            // move-construct
  BVH(const BVH &other);                // copy-construct
  BVH &operator=(const BVH &other);     // copy-assign
  BVH &operator=(BVH &&other) noexcept; // move-assign
  // Build from an array of triangles; actual_idx maps internal triangle
  // order back to caller-side indices (returned by Intersect).
  BVH(Triangle *tri, int *actual_idx, const size_t &num_indices);
  ~BVH();

  // Return caller-side indices of all triangles overlapping `triangle`.
  std::vector<int> Intersect(Triangle &triangle);

private:
  // Breadth-first SAH subdivision starting at node_idx.
  void Subdivide(unsigned int node_idx, unsigned int &nodePtr,
                 AABB &centroidBounds);
  // Recompute a node's bbox and write its centroid bounds into
  // centroidBounds.
  void UpdateNodeBounds(unsigned int nodeIdx, AABB &centroidBounds);
  // Binned-SAH search; returns best cost, writes axis/splitPos.
  float FindBestSplitPlane(BVHNode &node, int &axis, int &splitPos,
                           AABB &centroidBounds);

public:
  int *triIdx = nullptr;    // triangle index permutation used by the tree
  int *actualIdx = nullptr; // maps internal indices to caller indices
  unsigned int triCount;    // presumably number of triangles — confirm
  unsigned int nodesUsed;   // presumably nodes allocated in bvhNode — confirm
  BVHNode *bvhNode = nullptr;
  Triangle *triangle = nullptr;
};
117
+
118
+ } // namespace UVUnwrapper
uv_unwrapper/uv_unwrapper/csrc/common.h ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <array>
4
+ #include <cmath>
5
+ #include <iostream>
6
+ #include <stdexcept>
7
+
8
+ const float EPSILON = 1e-7f;
9
+
10
+ // Structure to represent a 2D point or vector
11
// 2D float vector with both named (.x/.y) and indexed (operator[]) access.
// 8-byte aligned so a pair packs cleanly into SIMD registers.
union alignas(8) uv_float2 {
  struct {
    float x, y;
  };

  float data[2];

  // Bounds-checked element access; throws on an index outside [0, 1].
  float &operator[](size_t idx) {
    if (idx >= 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const float &operator[](size_t idx) const {
    if (idx >= 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Exact component-wise equality.
  bool operator==(const uv_float2 &rhs) const {
    return (x == rhs.x) && (y == rhs.y);
  }
};
34
+
35
+ // Do not align as this is specifically tweaked for BVHNode
36
+ union uv_float3 {
37
+ struct {
38
+ float x, y, z;
39
+ };
40
+
41
+ float data[3];
42
+
43
+ float &operator[](size_t idx) {
44
+ if (idx > 3)
45
+ throw std::runtime_error("bad index");
46
+ return data[idx];
47
+ }
48
+
49
+ const float &operator[](size_t idx) const {
50
+ if (idx > 3)
51
+ throw std::runtime_error("bad index");
52
+ return data[idx];
53
+ }
54
+
55
+ bool operator==(const uv_float3 &rhs) const {
56
+ return x == rhs.x && y == rhs.y && z == rhs.z;
57
+ }
58
+ };
59
+
60
// 4D float vector, 16-byte aligned; named (.x/.y/.z/.w) or indexed access.
union alignas(16) uv_float4 {
  struct {
    float x, y, z, w;
  };

  float data[4];

  // Bounds-checked element access; throws outside [0, 3].
  float &operator[](size_t idx) {
    if (idx >= 4)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const float &operator[](size_t idx) const {
    if (idx >= 4)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Exact component-wise equality.
  bool operator==(const uv_float4 &rhs) const {
    return (x == rhs.x) && (y == rhs.y) && (z == rhs.z) && (w == rhs.w);
  }
};

// 2D int vector, 8-byte aligned.
union alignas(8) uv_int2 {
  struct {
    int x, y;
  };

  int data[2];

  // Bounds-checked element access; throws outside [0, 1].
  int &operator[](size_t idx) {
    if (idx >= 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const int &operator[](size_t idx) const {
    if (idx >= 2)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Component-wise equality.
  bool operator==(const uv_int2 &rhs) const { return x == rhs.x && y == rhs.y; }
};

// 3D int vector, 4-byte aligned (no padding to 16).
union alignas(4) uv_int3 {
  struct {
    int x, y, z;
  };

  int data[3];

  // Bounds-checked element access; throws outside [0, 2].
  int &operator[](size_t idx) {
    if (idx >= 3)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const int &operator[](size_t idx) const {
    if (idx >= 3)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Component-wise equality.
  bool operator==(const uv_int3 &rhs) const {
    return x == rhs.x && y == rhs.y && z == rhs.z;
  }
};

// 4D int vector, 16-byte aligned.
union alignas(16) uv_int4 {
  struct {
    int x, y, z, w;
  };

  int data[4];

  // Bounds-checked element access; throws outside [0, 3].
  int &operator[](size_t idx) {
    if (idx >= 4)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  const int &operator[](size_t idx) const {
    if (idx >= 4)
      throw std::runtime_error("bad index");
    return data[idx];
  }

  // Component-wise equality.
  bool operator==(const uv_int4 &rhs) const {
    return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
  }
};
153
+
154
+ inline float calc_mean(float a, float b, float c) { return (a + b + c) / 3; }
155
+
156
+ // Create a triangle centroid
157
+ inline uv_float2 triangle_centroid(const uv_float2 &v0, const uv_float2 &v1,
158
+ const uv_float2 &v2) {
159
+ return {calc_mean(v0.x, v1.x, v2.x), calc_mean(v0.y, v1.y, v2.y)};
160
+ }
161
+
162
+ inline uv_float3 triangle_centroid(const uv_float3 &v0, const uv_float3 &v1,
163
+ const uv_float3 &v2) {
164
+ return {calc_mean(v0.x, v1.x, v2.x), calc_mean(v0.y, v1.y, v2.y),
165
+ calc_mean(v0.z, v1.z, v2.z)};
166
+ }
167
+
168
+ // Helper functions for vector math
169
+ inline uv_float2 operator-(const uv_float2 &a, const uv_float2 &b) {
170
+ return {a.x - b.x, a.y - b.y};
171
+ }
172
+
173
+ inline uv_float3 operator-(const uv_float3 &a, const uv_float3 &b) {
174
+ return {a.x - b.x, a.y - b.y, a.z - b.z};
175
+ }
176
+
177
+ inline uv_float2 operator+(const uv_float2 &a, const uv_float2 &b) {
178
+ return {a.x + b.x, a.y + b.y};
179
+ }
180
+
181
+ inline uv_float3 operator+(const uv_float3 &a, const uv_float3 &b) {
182
+ return {a.x + b.x, a.y + b.y, a.z + b.z};
183
+ }
184
+
185
+ inline uv_float2 operator*(const uv_float2 &a, float scalar) {
186
+ return {a.x * scalar, a.y * scalar};
187
+ }
188
+
189
+ inline uv_float3 operator*(const uv_float3 &a, float scalar) {
190
+ return {a.x * scalar, a.y * scalar, a.z * scalar};
191
+ }
192
+
193
+ inline float dot(const uv_float2 &a, const uv_float2 &b) {
194
+ return a.x * b.x + a.y * b.y;
195
+ }
196
+
197
+ inline float dot(const uv_float3 &a, const uv_float3 &b) {
198
+ return a.x * b.x + a.y * b.y + a.z * b.z;
199
+ }
200
+
201
+ inline float cross(const uv_float2 &a, const uv_float2 &b) {
202
+ return a.x * b.y - a.y * b.x;
203
+ }
204
+
205
+ inline uv_float3 cross(const uv_float3 &a, const uv_float3 &b) {
206
+ return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
207
+ }
208
+
209
+ inline uv_float2 abs_vec(const uv_float2 &v) {
210
+ return {std::abs(v.x), std::abs(v.y)};
211
+ }
212
+
213
+ inline uv_float2 min_vec(const uv_float2 &a, const uv_float2 &b) {
214
+ return {std::min(a.x, b.x), std::min(a.y, b.y)};
215
+ }
216
+
217
+ inline uv_float2 max_vec(const uv_float2 &a, const uv_float2 &b) {
218
+ return {std::max(a.x, b.x), std::max(a.y, b.y)};
219
+ }
220
+
221
+ inline float distance_to(const uv_float2 &a, const uv_float2 &b) {
222
+ return std::sqrt(std::pow(a.x - b.x, 2) + std::pow(a.y - b.y, 2));
223
+ }
224
+
225
+ inline float distance_to(const uv_float3 &a, const uv_float3 &b) {
226
+ return std::sqrt(std::pow(a.x - b.x, 2) + std::pow(a.y - b.y, 2) +
227
+ std::pow(a.z - b.z, 2));
228
+ }
229
+
230
+ inline uv_float2 normalize(const uv_float2 &v) {
231
+ float len = std::sqrt(v.x * v.x + v.y * v.y);
232
+ return {v.x / len, v.y / len};
233
+ }
234
+
235
+ inline uv_float3 normalize(const uv_float3 &v) {
236
+ float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
237
+ return {v.x / len, v.y / len, v.z / len};
238
+ }
239
+
240
+ inline float magnitude(const uv_float3 &v) {
241
+ return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
242
+ }
243
+
244
// Row-major 4x4 float matrix with basic arithmetic, determinant, power,
// and inversion (via double-precision cofactor expansion).
struct Matrix4 {
  std::array<std::array<float, 4>, 4> m; // m[row][col]

  // Default: all zeros except m[3][3] = 1 (NOT a full identity matrix).
  Matrix4() {
    for (auto &row : m) {
      row.fill(0.0f);
    }
    m[3][3] = 1.0f; // Identity matrix for 4th row and column
  }

  // Assign all 16 entries, row by row.
  void set(float m00, float m01, float m02, float m03, float m10, float m11,
           float m12, float m13, float m20, float m21, float m22, float m23,
           float m30, float m31, float m32, float m33) {
    m[0][0] = m00;
    m[0][1] = m01;
    m[0][2] = m02;
    m[0][3] = m03;
    m[1][0] = m10;
    m[1][1] = m11;
    m[1][2] = m12;
    m[1][3] = m13;
    m[2][0] = m20;
    m[2][1] = m21;
    m[2][2] = m22;
    m[2][3] = m23;
    m[3][0] = m30;
    m[3][1] = m31;
    m[3][2] = m32;
    m[3][3] = m33;
  }

  // 4x4 determinant by full cofactor expansion (24 signed products).
  float determinant() const {
    return m[0][3] * m[1][2] * m[2][1] * m[3][0] -
           m[0][2] * m[1][3] * m[2][1] * m[3][0] -
           m[0][3] * m[1][1] * m[2][2] * m[3][0] +
           m[0][1] * m[1][3] * m[2][2] * m[3][0] +
           m[0][2] * m[1][1] * m[2][3] * m[3][0] -
           m[0][1] * m[1][2] * m[2][3] * m[3][0] -
           m[0][3] * m[1][2] * m[2][0] * m[3][1] +
           m[0][2] * m[1][3] * m[2][0] * m[3][1] +
           m[0][3] * m[1][0] * m[2][2] * m[3][1] -
           m[0][0] * m[1][3] * m[2][2] * m[3][1] -
           m[0][2] * m[1][0] * m[2][3] * m[3][1] +
           m[0][0] * m[1][2] * m[2][3] * m[3][1] +
           m[0][3] * m[1][1] * m[2][0] * m[3][2] -
           m[0][1] * m[1][3] * m[2][0] * m[3][2] -
           m[0][3] * m[1][0] * m[2][1] * m[3][2] +
           m[0][0] * m[1][3] * m[2][1] * m[3][2] +
           m[0][1] * m[1][0] * m[2][3] * m[3][2] -
           m[0][0] * m[1][1] * m[2][3] * m[3][2] -
           m[0][2] * m[1][1] * m[2][0] * m[3][3] +
           m[0][1] * m[1][2] * m[2][0] * m[3][3] +
           m[0][2] * m[1][0] * m[2][1] * m[3][3] -
           m[0][0] * m[1][2] * m[2][1] * m[3][3] -
           m[0][1] * m[1][0] * m[2][2] * m[3][3] +
           m[0][0] * m[1][1] * m[2][2] * m[3][3];
  }

  // Standard matrix product: result = (*this) * other.
  Matrix4 operator*(const Matrix4 &other) const {
    Matrix4 result;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        result.m[row][col] =
            m[row][0] * other.m[0][col] + m[row][1] * other.m[1][col] +
            m[row][2] * other.m[2][col] + m[row][3] * other.m[3][col];
      }
    }
    return result;
  }

  // Element-wise scaling (all 16 entries, including m[3][3]).
  Matrix4 operator*(float scalar) const {
    Matrix4 result = *this;
    for (auto &row : result.m) {
      for (auto &element : row) {
        element *= scalar;
      }
    }
    return result;
  }

  // Element-wise sum.
  Matrix4 operator+(const Matrix4 &other) const {
    Matrix4 result;
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        result.m[i][j] = m[i][j] + other.m[i][j];
      }
    }
    return result;
  }

  // Element-wise difference.
  Matrix4 operator-(const Matrix4 &other) const {
    Matrix4 result;
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        result.m[i][j] = m[i][j] - other.m[i][j];
      }
    }
    return result;
  }

  // Sum of the main diagonal.
  float trace() const { return m[0][0] + m[1][1] + m[2][2] + m[3][3]; }

  // Return a fresh identity matrix (ignores this instance's contents).
  Matrix4 identity() const {
    Matrix4 identity;
    identity.set(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1);
    return identity;
  }

  // Integer matrix power by repeated multiplication (exp >= 0).
  Matrix4 power(int exp) const {
    if (exp == 0)
      return identity();
    if (exp == 1)
      return *this;

    Matrix4 result = *this;
    for (int i = 1; i < exp; ++i) {
      result = result * (*this);
    }
    return result;
  }

  // Debug dump to stdout, one row per line.
  void print() {
    // Print all entries in 4 rows with 4 columns
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        std::cout << m[i][j] << " ";
      }
      std::cout << std::endl;
    }
  }

  // Invert this matrix in place using the classic adjugate/cofactor method
  // in double precision. Returns false (leaving the matrix unmodified)
  // when |det| < 1e-6, i.e. the matrix is (near-)singular.
  bool invert() {
    double inv[16], det;
    double mArr[16];

    // Convert the matrix to a 1D array (row-major) for easier manipulation
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        mArr[i * 4 + j] = static_cast<double>(m[i][j]);
      }
    }

    // Cofactors of the transposed matrix (adjugate entries).
    inv[0] = mArr[5] * mArr[10] * mArr[15] - mArr[5] * mArr[11] * mArr[14] -
             mArr[9] * mArr[6] * mArr[15] + mArr[9] * mArr[7] * mArr[14] +
             mArr[13] * mArr[6] * mArr[11] - mArr[13] * mArr[7] * mArr[10];

    inv[4] = -mArr[4] * mArr[10] * mArr[15] + mArr[4] * mArr[11] * mArr[14] +
             mArr[8] * mArr[6] * mArr[15] - mArr[8] * mArr[7] * mArr[14] -
             mArr[12] * mArr[6] * mArr[11] + mArr[12] * mArr[7] * mArr[10];

    inv[8] = mArr[4] * mArr[9] * mArr[15] - mArr[4] * mArr[11] * mArr[13] -
             mArr[8] * mArr[5] * mArr[15] + mArr[8] * mArr[7] * mArr[13] +
             mArr[12] * mArr[5] * mArr[11] - mArr[12] * mArr[7] * mArr[9];

    inv[12] = -mArr[4] * mArr[9] * mArr[14] + mArr[4] * mArr[10] * mArr[13] +
              mArr[8] * mArr[5] * mArr[14] - mArr[8] * mArr[6] * mArr[13] -
              mArr[12] * mArr[5] * mArr[10] + mArr[12] * mArr[6] * mArr[9];

    inv[1] = -mArr[1] * mArr[10] * mArr[15] + mArr[1] * mArr[11] * mArr[14] +
             mArr[9] * mArr[2] * mArr[15] - mArr[9] * mArr[3] * mArr[14] -
             mArr[13] * mArr[2] * mArr[11] + mArr[13] * mArr[3] * mArr[10];

    inv[5] = mArr[0] * mArr[10] * mArr[15] - mArr[0] * mArr[11] * mArr[14] -
             mArr[8] * mArr[2] * mArr[15] + mArr[8] * mArr[3] * mArr[14] +
             mArr[12] * mArr[2] * mArr[11] - mArr[12] * mArr[3] * mArr[10];

    inv[9] = -mArr[0] * mArr[9] * mArr[15] + mArr[0] * mArr[11] * mArr[13] +
             mArr[8] * mArr[1] * mArr[15] - mArr[8] * mArr[3] * mArr[13] -
             mArr[12] * mArr[1] * mArr[11] + mArr[12] * mArr[3] * mArr[9];

    inv[13] = mArr[0] * mArr[9] * mArr[14] - mArr[0] * mArr[10] * mArr[13] -
              mArr[8] * mArr[1] * mArr[14] + mArr[8] * mArr[2] * mArr[13] +
              mArr[12] * mArr[1] * mArr[10] - mArr[12] * mArr[2] * mArr[9];

    inv[2] = mArr[1] * mArr[6] * mArr[15] - mArr[1] * mArr[7] * mArr[14] -
             mArr[5] * mArr[2] * mArr[15] + mArr[5] * mArr[3] * mArr[14] +
             mArr[13] * mArr[2] * mArr[7] - mArr[13] * mArr[3] * mArr[6];

    inv[6] = -mArr[0] * mArr[6] * mArr[15] + mArr[0] * mArr[7] * mArr[14] +
             mArr[4] * mArr[2] * mArr[15] - mArr[4] * mArr[3] * mArr[14] -
             mArr[12] * mArr[2] * mArr[7] + mArr[12] * mArr[3] * mArr[6];

    inv[10] = mArr[0] * mArr[5] * mArr[15] - mArr[0] * mArr[7] * mArr[13] -
              mArr[4] * mArr[1] * mArr[15] + mArr[4] * mArr[3] * mArr[13] +
              mArr[12] * mArr[1] * mArr[7] - mArr[12] * mArr[3] * mArr[5];

    inv[14] = -mArr[0] * mArr[5] * mArr[14] + mArr[0] * mArr[6] * mArr[13] +
              mArr[4] * mArr[1] * mArr[14] - mArr[4] * mArr[2] * mArr[13] -
              mArr[12] * mArr[1] * mArr[6] + mArr[12] * mArr[2] * mArr[5];

    inv[3] = -mArr[1] * mArr[6] * mArr[11] + mArr[1] * mArr[7] * mArr[10] +
             mArr[5] * mArr[2] * mArr[11] - mArr[5] * mArr[3] * mArr[10] -
             mArr[9] * mArr[2] * mArr[7] + mArr[9] * mArr[3] * mArr[6];

    inv[7] = mArr[0] * mArr[6] * mArr[11] - mArr[0] * mArr[7] * mArr[10] -
             mArr[4] * mArr[2] * mArr[11] + mArr[4] * mArr[3] * mArr[10] +
             mArr[8] * mArr[2] * mArr[7] - mArr[8] * mArr[3] * mArr[6];

    inv[11] = -mArr[0] * mArr[5] * mArr[11] + mArr[0] * mArr[7] * mArr[9] +
              mArr[4] * mArr[1] * mArr[11] - mArr[4] * mArr[3] * mArr[9] -
              mArr[8] * mArr[1] * mArr[7] + mArr[8] * mArr[3] * mArr[5];

    inv[15] = mArr[0] * mArr[5] * mArr[10] - mArr[0] * mArr[6] * mArr[9] -
              mArr[4] * mArr[1] * mArr[10] + mArr[4] * mArr[2] * mArr[9] +
              mArr[8] * mArr[1] * mArr[6] - mArr[8] * mArr[2] * mArr[5];

    // Determinant via expansion along the first row.
    det = mArr[0] * inv[0] + mArr[1] * inv[4] + mArr[2] * inv[8] +
          mArr[3] * inv[12];

    if (fabs(det) < 1e-6) {
      return false;
    }

    det = 1.0 / det;

    // inverse = adjugate / det
    for (int i = 0; i < 16; i++) {
      inv[i] *= det;
    }

    // Convert the 1D array back to the 4x4 matrix
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        m[i][j] = static_cast<float>(inv[i * 4 + j]);
      }
    }

    return true;
  }
};
473
+
474
+ inline void apply_matrix4(uv_float3 &v, const Matrix4 matrix) {
475
+ float newX = v.x * matrix.m[0][0] + v.y * matrix.m[0][1] +
476
+ v.z * matrix.m[0][2] + matrix.m[0][3];
477
+ float newY = v.x * matrix.m[1][0] + v.y * matrix.m[1][1] +
478
+ v.z * matrix.m[1][2] + matrix.m[1][3];
479
+ float newZ = v.x * matrix.m[2][0] + v.y * matrix.m[2][1] +
480
+ v.z * matrix.m[2][2] + matrix.m[2][3];
481
+ float w = v.x * matrix.m[3][0] + v.y * matrix.m[3][1] + v.z * matrix.m[3][2] +
482
+ matrix.m[3][3];
483
+
484
+ if (std::fabs(w) > EPSILON) {
485
+ newX /= w;
486
+ newY /= w;
487
+ newZ /= w;
488
+ }
489
+
490
+ v.x = newX;
491
+ v.y = newY;
492
+ v.z = newZ;
493
+ }
uv_unwrapper/uv_unwrapper/csrc/intersect.cpp ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "intersect.h"
2
+ #include "bvh.h"
3
+ #include <algorithm>
4
+ #include <cmath>
5
+ #include <iostream>
6
+ #include <stdexcept>
7
+ #include <vector>
8
+
9
+ bool triangle_aabb_intersection(const uv_float2 &aabbMin,
10
+ const uv_float2 &aabbMax, const uv_float2 &v0,
11
+ const uv_float2 &v1, const uv_float2 &v2) {
12
+ // Convert the min and max aabb defintion to left, right, top, bottom
13
+ float l = aabbMin.x;
14
+ float r = aabbMax.x;
15
+ float t = aabbMin.y;
16
+ float b = aabbMax.y;
17
+
18
+ int b0 = ((v0.x > l) ? 1 : 0) | ((v0.y > t) ? 2 : 0) | ((v0.x > r) ? 4 : 0) |
19
+ ((v0.y > b) ? 8 : 0);
20
+ if (b0 == 3)
21
+ return true;
22
+
23
+ int b1 = ((v1.x > l) ? 1 : 0) | ((v1.y > t) ? 2 : 0) | ((v1.x > r) ? 4 : 0) |
24
+ ((v1.y > b) ? 8 : 0);
25
+ if (b1 == 3)
26
+ return true;
27
+
28
+ int b2 = ((v2.x > l) ? 1 : 0) | ((v2.y > t) ? 2 : 0) | ((v2.x > r) ? 4 : 0) |
29
+ ((v2.y > b) ? 8 : 0);
30
+ if (b2 == 3)
31
+ return true;
32
+
33
+ float m, c, s;
34
+
35
+ int i0 = b0 ^ b1;
36
+ if (i0 != 0) {
37
+ if (v1.x != v0.x) {
38
+ m = (v1.y - v0.y) / (v1.x - v0.x);
39
+ c = v0.y - (m * v0.x);
40
+ if (i0 & 1) {
41
+ s = m * l + c;
42
+ if (s >= t && s <= b)
43
+ return true;
44
+ }
45
+ if (i0 & 2) {
46
+ s = (t - c) / m;
47
+ if (s >= l && s <= r)
48
+ return true;
49
+ }
50
+ if (i0 & 4) {
51
+ s = m * r + c;
52
+ if (s >= t && s <= b)
53
+ return true;
54
+ }
55
+ if (i0 & 8) {
56
+ s = (b - c) / m;
57
+ if (s >= l && s <= r)
58
+ return true;
59
+ }
60
+ } else {
61
+ if (l == v0.x || r == v0.x)
62
+ return true;
63
+ if (v0.x > l && v0.x < r)
64
+ return true;
65
+ }
66
+ }
67
+
68
+ int i1 = b1 ^ b2;
69
+ if (i1 != 0) {
70
+ if (v2.x != v1.x) {
71
+ m = (v2.y - v1.y) / (v2.x - v1.x);
72
+ c = v1.y - (m * v1.x);
73
+ if (i1 & 1) {
74
+ s = m * l + c;
75
+ if (s >= t && s <= b)
76
+ return true;
77
+ }
78
+ if (i1 & 2) {
79
+ s = (t - c) / m;
80
+ if (s >= l && s <= r)
81
+ return true;
82
+ }
83
+ if (i1 & 4) {
84
+ s = m * r + c;
85
+ if (s >= t && s <= b)
86
+ return true;
87
+ }
88
+ if (i1 & 8) {
89
+ s = (b - c) / m;
90
+ if (s >= l && s <= r)
91
+ return true;
92
+ }
93
+ } else {
94
+ if (l == v1.x || r == v1.x)
95
+ return true;
96
+ if (v1.x > l && v1.x < r)
97
+ return true;
98
+ }
99
+ }
100
+
101
+ int i2 = b0 ^ b2;
102
+ if (i2 != 0) {
103
+ if (v2.x != v0.x) {
104
+ m = (v2.y - v0.y) / (v2.x - v0.x);
105
+ c = v0.y - (m * v0.x);
106
+ if (i2 & 1) {
107
+ s = m * l + c;
108
+ if (s >= t && s <= b)
109
+ return true;
110
+ }
111
+ if (i2 & 2) {
112
+ s = (t - c) / m;
113
+ if (s >= l && s <= r)
114
+ return true;
115
+ }
116
+ if (i2 & 4) {
117
+ s = m * r + c;
118
+ if (s >= t && s <= b)
119
+ return true;
120
+ }
121
+ if (i2 & 8) {
122
+ s = (b - c) / m;
123
+ if (s >= l && s <= r)
124
+ return true;
125
+ }
126
+ } else {
127
+ if (l == v0.x || r == v0.x)
128
+ return true;
129
+ if (v0.x > l && v0.x < r)
130
+ return true;
131
+ }
132
+ }
133
+
134
+ // Bounding box check
135
+ float tbb_l = std::min(v0.x, std::min(v1.x, v2.x));
136
+ float tbb_t = std::min(v0.y, std::min(v1.y, v2.y));
137
+ float tbb_r = std::max(v0.x, std::max(v1.x, v2.x));
138
+ float tbb_b = std::max(v0.y, std::max(v1.y, v2.y));
139
+
140
+ if (tbb_l <= l && tbb_r >= r && tbb_t <= t && tbb_b >= b) {
141
+ float v0x = v2.x - v0.x;
142
+ float v0y = v2.y - v0.y;
143
+ float v1x = v1.x - v0.x;
144
+ float v1y = v1.y - v0.y;
145
+ float v2x, v2y;
146
+
147
+ float dot00, dot01, dot02, dot11, dot12, invDenom, u, v;
148
+
149
+ // Top-left corner
150
+ v2x = l - v0.x;
151
+ v2y = t - v0.y;
152
+
153
+ dot00 = v0x * v0x + v0y * v0y;
154
+ dot01 = v0x * v1x + v0y * v1y;
155
+ dot02 = v0x * v2x + v0y * v2y;
156
+ dot11 = v1x * v1x + v1y * v1y;
157
+ dot12 = v1x * v2x + v1y * v2y;
158
+
159
+ invDenom = 1.0f / (dot00 * dot11 - dot01 * dot01);
160
+ u = (dot11 * dot02 - dot01 * dot12) * invDenom;
161
+ v = (dot00 * dot12 - dot01 * dot02) * invDenom;
162
+
163
+ if (u >= 0 && v >= 0 && (u + v) <= 1)
164
+ return true;
165
+
166
+ // Bottom-left corner
167
+ v2x = l - v0.x;
168
+ v2y = b - v0.y;
169
+
170
+ dot02 = v0x * v2x + v0y * v2y;
171
+ dot12 = v1x * v2x + v1y * v2y;
172
+
173
+ u = (dot11 * dot02 - dot01 * dot12) * invDenom;
174
+ v = (dot00 * dot12 - dot01 * dot02) * invDenom;
175
+
176
+ if (u >= 0 && v >= 0 && (u + v) <= 1)
177
+ return true;
178
+
179
+ // Bottom-right corner
180
+ v2x = r - v0.x;
181
+ v2y = b - v0.y;
182
+
183
+ dot02 = v0x * v2x + v0y * v2y;
184
+ dot12 = v1x * v2x + v1y * v2y;
185
+
186
+ u = (dot11 * dot02 - dot01 * dot12) * invDenom;
187
+ v = (dot00 * dot12 - dot01 * dot02) * invDenom;
188
+
189
+ if (u >= 0 && v >= 0 && (u + v) <= 1)
190
+ return true;
191
+
192
+ // Top-right corner
193
+ v2x = r - v0.x;
194
+ v2y = t - v0.y;
195
+
196
+ dot02 = v0x * v2x + v0y * v2y;
197
+ dot12 = v1x * v2x + v1y * v2y;
198
+
199
+ u = (dot11 * dot02 - dot01 * dot12) * invDenom;
200
+ v = (dot00 * dot12 - dot01 * dot02) * invDenom;
201
+
202
+ if (u >= 0 && v >= 0 && (u + v) <= 1)
203
+ return true;
204
+ }
205
+
206
+ return false;
207
+ }
208
+
209
+ void tri_winding(uv_float2 &a, uv_float2 &b, uv_float2 &c) {
210
+ float det = (a.x * (b.y - c.y) + b.x * (c.y - a.y) + c.x * (a.y - b.y));
211
+
212
+ // If the determinant is negative, the triangle is oriented clockwise
213
+ if (det < 0) {
214
+ // Swap vertices b and c to ensure counter-clockwise winding
215
+ std::swap(b, c);
216
+ }
217
+ }
218
+
219
+ struct Triangle {
220
+ uv_float3 a, b, c;
221
+
222
+ Triangle(const uv_float2 &p1, const uv_float2 &q1, const uv_float2 &r1)
223
+ : a({p1.x, p1.y, 0}), b({q1.x, q1.y, 0}), c({r1.x, r1.y, 0}) {}
224
+
225
+ Triangle(const uv_float3 &p1, const uv_float3 &q1, const uv_float3 &r1)
226
+ : a(p1), b(q1), c(r1) {}
227
+
228
+ void getNormal(uv_float3 &normal) const {
229
+ uv_float3 u = b - a;
230
+ uv_float3 v = c - a;
231
+ normal = normalize(cross(u, v));
232
+ }
233
+ };
234
+
235
+ bool isTriDegenerated(const Triangle &tri) {
236
+ uv_float3 u = tri.a - tri.b;
237
+ uv_float3 v = tri.a - tri.c;
238
+ uv_float3 cr = cross(u, v);
239
+ return fabs(cr.x) < EPSILON && fabs(cr.y) < EPSILON && fabs(cr.z) < EPSILON;
240
+ }
241
+
242
+ int orient3D(const uv_float3 &a, const uv_float3 &b, const uv_float3 &c,
243
+ const uv_float3 &d) {
244
+ Matrix4 _matrix4;
245
+ _matrix4.set(a.x, a.y, a.z, 1, b.x, b.y, b.z, 1, c.x, c.y, c.z, 1, d.x, d.y,
246
+ d.z, 1);
247
+ float det = _matrix4.determinant();
248
+
249
+ if (det < -EPSILON)
250
+ return -1;
251
+ else if (det > EPSILON)
252
+ return 1;
253
+ else
254
+ return 0;
255
+ }
256
+
257
+ int orient2D(const uv_float2 &a, const uv_float2 &b, const uv_float2 &c) {
258
+ float det = (a.x * (b.y - c.y) + b.x * (c.y - a.y) + c.x * (a.y - b.y));
259
+
260
+ if (det < -EPSILON)
261
+ return -1;
262
+ else if (det > EPSILON)
263
+ return 1;
264
+ else
265
+ return 0;
266
+ }
267
+
268
+ int orient2D(const uv_float3 &a, const uv_float3 &b, const uv_float3 &c) {
269
+ uv_float2 a_2d = {a.x, a.y};
270
+ uv_float2 b_2d = {b.x, b.y};
271
+ uv_float2 c_2d = {c.x, c.y};
272
+ return orient2D(a_2d, b_2d, c_2d);
273
+ }
274
+
275
+ void permuteTriLeft(Triangle &tri) {
276
+ uv_float3 tmp = tri.a;
277
+ tri.a = tri.b;
278
+ tri.b = tri.c;
279
+ tri.c = tmp;
280
+ }
281
+
282
+ void permuteTriRight(Triangle &tri) {
283
+ uv_float3 tmp = tri.c;
284
+ tri.c = tri.b;
285
+ tri.b = tri.a;
286
+ tri.a = tmp;
287
+ }
288
+
289
+ void makeTriCounterClockwise(Triangle &tri) {
290
+ if (orient2D(tri.a, tri.b, tri.c) < 0) {
291
+ uv_float3 tmp = tri.c;
292
+ tri.c = tri.b;
293
+ tri.b = tmp;
294
+ }
295
+ }
296
+
297
+ void intersectPlane(const uv_float3 &a, const uv_float3 &b, const uv_float3 &p,
298
+ const uv_float3 &n, uv_float3 &target) {
299
+ uv_float3 u = b - a;
300
+ uv_float3 v = a - p;
301
+ float dot1 = dot(n, u);
302
+ float dot2 = dot(n, v);
303
+ u = u * (-dot2 / dot1);
304
+ target = a + u;
305
+ }
306
+
307
+ void computeLineIntersection(const Triangle &t1, const Triangle &t2,
308
+ std::vector<uv_float3> &target) {
309
+ uv_float3 n1, n2;
310
+ t1.getNormal(n1);
311
+ t2.getNormal(n2);
312
+
313
+ int o1 = orient3D(t1.a, t1.c, t2.b, t2.a);
314
+ int o2 = orient3D(t1.a, t1.b, t2.c, t2.a);
315
+
316
+ uv_float3 i1, i2;
317
+
318
+ if (o1 > 0) {
319
+ if (o2 > 0) {
320
+ intersectPlane(t1.a, t1.c, t2.a, n2, i1);
321
+ intersectPlane(t2.a, t2.c, t1.a, n1, i2);
322
+ } else {
323
+ intersectPlane(t1.a, t1.c, t2.a, n2, i1);
324
+ intersectPlane(t1.a, t1.b, t2.a, n2, i2);
325
+ }
326
+ } else {
327
+ if (o2 > 0) {
328
+ intersectPlane(t2.a, t2.b, t1.a, n1, i1);
329
+ intersectPlane(t2.a, t2.c, t1.a, n1, i2);
330
+ } else {
331
+ intersectPlane(t2.a, t2.b, t1.a, n1, i1);
332
+ intersectPlane(t1.a, t1.b, t2.a, n2, i2);
333
+ }
334
+ }
335
+
336
+ target.push_back(i1);
337
+ if (distance_to(i1, i2) >= EPSILON) {
338
+ target.push_back(i2);
339
+ }
340
+ }
341
+
342
+ void makeTriAVertexAlone(Triangle &tri, int oa, int ob, int oc) {
343
+ // Permute a, b, c so that a is alone on its side
344
+ if (oa == ob) {
345
+ // c is alone, permute right so c becomes a
346
+ permuteTriRight(tri);
347
+ } else if (oa == oc) {
348
+ // b is alone, permute so b becomes a
349
+ permuteTriLeft(tri);
350
+ } else if (ob != oc) {
351
+ // In case a, b, c have different orientation, put a on positive side
352
+ if (ob > 0) {
353
+ permuteTriLeft(tri);
354
+ } else if (oc > 0) {
355
+ permuteTriRight(tri);
356
+ }
357
+ }
358
+ }
359
+
360
+ void makeTriAVertexPositive(Triangle &tri, const Triangle &other) {
361
+ int o = orient3D(other.a, other.b, other.c, tri.a);
362
+ if (o < 0) {
363
+ std::swap(tri.b, tri.c);
364
+ }
365
+ }
366
+
367
+ bool crossIntersect(Triangle &t1, Triangle &t2, int o1a, int o1b, int o1c,
368
+ std::vector<uv_float3> *target = nullptr) {
369
+ int o2a = orient3D(t1.a, t1.b, t1.c, t2.a);
370
+ int o2b = orient3D(t1.a, t1.b, t1.c, t2.b);
371
+ int o2c = orient3D(t1.a, t1.b, t1.c, t2.c);
372
+
373
+ if (o2a == o2b && o2a == o2c) {
374
+ return false;
375
+ }
376
+
377
+ // Make a vertex alone on its side for both triangles
378
+ makeTriAVertexAlone(t1, o1a, o1b, o1c);
379
+ makeTriAVertexAlone(t2, o2a, o2b, o2c);
380
+
381
+ // Ensure the vertex on the positive side
382
+ makeTriAVertexPositive(t2, t1);
383
+ makeTriAVertexPositive(t1, t2);
384
+
385
+ int o1 = orient3D(t1.a, t1.b, t2.a, t2.b);
386
+ int o2 = orient3D(t1.a, t1.c, t2.c, t2.a);
387
+
388
+ if (o1 <= 0 && o2 <= 0) {
389
+ if (target) {
390
+ computeLineIntersection(t1, t2, *target);
391
+ }
392
+ return true;
393
+ }
394
+
395
+ return false;
396
+ }
397
+
398
+ void linesIntersect2d(const uv_float3 &a1, const uv_float3 &b1,
399
+ const uv_float3 &a2, const uv_float3 &b2,
400
+ uv_float3 &target) {
401
+ float dx1 = a1.x - b1.x;
402
+ float dx2 = a2.x - b2.x;
403
+ float dy1 = a1.y - b1.y;
404
+ float dy2 = a2.y - b2.y;
405
+
406
+ float D = dx1 * dy2 - dx2 * dy1;
407
+
408
+ float n1 = a1.x * b1.y - a1.y * b1.x;
409
+ float n2 = a2.x * b2.y - a2.y * b2.x;
410
+
411
+ target.x = (n1 * dx2 - n2 * dx1) / D;
412
+ target.y = (n1 * dy2 - n2 * dy1) / D;
413
+ target.z = 0;
414
+ }
415
+
416
+ void clipTriangle(const Triangle &t1, const Triangle &t2,
417
+ std::vector<uv_float3> &target) {
418
+ std::vector<uv_float3> clip = {t1.a, t1.b, t1.c};
419
+ std::vector<uv_float3> output = {t2.a, t2.b, t2.c};
420
+ std::vector<int> orients(output.size() * 3, 0);
421
+ uv_float3 inter;
422
+
423
+ for (int i = 0; i < 3; ++i) {
424
+ const int i_prev = (i + 2) % 3;
425
+ std::vector<uv_float3> input;
426
+ std::copy(output.begin(), output.end(), std::back_inserter(input));
427
+ output.clear();
428
+
429
+ for (size_t j = 0; j < input.size(); ++j) {
430
+ orients[j] = orient2D(clip[i_prev], clip[i], input[j]);
431
+ }
432
+
433
+ for (size_t j = 0; j < input.size(); ++j) {
434
+ const int j_prev = (j - 1 + input.size()) % input.size();
435
+
436
+ if (orients[j] >= 0) {
437
+ if (orients[j_prev] < 0) {
438
+ linesIntersect2d(clip[i_prev], clip[i], input[j_prev], input[j],
439
+ inter);
440
+ output.push_back({inter.x, inter.y, inter.z});
441
+ }
442
+ output.push_back({input[j].x, input[j].y, input[j].z});
443
+ } else if (orients[j_prev] >= 0) {
444
+ linesIntersect2d(clip[i_prev], clip[i], input[j_prev], input[j], inter);
445
+ output.push_back({inter.x, inter.y, inter.z});
446
+ }
447
+ }
448
+ }
449
+
450
+ // Clear duplicated points
451
+ for (const auto &point : output) {
452
+ int j = 0;
453
+ bool sameFound = false;
454
+ while (!sameFound && j < target.size()) {
455
+ sameFound = distance_to(point, target[j]) <= 1e-6;
456
+ j++;
457
+ }
458
+
459
+ if (!sameFound) {
460
+ target.push_back(point);
461
+ }
462
+ }
463
+ }
464
+
465
+ bool intersectionTypeR1(const Triangle &t1, const Triangle &t2) {
466
+ const uv_float3 &p1 = t1.a;
467
+ const uv_float3 &q1 = t1.b;
468
+ const uv_float3 &r1 = t1.c;
469
+ const uv_float3 &p2 = t2.a;
470
+ const uv_float3 &r2 = t2.c;
471
+
472
+ if (orient2D(r2, p2, q1) >= 0) { // I
473
+ if (orient2D(r2, p1, q1) >= 0) { // II.a
474
+ if (orient2D(p1, p2, q1) >= 0) { // III.a
475
+ return true;
476
+ } else {
477
+ if (orient2D(p1, p2, r1) >= 0) { // IV.a
478
+ if (orient2D(q1, r1, p2) >= 0) { // V
479
+ return true;
480
+ }
481
+ }
482
+ }
483
+ }
484
+ } else {
485
+ if (orient2D(r2, p2, r1) >= 0) { // II.b
486
+ if (orient2D(q1, r1, r2) >= 0) { // III.b
487
+ if (orient2D(p1, p2, r1) >= 0) { // IV.b (diverges from paper)
488
+ return true;
489
+ }
490
+ }
491
+ }
492
+ }
493
+
494
+ return false;
495
+ }
496
+
497
+ bool intersectionTypeR2(const Triangle &t1, const Triangle &t2) {
498
+ const uv_float3 &p1 = t1.a;
499
+ const uv_float3 &q1 = t1.b;
500
+ const uv_float3 &r1 = t1.c;
501
+ const uv_float3 &p2 = t2.a;
502
+ const uv_float3 &q2 = t2.b;
503
+ const uv_float3 &r2 = t2.c;
504
+
505
+ if (orient2D(r2, p2, q1) >= 0) { // I
506
+ if (orient2D(q2, r2, q1) >= 0) { // II.a
507
+ if (orient2D(p1, p2, q1) >= 0) { // III.a
508
+ if (orient2D(p1, q2, q1) <= 0) { // IV.a
509
+ return true;
510
+ }
511
+ } else {
512
+ if (orient2D(p1, p2, r1) >= 0) { // IV.b
513
+ if (orient2D(r2, p2, r1) <= 0) { // V.a
514
+ return true;
515
+ }
516
+ }
517
+ }
518
+ } else {
519
+ if (orient2D(p1, q2, q1) <= 0) { // III.b
520
+ if (orient2D(q2, r2, r1) >= 0) { // IV.c
521
+ if (orient2D(q1, r1, q2) >= 0) { // V.b
522
+ return true;
523
+ }
524
+ }
525
+ }
526
+ }
527
+ } else {
528
+ if (orient2D(r2, p2, r1) >= 0) { // II.b
529
+ if (orient2D(q1, r1, r2) >= 0) { // III.c
530
+ if (orient2D(r1, p1, p2) >= 0) { // IV.d
531
+ return true;
532
+ }
533
+ } else {
534
+ if (orient2D(q1, r1, q2) >= 0) { // IV.e
535
+ if (orient2D(q2, r2, r1) >= 0) { // V.c
536
+ return true;
537
+ }
538
+ }
539
+ }
540
+ }
541
+ }
542
+
543
+ return false;
544
+ }
545
+
546
+ bool coplanarIntersect(Triangle &t1, Triangle &t2,
547
+ std::vector<uv_float3> *target = nullptr) {
548
+ uv_float3 normal, u, v;
549
+ t1.getNormal(normal);
550
+ normal = normalize(normal);
551
+ u = normalize(t1.a - t1.b);
552
+ v = cross(normal, u);
553
+
554
+ // Move basis to t1.a
555
+ u = u + t1.a;
556
+ v = v + t1.a;
557
+ normal = normal + t1.a;
558
+
559
+ Matrix4 _matrix;
560
+ _matrix.set(t1.a.x, u.x, v.x, normal.x, t1.a.y, u.y, v.y, normal.y, t1.a.z,
561
+ u.z, v.z, normal.z, 1, 1, 1, 1);
562
+
563
+ Matrix4 _affineMatrix;
564
+ _affineMatrix.set(0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1);
565
+
566
+ _matrix.invert(); // Invert the _matrix
567
+ _matrix = _affineMatrix * _matrix;
568
+
569
+ // Apply transformation
570
+ apply_matrix4(t1.a, _matrix);
571
+ apply_matrix4(t1.b, _matrix);
572
+ apply_matrix4(t1.c, _matrix);
573
+ apply_matrix4(t2.a, _matrix);
574
+ apply_matrix4(t2.b, _matrix);
575
+ apply_matrix4(t2.c, _matrix);
576
+
577
+ makeTriCounterClockwise(t1);
578
+ makeTriCounterClockwise(t2);
579
+
580
+ const uv_float3 &p1 = t1.a;
581
+ const uv_float3 &p2 = t2.a;
582
+ const uv_float3 &q2 = t2.b;
583
+ const uv_float3 &r2 = t2.c;
584
+
585
+ int o_p2q2 = orient2D(p2, q2, p1);
586
+ int o_q2r2 = orient2D(q2, r2, p1);
587
+ int o_r2p2 = orient2D(r2, p2, p1);
588
+
589
+ bool intersecting = false;
590
+ if (o_p2q2 >= 0) {
591
+ if (o_q2r2 >= 0) {
592
+ if (o_r2p2 >= 0) {
593
+ // + + +
594
+ intersecting = true;
595
+ } else {
596
+ // + + -
597
+ intersecting = intersectionTypeR1(t1, t2);
598
+ }
599
+ } else {
600
+ if (o_r2p2 >= 0) {
601
+ // + - +
602
+ permuteTriRight(t2);
603
+ intersecting = intersectionTypeR1(t1, t2);
604
+ } else {
605
+ // + - -
606
+ intersecting = intersectionTypeR2(t1, t2);
607
+ }
608
+ }
609
+ } else {
610
+ if (o_q2r2 >= 0) {
611
+ if (o_r2p2 >= 0) {
612
+ // - + +
613
+ permuteTriLeft(t2);
614
+ intersecting = intersectionTypeR1(t1, t2);
615
+ } else {
616
+ // - + -
617
+ permuteTriLeft(t2);
618
+ intersecting = intersectionTypeR2(t1, t2);
619
+ }
620
+ } else {
621
+ if (o_r2p2 >= 0) {
622
+ // - - +
623
+ permuteTriRight(t2);
624
+ intersecting = intersectionTypeR2(t1, t2);
625
+ } else {
626
+ // - - -
627
+ std::cerr << "Triangles should not be flat." << std::endl;
628
+ return false;
629
+ }
630
+ }
631
+ }
632
+
633
+ if (intersecting && target) {
634
+ clipTriangle(t1, t2, *target);
635
+
636
+ _matrix.invert();
637
+ // Apply the transform to each target point
638
+ for (int i = 0; i < target->size(); ++i) {
639
+ apply_matrix4(target->at(i), _matrix);
640
+ }
641
+ }
642
+
643
+ return intersecting;
644
+ }
645
+
646
+ // Helper function to calculate the area of a polygon
647
+ float polygon_area(const std::vector<uv_float3> &polygon) {
648
+ if (polygon.size() < 3)
649
+ return 0.0f; // Not a polygon
650
+
651
+ uv_float3 normal = {0.0f, 0.0f, 0.0f}; // Initialize normal vector
652
+
653
+ // Calculate the cross product of edges around the polygon
654
+ for (size_t i = 0; i < polygon.size(); ++i) {
655
+ uv_float3 p1 = polygon[i];
656
+ uv_float3 p2 = polygon[(i + 1) % polygon.size()];
657
+
658
+ normal = normal + cross(p1, p2); // Accumulate the normal vector
659
+ }
660
+
661
+ float area =
662
+ magnitude(normal) / 2.0f; // Area is half the magnitude of the normal
663
+ return area;
664
+ }
665
+
666
+ bool triangle_triangle_intersection(uv_float2 p1, uv_float2 q1, uv_float2 r1,
667
+ uv_float2 p2, uv_float2 q2, uv_float2 r2) {
668
+ Triangle t1(p1, q1, r1);
669
+ Triangle t2(p2, q2, r2);
670
+
671
+ if (isTriDegenerated(t1) || isTriDegenerated(t2)) {
672
+ // std::cerr << "Degenerated triangles provided, skipping." << std::endl;
673
+ return false;
674
+ }
675
+
676
+ int o1a = orient3D(t2.a, t2.b, t2.c, t1.a);
677
+ int o1b = orient3D(t2.a, t2.b, t2.c, t1.b);
678
+ int o1c = orient3D(t2.a, t2.b, t2.c, t1.c);
679
+
680
+ std::vector<uv_float3> intersections;
681
+ bool intersects;
682
+
683
+ if (o1a == o1b && o1a == o1c) // [[likely]]
684
+ {
685
+ intersects = o1a == 0 && coplanarIntersect(t1, t2, &intersections);
686
+ } else // [[unlikely]]
687
+ {
688
+ intersects = crossIntersect(t1, t2, o1a, o1b, o1c, &intersections);
689
+ }
690
+
691
+ if (intersects) {
692
+ float area = polygon_area(intersections);
693
+
694
+ // std::cout << "Intersection area: " << area << std::endl;
695
+ if (area < 1e-10f || std::isfinite(area) == false) {
696
+ // std::cout<<"Invalid area: " << area << std::endl;
697
+ return false; // Ignore intersection if the area is too small
698
+ }
699
+ }
700
+
701
+ return intersects;
702
+ }
uv_unwrapper/uv_unwrapper/csrc/intersect.h ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "common.h"
4
+ #include <vector>
5
+
6
+ bool triangle_aabb_intersection(const uv_float2 &aabb_min,
7
+ const uv_float2 &aabb_max, const uv_float2 &v0,
8
+ const uv_float2 &v1, const uv_float2 &v2);
9
+ bool triangle_triangle_intersection(uv_float2 p1, uv_float2 q1, uv_float2 r1,
10
+ uv_float2 p2, uv_float2 q2, uv_float2 r2);
uv_unwrapper/uv_unwrapper/csrc/unwrapper.cpp ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "bvh.h"
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/Context.h>
4
+ #include <chrono>
5
+ #include <cmath>
6
+ #include <cstring>
7
+ #include <omp.h>
8
+ #include <set>
9
+ #include <torch/extension.h>
10
+ #include <vector>
11
+
12
+ // #define TIMING
13
+
14
+ #if defined(_MSC_VER)
15
+ #include <BaseTsd.h>
16
+ typedef SSIZE_T ssize_t;
17
+ #endif
18
+
19
+ namespace UVUnwrapper {
20
+ void create_bvhs(BVH *bvhs, Triangle *triangles,
21
+ std::vector<std::set<int>> &triangle_per_face, int num_faces,
22
+ int start, int end) {
23
+ #pragma omp parallel for
24
+ for (int i = start; i < end; i++) {
25
+ int num_triangles = triangle_per_face[i].size();
26
+ Triangle *triangles_per_face = new Triangle[num_triangles];
27
+ int *indices = new int[num_triangles];
28
+ int j = 0;
29
+ for (int idx : triangle_per_face[i]) {
30
+ triangles_per_face[j] = triangles[idx];
31
+ indices[j++] = idx;
32
+ }
33
+ // Each thread writes to it's own memory space
34
+ // First check if the number of triangles is 0
35
+ if (num_triangles == 0) {
36
+ bvhs[i - start] = std::move(BVH()); // Default constructor
37
+ } else {
38
+ bvhs[i - start] = std::move(
39
+ BVH(triangles_per_face, indices,
40
+ num_triangles)); // BVH now handles memory of triangles_per_face
41
+ }
42
+ delete[] triangles_per_face;
43
+ }
44
+ }
45
+
46
+ void perform_intersection_check(BVH *bvhs, int num_bvhs, Triangle *triangles,
47
+ uv_float3 *vertex_tri_centroids,
48
+ int64_t *assign_indices_ptr,
49
+ ssize_t num_indices, int offset,
50
+ std::vector<std::set<int>> &triangle_per_face) {
51
+ std::vector<std::pair<int, int>>
52
+ unique_intersections; // Store unique intersections as pairs of triangle
53
+ // indices
54
+
55
+ // Step 1: Detect intersections in parallel
56
+ #pragma omp parallel for
57
+ for (int i = 0; i < num_indices; i++) {
58
+ if (assign_indices_ptr[i] < offset) {
59
+ continue;
60
+ }
61
+
62
+ Triangle cur_tri = triangles[i];
63
+ auto &cur_bvh = bvhs[assign_indices_ptr[i] - offset];
64
+
65
+ if (cur_bvh.bvhNode == nullptr) {
66
+ continue;
67
+ }
68
+
69
+ std::vector<int> intersections = cur_bvh.Intersect(cur_tri);
70
+
71
+ if (!intersections.empty()) {
72
+
73
+ #pragma omp critical
74
+ {
75
+ for (int intersect : intersections) {
76
+ if (i != intersect) {
77
+ // Ensure we only store unique pairs (A, B) where A < B to avoid
78
+ // duplication
79
+ if (i < intersect) {
80
+ unique_intersections.push_back(std::make_pair(i, intersect));
81
+ } else {
82
+ unique_intersections.push_back(std::make_pair(intersect, i));
83
+ }
84
+ }
85
+ }
86
+ }
87
+ }
88
+ }
89
+
90
+ // Step 2: Process unique intersections
91
+ for (int idx = 0; idx < unique_intersections.size(); idx++) {
92
+ int first = unique_intersections[idx].first;
93
+ int second = unique_intersections[idx].second;
94
+
95
+ int i_idx = assign_indices_ptr[first];
96
+
97
+ int norm_idx = i_idx % 6;
98
+ int axis = (norm_idx < 2) ? 0 : (norm_idx < 4) ? 1 : 2;
99
+ bool use_max = (i_idx % 2) == 1;
100
+
101
+ float pos_a = vertex_tri_centroids[first][axis];
102
+ float pos_b = vertex_tri_centroids[second][axis];
103
+ // Sort the intersections based on vertex_tri_centroids along the specified
104
+ // axis
105
+ if (use_max) {
106
+ if (pos_a < pos_b) {
107
+ std::swap(first, second);
108
+ }
109
+ } else {
110
+ if (pos_a > pos_b) {
111
+ std::swap(first, second);
112
+ }
113
+ }
114
+
115
+ // Update the unique intersections
116
+ unique_intersections[idx].first = first;
117
+ unique_intersections[idx].second = second;
118
+ }
119
+
120
+ // Now only get the second intersections from the pair and put them in a set
121
+ // The second intersection should always be the occluded triangle
122
+ std::set<int> second_intersections;
123
+ for (int idx = 0; idx < (int)unique_intersections.size(); idx++) {
124
+ int second = unique_intersections[idx].second;
125
+ second_intersections.insert(second);
126
+ }
127
+
128
+ for (int int_idx : second_intersections) {
129
+ // Move the second (occluded) triangle by 6
130
+ int intersect_idx = assign_indices_ptr[int_idx];
131
+ int new_index = intersect_idx + 6;
132
+ new_index = std::clamp(new_index, 0, 12);
133
+
134
+ assign_indices_ptr[int_idx] = new_index;
135
+ triangle_per_face[intersect_idx].erase(int_idx);
136
+ triangle_per_face[new_index].insert(int_idx);
137
+ }
138
+ }
139
+
140
+ torch::Tensor assign_faces_uv_to_atlas_index(torch::Tensor vertices,
141
+ torch::Tensor indices,
142
+ torch::Tensor face_uv,
143
+ torch::Tensor face_index) {
144
+ // Get the number of faces
145
+ int num_faces = indices.size(0);
146
+ torch::Tensor assign_indices =
147
+ torch::empty(
148
+ {
149
+ num_faces,
150
+ },
151
+ torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU))
152
+ .contiguous();
153
+
154
+ auto vert_accessor = vertices.accessor<float, 2>();
155
+ auto indices_accessor = indices.accessor<int64_t, 2>();
156
+ auto face_uv_accessor = face_uv.accessor<float, 2>();
157
+
158
+ const int64_t *face_index_ptr = face_index.contiguous().data_ptr<int64_t>();
159
+ int64_t *assign_indices_ptr = assign_indices.data_ptr<int64_t>();
160
+ // copy face_index to assign_indices
161
+ memcpy(assign_indices_ptr, face_index_ptr, num_faces * sizeof(int64_t));
162
+
163
+ #ifdef TIMING
164
+ auto start = std::chrono::high_resolution_clock::now();
165
+ #endif
166
+ uv_float3 *vertex_tri_centroids = new uv_float3[num_faces];
167
+ Triangle *triangles = new Triangle[num_faces];
168
+
169
+ // Use std::set to store triangles for each face
170
+ std::vector<std::set<int>> triangle_per_face;
171
+ triangle_per_face.resize(13);
172
+
173
+ #pragma omp parallel for
174
+ for (int i = 0; i < num_faces; i++) {
175
+ int face_idx = i * 3;
176
+ triangles[i].v0 = {face_uv_accessor[face_idx + 0][0],
177
+ face_uv_accessor[face_idx + 0][1]};
178
+ triangles[i].v1 = {face_uv_accessor[face_idx + 1][0],
179
+ face_uv_accessor[face_idx + 1][1]};
180
+ triangles[i].v2 = {face_uv_accessor[face_idx + 2][0],
181
+ face_uv_accessor[face_idx + 2][1]};
182
+ triangles[i].centroid =
183
+ triangle_centroid(triangles[i].v0, triangles[i].v1, triangles[i].v2);
184
+
185
+ uv_float3 v0 = {vert_accessor[indices_accessor[i][0]][0],
186
+ vert_accessor[indices_accessor[i][0]][1],
187
+ vert_accessor[indices_accessor[i][0]][2]};
188
+ uv_float3 v1 = {vert_accessor[indices_accessor[i][1]][0],
189
+ vert_accessor[indices_accessor[i][1]][1],
190
+ vert_accessor[indices_accessor[i][1]][2]};
191
+ uv_float3 v2 = {vert_accessor[indices_accessor[i][2]][0],
192
+ vert_accessor[indices_accessor[i][2]][1],
193
+ vert_accessor[indices_accessor[i][2]][2]};
194
+ vertex_tri_centroids[i] = triangle_centroid(v0, v1, v2);
195
+
196
+ // Assign the triangle to the face index
197
+ #pragma omp critical
198
+ { triangle_per_face[face_index_ptr[i]].insert(i); }
199
+ }
200
+
201
+ #ifdef TIMING
202
+ auto start_bvh = std::chrono::high_resolution_clock::now();
203
+ #endif
204
+
205
+ BVH *bvhs = new BVH[6];
206
+ create_bvhs(bvhs, triangles, triangle_per_face, num_faces, 0, 6);
207
+
208
+ #ifdef TIMING
209
+ auto end_bvh = std::chrono::high_resolution_clock::now();
210
+ std::chrono::duration<double> elapsed_seconds = end_bvh - start_bvh;
211
+ std::cout << "BVH build time: " << elapsed_seconds.count() << "s\n";
212
+
213
+ auto start_intersection_1 = std::chrono::high_resolution_clock::now();
214
+ #endif
215
+
216
+ perform_intersection_check(bvhs, 6, triangles, vertex_tri_centroids,
217
+ assign_indices_ptr, num_faces, 0,
218
+ triangle_per_face);
219
+
220
+ #ifdef TIMING
221
+ auto end_intersection_1 = std::chrono::high_resolution_clock::now();
222
+ elapsed_seconds = end_intersection_1 - start_intersection_1;
223
+ std::cout << "Intersection 1 time: " << elapsed_seconds.count() << "s\n";
224
+ #endif
225
+ // Create 6 new bvhs and delete the old ones
226
+ BVH *new_bvhs = new BVH[6];
227
+ create_bvhs(new_bvhs, triangles, triangle_per_face, num_faces, 6, 12);
228
+
229
+ #ifdef TIMING
230
+ auto end_bvh2 = std::chrono::high_resolution_clock::now();
231
+ elapsed_seconds = end_bvh2 - end_intersection_1;
232
+ std::cout << "BVH 2 build time: " << elapsed_seconds.count() << "s\n";
233
+ auto start_intersection_2 = std::chrono::high_resolution_clock::now();
234
+ #endif
235
+
236
+ perform_intersection_check(new_bvhs, 6, triangles, vertex_tri_centroids,
237
+ assign_indices_ptr, num_faces, 6,
238
+ triangle_per_face);
239
+
240
+ #ifdef TIMING
241
+ auto end_intersection_2 = std::chrono::high_resolution_clock::now();
242
+ elapsed_seconds = end_intersection_2 - start_intersection_2;
243
+ std::cout << "Intersection 2 time: " << elapsed_seconds.count() << "s\n";
244
+ elapsed_seconds = end_intersection_2 - start;
245
+ std::cout << "Total time: " << elapsed_seconds.count() << "s\n";
246
+ #endif
247
+
248
+ // Cleanup
249
+ delete[] vertex_tri_centroids;
250
+ delete[] triangles;
251
+ delete[] bvhs;
252
+ delete[] new_bvhs;
253
+
254
+ return assign_indices;
255
+ }
256
+
257
+ // Registers _C as a Python extension module.
258
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {}
259
+
260
+ // Defines the operators
261
+ TORCH_LIBRARY(UVUnwrapper, m) {
262
+ m.def("assign_faces_uv_to_atlas_index(Tensor vertices, Tensor indices, "
263
+ "Tensor face_uv, Tensor face_index) -> Tensor");
264
+ }
265
+
266
+ // Registers CPP implementations
267
+ TORCH_LIBRARY_IMPL(UVUnwrapper, CPU, m) {
268
+ m.impl("assign_faces_uv_to_atlas_index", &assign_faces_uv_to_atlas_index);
269
+ }
270
+
271
+ } // namespace UVUnwrapper
uv_unwrapper/uv_unwrapper/unwrap.py ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+
9
+
10
+ class Unwrapper(nn.Module):
11
+ def __init__(self):
12
+ super().__init__()
13
+
14
+ def _box_assign_vertex_to_cube_face(
15
+ self,
16
+ vertex_positions: Tensor,
17
+ vertex_normals: Tensor,
18
+ triangle_idxs: Tensor,
19
+ bbox: Tensor,
20
+ ) -> Tuple[Tensor, Tensor]:
21
+ """
22
+ Assigns each vertex to a cube face based on the face normal
23
+
24
+ Args:
25
+ vertex_positions (Tensor, Nv 3, float): Vertex positions
26
+ vertex_normals (Tensor, Nv 3, float): Vertex normals
27
+ triangle_idxs (Tensor, Nf 3, int): Triangle indices
28
+ bbox (Tensor, 2 3, float): Bounding box of the mesh
29
+
30
+ Returns:
31
+ Tensor, Nf 3 2, float: UV coordinates
32
+ Tensor, Nf, int: Cube face indices
33
+ """
34
+
35
+ # Test to not have a scaled model to fit the space better
36
+ # bbox_min = bbox[:1].mean(-1, keepdim=True)
37
+ # bbox_max = bbox[1:].mean(-1, keepdim=True)
38
+ # v_pos_normalized = (vertex_positions - bbox_min) / (bbox_max - bbox_min)
39
+
40
+ # Create a [0, 1] normalized vertex position
41
+ v_pos_normalized = (vertex_positions - bbox[:1]) / (bbox[1:] - bbox[:1])
42
+ # And to [-1, 1]
43
+ v_pos_normalized = 2.0 * v_pos_normalized - 1.0
44
+
45
+ # Get all vertex positions for each triangle
46
+ # Now how do we define to which face the triangle belongs? Mean face pos? Max vertex pos?
47
+ v0 = v_pos_normalized[triangle_idxs[:, 0]]
48
+ v1 = v_pos_normalized[triangle_idxs[:, 1]]
49
+ v2 = v_pos_normalized[triangle_idxs[:, 2]]
50
+ tri_stack = torch.stack([v0, v1, v2], dim=1)
51
+
52
+ vn0 = vertex_normals[triangle_idxs[:, 0]]
53
+ vn1 = vertex_normals[triangle_idxs[:, 1]]
54
+ vn2 = vertex_normals[triangle_idxs[:, 2]]
55
+ tri_stack_nrm = torch.stack([vn0, vn1, vn2], dim=1)
56
+
57
+ # Just average the normals per face
58
+ face_normal = F.normalize(torch.sum(tri_stack_nrm, 1), eps=1e-6, dim=-1)
59
+
60
+ # Now decide based on the face normal in which box map we project
61
+ # abs_x, abs_y, abs_z = tri_stack_nrm.abs().unbind(-1)
62
+ abs_x, abs_y, abs_z = tri_stack.abs().unbind(-1)
63
+
64
+ axis = torch.tensor(
65
+ [
66
+ [1, 0, 0], # 0
67
+ [-1, 0, 0], # 1
68
+ [0, 1, 0], # 2
69
+ [0, -1, 0], # 3
70
+ [0, 0, 1], # 4
71
+ [0, 0, -1], # 5
72
+ ],
73
+ device=face_normal.device,
74
+ dtype=face_normal.dtype,
75
+ )
76
+ face_normal_axis = (face_normal[:, None] * axis[None]).sum(-1)
77
+ index = face_normal_axis.argmax(-1)
78
+
79
+ max_axis, uc, vc = (
80
+ torch.ones_like(abs_x),
81
+ torch.zeros_like(tri_stack[..., :1]),
82
+ torch.zeros_like(tri_stack[..., :1]),
83
+ )
84
+ mask_pos_x = index == 0
85
+ max_axis[mask_pos_x] = abs_x[mask_pos_x]
86
+ uc[mask_pos_x] = tri_stack[mask_pos_x][..., 1:2]
87
+ vc[mask_pos_x] = -tri_stack[mask_pos_x][..., -1:]
88
+
89
+ mask_neg_x = index == 1
90
+ max_axis[mask_neg_x] = abs_x[mask_neg_x]
91
+ uc[mask_neg_x] = tri_stack[mask_neg_x][..., 1:2]
92
+ vc[mask_neg_x] = -tri_stack[mask_neg_x][..., -1:]
93
+
94
+ mask_pos_y = index == 2
95
+ max_axis[mask_pos_y] = abs_y[mask_pos_y]
96
+ uc[mask_pos_y] = tri_stack[mask_pos_y][..., 0:1]
97
+ vc[mask_pos_y] = -tri_stack[mask_pos_y][..., -1:]
98
+
99
+ mask_neg_y = index == 3
100
+ max_axis[mask_neg_y] = abs_y[mask_neg_y]
101
+ uc[mask_neg_y] = tri_stack[mask_neg_y][..., 0:1]
102
+ vc[mask_neg_y] = -tri_stack[mask_neg_y][..., -1:]
103
+
104
+ mask_pos_z = index == 4
105
+ max_axis[mask_pos_z] = abs_z[mask_pos_z]
106
+ uc[mask_pos_z] = tri_stack[mask_pos_z][..., 0:1]
107
+ vc[mask_pos_z] = tri_stack[mask_pos_z][..., 1:2]
108
+
109
+ mask_neg_z = index == 5
110
+ max_axis[mask_neg_z] = abs_z[mask_neg_z]
111
+ uc[mask_neg_z] = tri_stack[mask_neg_z][..., 0:1]
112
+ vc[mask_neg_z] = -tri_stack[mask_neg_z][..., 1:2]
113
+
114
+ # UC from [-1, 1] to [0, 1]
115
+ max_dim_div = max_axis.max(dim=0, keepdim=True).values
116
+ uc = ((uc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
117
+ vc = ((vc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
118
+
119
+ uv = torch.stack([uc, vc], dim=-1)
120
+
121
+ return uv, index
122
+
123
def _assign_faces_uv_to_atlas_index(
    self,
    vertex_positions: Tensor,
    triangle_idxs: Tensor,
    face_uv: Tensor,
    face_index: Tensor,
) -> Tensor:  # noqa: F821
    """
    Assign each face's UVs to a final atlas chart index.

    Delegates to the custom ``UVUnwrapper`` op, which only runs on the CPU.

    Args:
        vertex_positions (Float[Tensor, "Nv 3"]): Vertex positions
        triangle_idxs (Integer[Tensor, "Nf 3"]): Triangle indices
        face_uv (Float[Tensor, "Nf 3 2"]): Face UV coordinates
        face_index (Integer[Tensor, "Nf"]): Cube-face index per face

    Returns:
        Integer[Tensor, "Nf"]: Atlas index per face
    """
    # The custom op is CPU-only: move all inputs over, then bring the
    # result back to whatever device the mesh lives on.
    target_device = vertex_positions.device
    atlas_index = torch.ops.UVUnwrapper.assign_faces_uv_to_atlas_index(
        vertex_positions.cpu(),
        triangle_idxs.cpu(),
        face_uv.view(-1, 2).cpu(),  # flatten to one UV row per corner
        face_index.cpu(),
    )
    return atlas_index.to(target_device)
148
+
149
+ def _find_slice_offset_and_scale(
150
+ self, index: Tensor
151
+ ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: # noqa: F821
152
+ """
153
+ Find the slice offset and scale
154
+
155
+ Args:
156
+ index (Integer[Tensor, "Nf"]): Atlas index
157
+
158
+ Returns:
159
+ Float[Tensor, "Nf"]: Offset x
160
+ Float[Tensor, "Nf"]: Offset y
161
+ Float[Tensor, "Nf"]: Division x
162
+ Float[Tensor, "Nf"]: Division y
163
+ """
164
+
165
+ # 6 due to the 6 cube faces
166
+ off = 1 / 3
167
+ dupl_off = 1 / 6
168
+
169
+ # Here, we need to decide how to pack the textures in the case of overlap
170
+ def x_offset_calc(x, i):
171
+ offset_calc = i // 6
172
+ # Initial coordinates - just 3x2 grid
173
+ if offset_calc == 0:
174
+ return off * x
175
+ else:
176
+ # Smaller 3x2 grid plus eventual shift to right for
177
+ # second overlap
178
+ return dupl_off * x + min(offset_calc - 1, 1) * 0.5
179
+
180
+ def y_offset_calc(x, i):
181
+ offset_calc = i // 6
182
+ # Initial coordinates - just a 3x2 grid
183
+ if offset_calc == 0:
184
+ return off * x
185
+ else:
186
+ # Smaller coordinates in the lowest row
187
+ return dupl_off * x + off * 2
188
+
189
+ offset_x = torch.zeros_like(index, dtype=torch.float32)
190
+ offset_y = torch.zeros_like(index, dtype=torch.float32)
191
+ offset_x_vals = [0, 1, 2, 0, 1, 2]
192
+ offset_y_vals = [0, 0, 0, 1, 1, 1]
193
+ for i in range(index.max().item() + 1):
194
+ mask = index == i
195
+ if not mask.any():
196
+ continue
197
+ offset_x[mask] = x_offset_calc(offset_x_vals[i % 6], i)
198
+ offset_y[mask] = y_offset_calc(offset_y_vals[i % 6], i)
199
+
200
+ div_x = torch.full_like(index, 6 // 2, dtype=torch.float32)
201
+ # All overlap elements are saved in half scale
202
+ div_x[index >= 6] = 6
203
+ div_y = div_x.clone() # Same for y
204
+ # Except for the random overlaps
205
+ div_x[index >= 12] = 2
206
+ # But the random overlaps are saved in a large block in the lower thirds
207
+ div_y[index >= 12] = 3
208
+
209
+ return offset_x, offset_y, div_x, div_y
210
+
211
+ def _calculate_tangents(
212
+ self,
213
+ vertex_positions: Tensor,
214
+ vertex_normals: Tensor,
215
+ triangle_idxs: Tensor,
216
+ face_uv: Tensor,
217
+ ) -> Tensor:
218
+ """
219
+ Calculate the tangents for each triangle
220
+
221
+ Args:
222
+ vertex_positions (Float[Tensor, "Nv 3"]): Vertex positions
223
+ vertex_normals (Float[Tensor, "Nv 3"]): Vertex normals
224
+ triangle_idxs (Integer[Tensor, "Nf 3"]): Triangle indices
225
+ face_uv (Float[Tensor, "Nf 3 2"]): Face UV coordinates
226
+
227
+ Returns:
228
+ Float[Tensor, "Nf 3 4"]: Tangents
229
+ """
230
+ vn_idx = [None] * 3
231
+ pos = [None] * 3
232
+ tex = face_uv.unbind(1)
233
+ for i in range(0, 3):
234
+ pos[i] = vertex_positions[triangle_idxs[:, i]]
235
+ # t_nrm_idx is always the same as t_pos_idx
236
+ vn_idx[i] = triangle_idxs[:, i]
237
+
238
+ if(torch.backends.mps.is_available()):
239
+ tangents = torch.zeros_like(vertex_normals).contiguous()
240
+ tansum = torch.zeros_like(vertex_normals).contiguous()
241
+ else:
242
+ tangents = torch.zeros_like(vertex_normals)
243
+ tansum = torch.zeros_like(vertex_normals)
244
+
245
+ # Compute tangent space for each triangle
246
+ duv1 = tex[1] - tex[0]
247
+ duv2 = tex[2] - tex[0]
248
+ dpos1 = pos[1] - pos[0]
249
+ dpos2 = pos[2] - pos[0]
250
+
251
+ tng_nom = dpos1 * duv2[..., 1:2] - dpos2 * duv1[..., 1:2]
252
+
253
+ denom = duv1[..., 0:1] * duv2[..., 1:2] - duv1[..., 1:2] * duv2[..., 0:1]
254
+
255
+ # Avoid division by zero for degenerated texture coordinates
256
+ denom_safe = denom.clip(1e-6)
257
+ tang = tng_nom / denom_safe
258
+
259
+ # Update all 3 vertices
260
+ for i in range(0, 3):
261
+ idx = vn_idx[i][:, None].repeat(1, 3)
262
+ tangents.scatter_add_(0, idx, tang) # tangents[n_i] = tangents[n_i] + tang
263
+ tansum.scatter_add_(
264
+ 0, idx, torch.ones_like(tang)
265
+ ) # tansum[n_i] = tansum[n_i] + 1
266
+ # Also normalize it. Here we do not normalize the individual triangles first so larger area
267
+ # triangles influence the tangent space more
268
+ tangents = tangents / tansum
269
+
270
+ # Normalize and make sure tangent is perpendicular to normal
271
+ tangents = F.normalize(tangents, dim=1)
272
+ tangents = F.normalize(
273
+ tangents
274
+ - (tangents * vertex_normals).sum(-1, keepdim=True) * vertex_normals
275
+ )
276
+
277
+ return tangents
278
+
279
def _rotate_uv_slices_consistent_space(
    self,
    vertex_positions: Tensor,
    vertex_normals: Tensor,
    triangle_idxs: Tensor,
    uv: Tensor,
    index: Tensor,
) -> Tensor:
    """
    Rotate the UV slices so they are in a consistent space.

    For each of the 6 cube-face slices, computes one global 2D rotation that
    aligns the slice's mean UV tangent with an "expected" tangent field, then
    applies it and renormalizes the slice's UVs back into [0, 1].

    NOTE(review): `uv` is modified in place via the masked assignments below
    and is also returned.

    Args:
        vertex_positions (Float[Tensor, "Nv 3"]): Vertex positions
        vertex_normals (Float[Tensor, "Nv 3"]): Vertex normals
        triangle_idxs (Integer[Tensor, "Nf 3"]): Triangle indices
        uv (Float[Tensor, "Nf 3 2"]): UV coordinates
        index (Integer[Tensor, "Nf"]): Atlas index

    Returns:
        Float[Tensor, "Nf 3 2"]: Rotated UV coordinates
    """

    # Per-vertex tangents induced by the current UV parameterization
    tangents = self._calculate_tangents(
        vertex_positions, vertex_normals, triangle_idxs, uv
    )
    # Reference direction field: positions rotated 90 degrees around z,
    # i.e. (-y, x, 0); defines the "expected" tangent orientation
    pos_stack = torch.stack(
        [
            -vertex_positions[..., 1],
            vertex_positions[..., 0],
            torch.zeros_like(vertex_positions[..., 0]),
        ],
        dim=-1,
    )
    # Project the reference field into each vertex's tangent plane via a
    # double cross product with the vertex normal
    expected_tangents = F.normalize(
        torch.linalg.cross(
            vertex_normals,
            torch.linalg.cross(pos_stack, vertex_normals, dim=-1),
            dim=-1,
        ),
        -1,
    )

    # Gather both tangent fields per triangle corner: (Nf, 3, 3)
    actual_tangents = tangents[triangle_idxs]
    expected_tangents = expected_tangents[triangle_idxs]

    def rotation_matrix_2d(theta):
        # Standard CCW 2D rotation matrix for the scalar angle theta
        c, s = torch.cos(theta), torch.sin(theta)
        return torch.tensor([[c, -s], [s, c]])

    # Now find one rotation per cube-face slice
    index_mod = index % 6  # Values >= 6 shouldn't happen here. Just for safety
    for i in range(6):
        mask = index_mod == i
        if not mask.any():
            continue

        # Mean tangent direction over all faces/corners in this slice
        actual_mean_tangent = actual_tangents[mask].mean(dim=(0, 1))
        expected_mean_tangent = expected_tangents[mask].mean(dim=(0, 1))

        # Signed angle between the mean actual and expected tangents
        # (atan2 of 2D cross over dot gives the full-range angle)
        dot_product = torch.dot(actual_mean_tangent, expected_mean_tangent)
        cross_product = (
            actual_mean_tangent[0] * expected_mean_tangent[1]
            - actual_mean_tangent[1] * expected_mean_tangent[0]
        )
        angle = torch.atan2(cross_product, dot_product)

        rot_matrix = rotation_matrix_2d(angle).to(mask.device)
        # Center the uv coordinate to be in the range of -1 to 1 and 0 centered
        uv_cur = uv[mask] * 2 - 1  # Center it first
        # Rotate it
        uv[mask] = torch.einsum("ij,nfj->nfi", rot_matrix, uv_cur)

        # Rescale uv[mask] to be within the 0-1 range
        # NOTE(review): divides by (max - min) of the whole slice; a fully
        # degenerate slice (all UVs equal) would divide by zero — presumably
        # never happens for real meshes; verify upstream.
        uv[mask] = (uv[mask] - uv[mask].min()) / (uv[mask].max() - uv[mask].min())

    return uv
355
+
356
+ def _handle_slice_uvs(
357
+ self,
358
+ uv: Tensor,
359
+ index: Tensor, # noqa: F821
360
+ island_padding: float,
361
+ max_index: int = 6 * 2,
362
+ ) -> Tensor: # noqa: F821
363
+ """
364
+ Handle the slice UVs
365
+
366
+ Args:
367
+ uv (Float[Tensor, "Nf 3 2"]): UV coordinates
368
+ index (Integer[Tensor, "Nf"]): Atlas index
369
+ island_padding (float): Island padding
370
+ max_index (int): Maximum index
371
+
372
+ Returns:
373
+ Float[Tensor, "Nf 3 2"]: Updated UV coordinates
374
+
375
+ """
376
+ uc, vc = uv.unbind(-1)
377
+
378
+ # Get the second slice (The first overlap)
379
+ index_filter = [index == i for i in range(6, max_index)]
380
+
381
+ # Normalize them to always fully fill the atlas patch
382
+ for i, fi in enumerate(index_filter):
383
+ if fi.sum() > 0:
384
+ # Scale the slice but only up to a factor of 2
385
+ # This keeps the texture resolution with the first slice in line (Half space in UV)
386
+ uc[fi] = (uc[fi] - uc[fi].min()) / (uc[fi].max() - uc[fi].min()).clip(
387
+ 0.5
388
+ )
389
+ vc[fi] = (vc[fi] - vc[fi].min()) / (vc[fi].max() - vc[fi].min()).clip(
390
+ 0.5
391
+ )
392
+
393
+ uc_padded = (uc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
394
+ vc_padded = (vc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
395
+
396
+ return torch.stack([uc_padded, vc_padded], dim=-1)
397
+
398
def _handle_remaining_uvs(
    self,
    uv: Tensor,
    index: Tensor,  # noqa: F821
    island_padding: float,
) -> Tensor:
    """
    Handle the remaining UVs (the ones that are not slices).

    All faces with atlas index >= 12 are packed individually as small
    squares into a rectangle covering half the atlas width and a third of
    its height (the lower-right region).

    Args:
        uv (Float[Tensor, "Nf 3 2"]): UV coordinates
        index (Integer[Tensor, "Nf"]): Atlas index
        island_padding (float): Island padding

    Returns:
        Float[Tensor, "Nf 3 2"]: Updated UV coordinates
    """
    uc, vc = uv.unbind(-1)
    # Get all remaining elements (beyond the two slice layers 0-11)
    remaining_filter = index >= 6 * 2
    squares_left = remaining_filter.sum()

    if squares_left == 0:
        return uv

    uc = uc[remaining_filter]
    vc = vc[remaining_filter]

    # The remaining triangles are distributed in a rectangle that takes
    # 0.5 of the entire uv space in width and 1/3 in height,
    # so its area fraction is 0.5 * (1/3) = 1/6
    ratio = 0.5 * (1 / 3)

    # Grid dimensions: width/height chosen so ~squares_left cells fit the
    # rectangle's aspect ratio
    mult = math.sqrt(squares_left / ratio)
    num_square_width = int(math.ceil(0.5 * mult))
    num_square_height = int(math.ceil(squares_left / num_square_width))

    # Size of one grid cell in the (local) rectangle's coordinates
    width = 1 / num_square_width
    height = 1 / num_square_height

    # The idea is again to keep the texture resolution consistent with the
    # first slice. This only occupies half the region in the texture chart
    # but the scaling on the squares assumes full coverage, hence the cap.
    clip_val = min(width, height) * 1.5
    # Now normalize the UVs, taking the maximum scaling into account
    uc = (uc - uc.min(dim=1, keepdim=True).values) / (
        uc.amax(dim=1, keepdim=True) - uc.amin(dim=1, keepdim=True)
    ).clip(clip_val)
    vc = (vc - vc.min(dim=1, keepdim=True).values) / (
        vc.amax(dim=1, keepdim=True) - vc.amin(dim=1, keepdim=True)
    ).clip(clip_val)
    # Add a small padding (scaled by grid density so the absolute gap is
    # comparable to the slice padding)
    uc = (
        uc * (1 - island_padding * num_square_width * 0.5)
        + island_padding * num_square_width * 0.25
    ).clip(0, 1)
    vc = (
        vc * (1 - island_padding * num_square_height * 0.5)
        + island_padding * num_square_height * 0.25
    ).clip(0, 1)

    # Shrink each triangle into its grid cell
    uc = uc * width
    vc = vc * height

    # And calculate offsets for each element
    idx = torch.arange(uc.shape[0], device=uc.device, dtype=torch.int32)
    x_idx = idx % num_square_width
    y_idx = idx // num_square_width
    # And move each triangle to its own spot
    uc = uc + x_idx[:, None] * width
    vc = vc + y_idx[:, None] * height

    # Final global padding pass over the packed rectangle
    uc = (uc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)
    vc = (vc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)

    # Write the packed coordinates back for the remaining faces only
    uv[remaining_filter] = torch.stack([uc, vc], dim=-1)

    return uv
476
+
477
def _distribute_individual_uvs_in_atlas(
    self,
    face_uv: Tensor,
    assigned_faces: Tensor,
    offset_x: Tensor,
    offset_y: Tensor,
    div_x: Tensor,
    div_y: Tensor,
    island_padding: float,
) -> Tensor:
    """
    Distribute the individual UVs in the atlas.

    Normalizes the overlap slices and leftover triangles, then scales each
    face into its atlas cell and shifts it to its assigned position.

    Args:
        face_uv (Float[Tensor, "Nf 3 2"]): Face UV coordinates
        assigned_faces (Integer[Tensor, "Nf"]): Assigned atlas index per face
        offset_x (Float[Tensor, "Nf"]): Offset x
        offset_y (Float[Tensor, "Nf"]): Offset y
        div_x (Float[Tensor, "Nf"]): Division x
        div_y (Float[Tensor, "Nf"]): Division y
        island_padding (float): Island padding

    Returns:
        Float[Tensor, "Nf*3 2"]: Flattened per-corner UV coordinates
    """
    # First normalize the overlap slices ...
    placed_uv = self._handle_slice_uvs(face_uv, assigned_faces, island_padding)
    # ... then pack whatever overlap elements remain
    placed_uv = self._handle_remaining_uvs(
        placed_uv, assigned_faces, island_padding
    )

    # Scale every face into its atlas cell and translate it into place
    u_coords, v_coords = placed_uv.unbind(-1)
    u_coords = u_coords / div_x[:, None] + offset_x[:, None]
    v_coords = v_coords / div_y[:, None] + offset_y[:, None]

    # Flatten to one UV row per triangle corner
    return torch.stack([u_coords, v_coords], dim=-1).view(-1, 2)
516
+
517
+ def _get_unique_face_uv(
518
+ self,
519
+ uv: Tensor,
520
+ ) -> Tuple[Tensor, Tensor]:
521
+ """
522
+ Get the unique face UV
523
+
524
+ Args:
525
+ uv (Float[Tensor, "Nf 3 2"]): UV coordinates
526
+
527
+ Returns:
528
+ Float[Tensor, "Utex 3"]: Unique UV coordinates
529
+ Integer[Tensor, "Nf"]: Vertex index
530
+ """
531
+ unique_uv, unique_idx = torch.unique(uv, return_inverse=True, dim=0)
532
+ # And add the face to uv index mapping
533
+ vtex_idx = unique_idx.view(-1, 3)
534
+
535
+ return unique_uv, vtex_idx
536
+
537
def _align_mesh_with_main_axis(
    self, vertex_positions: Tensor, vertex_normals: Tensor
) -> Tuple[Tensor, Tensor]:
    """
    Align the mesh with the main axis.

    Finds the two principal axes of the vertex cloud via PCA, derives the
    third by cross product, assigns each to the closest canonical axis, and
    rotates positions and normals into that frame.

    Args:
        vertex_positions (Float[Tensor, "Nv 3"]): Vertex positions
        vertex_normals (Float[Tensor, "Nv 3"]): Vertex normals

    Returns:
        Float[Tensor, "Nv 3"]: Rotated vertex positions
        Float[Tensor, "Nv 3"]: Rotated vertex normals
    """

    # Use pca to find the 2 main axes (third is derived by cross product).
    # pca_lowrank is randomized, so set the seed to make it repeatable.
    torch.manual_seed(0)
    _, _, v = torch.pca_lowrank(vertex_positions, q=2)
    main_axis, seconday_axis = v[:, 0], v[:, 1]

    main_axis = F.normalize(main_axis, eps=1e-6, dim=-1)  # 3,
    # Orthogonalize the second axis against the first (Gram-Schmidt)
    seconday_axis = F.normalize(
        seconday_axis
        - (seconday_axis * main_axis).sum(-1, keepdim=True) * main_axis,
        eps=1e-6,
        dim=-1,
    )  # 3,
    # Create perpendicular third axis
    third_axis = F.normalize(
        torch.cross(main_axis, seconday_axis, dim=-1), dim=-1, eps=1e-6
    )  # 3,

    # Check to which canonical axis each principal axis aligns best
    main_axis_max_idx = main_axis.abs().argmax().item()
    seconday_axis_max_idx = seconday_axis.abs().argmax().item()
    third_axis_max_idx = third_axis.abs().argmax().item()

    # Now sort the axes based on the argmax so they align with the canonical
    # axes. If two axes have the same argmax, reassign one of them; the loop
    # runs at most twice before raising (cur_index guard below).
    all_possible_axis = {0, 1, 2}
    cur_index = 1
    while (
        len(set([main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx]))
        != 3
    ):
        # Find the canonical axis nothing currently claims
        missing_axis = all_possible_axis - set(
            [main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx]
        )
        missing_axis = missing_axis.pop()
        # Just assign it to the third axis first, as it had the smallest
        # contribution to the overall shape; then the second axis
        if cur_index == 1:
            third_axis_max_idx = missing_axis
        elif cur_index == 2:
            seconday_axis_max_idx = missing_axis
        else:
            raise ValueError("Could not find 3 unique axis")
        cur_index += 1

    # Sanity check: all three canonical slots must now be distinct
    if len({main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx}) != 3:
        raise ValueError("Could not find 3 unique axis")

    axes = [None] * 3
    axes[main_axis_max_idx] = main_axis
    axes[seconday_axis_max_idx] = seconday_axis
    axes[third_axis_max_idx] = third_axis
    # Create the rotation matrix from the individual axes (rows = new basis)
    rot_mat = torch.stack(axes, dim=1).T

    # Now rotate the vertex positions and vertex normals so the mesh aligns
    # with the canonical axes
    vertex_positions = torch.einsum("ij,nj->ni", rot_mat, vertex_positions)
    vertex_normals = torch.einsum("ij,nj->ni", rot_mat, vertex_normals)

    return vertex_positions, vertex_normals
614
+
615
def forward(
    self,
    vertex_positions: Tensor,
    vertex_normals: Tensor,
    triangle_idxs: Tensor,
    island_padding: float,
) -> Tuple[Tensor, Tensor]:
    """
    Unwrap the mesh into a UV atlas.

    Pipeline: align the mesh with its principal axes, project each face onto
    one of the 6 cube faces, rotate each cube-face slice into a consistent
    orientation, resolve overlaps into atlas chart indices, lay the charts
    out in the atlas, and finally deduplicate the resulting UVs.

    Args:
        vertex_positions (Float[Tensor, "Nv 3"]): Vertex positions
        vertex_normals (Float[Tensor, "Nv 3"]): Vertex normals
        triangle_idxs (Integer[Tensor, "Nf 3"]): Triangle indices
        island_padding (float): Island padding

    Returns:
        Float[Tensor, "Utex 3"]: Unique UV coordinates
        Integer[Tensor, "Nf"]: Vertex index
    """
    # Rotate the mesh so its principal axes line up with x/y/z
    vertex_positions, vertex_normals = self._align_mesh_with_main_axis(
        vertex_positions, vertex_normals
    )
    # Axis-aligned bounding box: row 0 = min corner, row 1 = max corner
    bbox = torch.stack(
        [vertex_positions.min(dim=0).values, vertex_positions.max(dim=0).values],
        dim=0,
    )  # 2, 3

    # Project every face onto one of the 6 cube faces
    face_uv, face_index = self._box_assign_vertex_to_cube_face(
        vertex_positions, vertex_normals, triangle_idxs, bbox
    )

    # Rotate each cube-face slice into a consistent orientation
    face_uv = self._rotate_uv_slices_consistent_space(
        vertex_positions, vertex_normals, triangle_idxs, face_uv, face_index
    )

    # Resolve overlaps: each face gets a final atlas chart index
    assigned_atlas_index = self._assign_faces_uv_to_atlas_index(
        vertex_positions, triangle_idxs, face_uv, face_index
    )

    # Per-chart placement (offset + scale) within the atlas
    offset_x, offset_y, div_x, div_y = self._find_slice_offset_and_scale(
        assigned_atlas_index
    )

    # Lay all faces out in the atlas
    placed_uv = self._distribute_individual_uvs_in_atlas(
        face_uv,
        assigned_atlas_index,
        offset_x,
        offset_y,
        div_x,
        div_y,
        island_padding,
    )

    # Deduplicate the UVs and return the face -> UV mapping
    return self._get_unique_face_uv(placed_uv)