Commit 2caa1bd by hugoycj
Parent(s): 9059c91

feat: Add mast3r dependencies
Files changed:
- dust3r/model.py +5 -1
- mast3r/__init__.py +2 -0
- mast3r/catmlp_dpt_head.py +123 -0
- mast3r/cloud_opt/__init__.py +2 -0
- mast3r/cloud_opt/sparse_ga.py +1035 -0
- mast3r/cloud_opt/triangulation.py +80 -0
- mast3r/cloud_opt/tsdf_optimizer.py +269 -0
- mast3r/cloud_opt/utils/__init__.py +2 -0
- mast3r/cloud_opt/utils/losses.py +32 -0
- mast3r/cloud_opt/utils/schedules.py +17 -0
- mast3r/colmap/__init__.py +2 -0
- mast3r/colmap/database.py +383 -0
- mast3r/datasets/__init__.py +62 -0
- mast3r/datasets/base/__init__.py +2 -0
- mast3r/datasets/base/mast3r_base_stereo_view_dataset.py +355 -0
- mast3r/datasets/utils/__init__.py +2 -0
- mast3r/datasets/utils/cropping.py +219 -0
- mast3r/demo.py +321 -0
- mast3r/fast_nn.py +223 -0
- mast3r/losses.py +508 -0
- mast3r/model.py +68 -0
- mast3r/utils/__init__.py +2 -0
- mast3r/utils/coarse_to_fine.py +214 -0
- mast3r/utils/collate.py +62 -0
- mast3r/utils/misc.py +17 -0
- mast3r/utils/path_to_dust3r.py +19 -0
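For orientation (derived from the file list above, not part of the commit itself), the added files form the following package layout, alongside the single modification to dust3r/model.py:

    mast3r/
    ├── __init__.py
    ├── catmlp_dpt_head.py
    ├── demo.py
    ├── fast_nn.py
    ├── losses.py
    ├── model.py
    ├── cloud_opt/
    │   ├── __init__.py
    │   ├── sparse_ga.py
    │   ├── triangulation.py
    │   ├── tsdf_optimizer.py
    │   └── utils/
    │       ├── __init__.py
    │       ├── losses.py
    │       └── schedules.py
    ├── colmap/
    │   ├── __init__.py
    │   └── database.py
    ├── datasets/
    │   ├── __init__.py
    │   ├── base/
    │   │   ├── __init__.py
    │   │   └── mast3r_base_stereo_view_dataset.py
    │   └── utils/
    │       ├── __init__.py
    │       └── cropping.py
    └── utils/
        ├── __init__.py
        ├── coarse_to_fine.py
        ├── collate.py
        ├── misc.py
        └── path_to_dust3r.py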
dust3r/model.py  CHANGED

@@ -14,6 +14,7 @@ from .utils.misc import fill_default_args, freeze_all_params, is_symmetrized, in
 from .heads import head_factory
 from dust3r.patch_embed import get_patch_embed
 
+import urllib
 import dust3r.utils.path_to_croco  # noqa: F401
 from models.croco import CroCoNet  # noqa
 
@@ -78,7 +79,10 @@ class AsymmetricCroCo3DStereo (
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kw):
-
+        if os.path.isfile(pretrained_model_name_or_path) or urllib.parse.urlparse(pretrained_model_name_or_path).scheme in ('http', 'https'):
+            return load_model(pretrained_model_name_or_path, device='cpu', landscape_only=kw['landscape_only'])
+        else:
+            return super(AsymmetricCroCo3DStereo, cls).from_pretrained(pretrained_model_name_or_path, **kw)
 
     def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
         self.patch_embed = get_patch_embed(self.patch_embed_cls, img_size, patch_size, enc_embed_dim)
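A note on the change above (not part of the diff): the new from_pretrained first checks whether the argument is an existing file or an http(s) URL and, if so, routes it through load_model, which expects landscape_only to be supplied via **kw; anything else falls back to the parent Hugging Face loader. A minimal sketch with placeholder paths, URL and repo id:

# Hedged sketch of the three dispatch cases; the checkpoint path, URL and repo id are placeholders.
from dust3r.model import AsymmetricCroCo3DStereo

m1 = AsymmetricCroCo3DStereo.from_pretrained('checkpoints/local_ckpt.pth', landscape_only=False)    # local file -> load_model
m2 = AsymmetricCroCo3DStereo.from_pretrained('https://example.com/ckpt.pth', landscape_only=False)  # http(s) URL -> load_model
m3 = AsymmetricCroCo3DStereo.from_pretrained('some-org/some-dust3r-model')                          # otherwise -> parent from_pretrained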
mast3r/__init__.py  ADDED
@@ -0,0 +1,2 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/catmlp_dpt_head.py  ADDED
@@ -0,0 +1,123 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# MASt3R heads
# --------------------------------------------------------
import torch
import torch.nn.functional as F

import mast3r.utils.path_to_dust3r  # noqa
from dust3r.heads.postprocess import reg_dense_depth, reg_dense_conf  # noqa
from dust3r.heads.dpt_head import PixelwiseTaskWithDPT  # noqa
import dust3r.utils.path_to_croco  # noqa
from models.blocks import Mlp  # noqa


def reg_desc(desc, mode):
    if 'norm' in mode:
        desc = desc / desc.norm(dim=-1, keepdim=True)
    else:
        raise ValueError(f"Unknown desc mode {mode}")
    return desc


def postprocess(out, depth_mode, conf_mode, desc_dim=None, desc_mode='norm', two_confs=False, desc_conf_mode=None):
    if desc_conf_mode is None:
        desc_conf_mode = conf_mode
    fmap = out.permute(0, 2, 3, 1)  # B,H,W,D
    res = dict(pts3d=reg_dense_depth(fmap[..., 0:3], mode=depth_mode))
    if conf_mode is not None:
        res['conf'] = reg_dense_conf(fmap[..., 3], mode=conf_mode)
    if desc_dim is not None:
        start = 3 + int(conf_mode is not None)
        res['desc'] = reg_desc(fmap[..., start:start + desc_dim], mode=desc_mode)
        if two_confs:
            res['desc_conf'] = reg_dense_conf(fmap[..., start + desc_dim], mode=desc_conf_mode)
        else:
            res['desc_conf'] = res['conf'].clone()
    return res


class Cat_MLP_LocalFeatures_DPT_Pts3d(PixelwiseTaskWithDPT):
    """ Mixture between MLP and DPT head that outputs 3d points and local features (with MLP).
    The input for both heads is a concatenation of Encoder and Decoder outputs
    """

    def __init__(self, net, has_conf=False, local_feat_dim=16, hidden_dim_factor=4., hooks_idx=None, dim_tokens=None,
                 num_channels=1, postprocess=None, feature_dim=256, last_dim=32, depth_mode=None, conf_mode=None, head_type="regression", **kwargs):
        super().__init__(num_channels=num_channels, feature_dim=feature_dim, last_dim=last_dim, hooks_idx=hooks_idx,
                         dim_tokens=dim_tokens, depth_mode=depth_mode, postprocess=postprocess, conf_mode=conf_mode, head_type=head_type)
        self.local_feat_dim = local_feat_dim

        patch_size = net.patch_embed.patch_size
        if isinstance(patch_size, tuple):
            assert len(patch_size) == 2 and isinstance(patch_size[0], int) and isinstance(
                patch_size[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints."
            assert patch_size[0] == patch_size[1], "Error, non square patches not managed"
            patch_size = patch_size[0]
        self.patch_size = patch_size

        self.desc_mode = net.desc_mode
        self.has_conf = has_conf
        self.two_confs = net.two_confs  # independent confs for 3D regr and descs
        self.desc_conf_mode = net.desc_conf_mode
        idim = net.enc_embed_dim + net.dec_embed_dim

        self.head_local_features = Mlp(in_features=idim,
                                       hidden_features=int(hidden_dim_factor * idim),
                                       out_features=(self.local_feat_dim + self.two_confs) * self.patch_size**2)

    def forward(self, decout, img_shape):
        # pass through the heads
        pts3d = self.dpt(decout, image_size=(img_shape[0], img_shape[1]))

        # recover encoder and decoder outputs
        enc_output, dec_output = decout[0], decout[-1]
        cat_output = torch.cat([enc_output, dec_output], dim=-1)  # concatenate
        H, W = img_shape
        B, S, D = cat_output.shape

        # extract local_features
        local_features = self.head_local_features(cat_output)  # B,S,D
        local_features = local_features.transpose(-1, -2).view(B, -1, H // self.patch_size, W // self.patch_size)
        local_features = F.pixel_shuffle(local_features, self.patch_size)  # B,d,H,W

        # post process 3D pts, descriptors and confidences
        out = torch.cat([pts3d, local_features], dim=1)
        if self.postprocess:
            out = self.postprocess(out,
                                   depth_mode=self.depth_mode,
                                   conf_mode=self.conf_mode,
                                   desc_dim=self.local_feat_dim,
                                   desc_mode=self.desc_mode,
                                   two_confs=self.two_confs,
                                   desc_conf_mode=self.desc_conf_mode)
        return out


def mast3r_head_factory(head_type, output_mode, net, has_conf=False):
    """" build a prediction head for the decoder
    """
    if head_type == 'catmlp+dpt' and output_mode.startswith('pts3d+desc'):
        local_feat_dim = int(output_mode[10:])
        assert net.dec_depth > 9
        l2 = net.dec_depth
        feature_dim = 256
        last_dim = feature_dim // 2
        out_nchan = 3
        ed = net.enc_embed_dim
        dd = net.dec_embed_dim
        return Cat_MLP_LocalFeatures_DPT_Pts3d(net, local_feat_dim=local_feat_dim, has_conf=has_conf,
                                               num_channels=out_nchan + has_conf,
                                               feature_dim=feature_dim,
                                               last_dim=last_dim,
                                               hooks_idx=[0, l2 * 2 // 4, l2 * 3 // 4, l2],
                                               dim_tokens=[ed, dd, dd, dd],
                                               postprocess=postprocess,
                                               depth_mode=net.depth_mode,
                                               conf_mode=net.conf_mode,
                                               head_type='regression')
    else:
        raise NotImplementedError(
            f"unexpected {head_type=} and {output_mode=}")
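As a reading aid (not part of the commit), the factory above derives the local-descriptor width from the output_mode string and sizes the DPT and MLP branches from it. A small sketch of that arithmetic, using an illustrative output_mode and an assumed patch size:

# Illustrative only: how mast3r_head_factory sizes the head for output_mode='pts3d+desc24'.
output_mode = 'pts3d+desc24'
local_feat_dim = int(output_mode[10:])        # len('pts3d+desc') == 10, so the suffix gives 24-dim descriptors
has_conf = True
num_channels = 3 + has_conf                   # DPT branch regresses xyz plus an optional confidence channel
patch_size = 16                               # assumed patch size for this sketch
two_confs = True
mlp_out = (local_feat_dim + two_confs) * patch_size**2   # per-token output width of the MLP branch
# After F.pixel_shuffle, the MLP branch becomes a (local_feat_dim + two_confs)-channel map at full resolution,
# and postprocess() slices the concatenated output as [xyz | conf | desc | desc_conf].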
mast3r/cloud_opt/__init__.py  ADDED
@@ -0,0 +1,2 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/cloud_opt/sparse_ga.py
ADDED
@@ -0,0 +1,1035 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# MASt3R Sparse Global Alignement
|
6 |
+
# --------------------------------------------------------
|
7 |
+
from tqdm import tqdm
|
8 |
+
import roma
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import numpy as np
|
13 |
+
import os
|
14 |
+
from collections import namedtuple
|
15 |
+
from functools import lru_cache
|
16 |
+
from scipy import sparse as sp
|
17 |
+
import copy
|
18 |
+
|
19 |
+
from mast3r.utils.misc import mkdir_for, hash_md5
|
20 |
+
from mast3r.cloud_opt.utils.losses import gamma_loss
|
21 |
+
from mast3r.cloud_opt.utils.schedules import linear_schedule, cosine_schedule
|
22 |
+
from mast3r.fast_nn import fast_reciprocal_NNs, merge_corres
|
23 |
+
|
24 |
+
import mast3r.utils.path_to_dust3r # noqa
|
25 |
+
from dust3r.utils.geometry import inv, geotrf # noqa
|
26 |
+
from dust3r.utils.device import to_cpu, to_numpy, todevice # noqa
|
27 |
+
from dust3r.post_process import estimate_focal_knowing_depth # noqa
|
28 |
+
from dust3r.optim_factory import adjust_learning_rate_by_lr # noqa
|
29 |
+
from dust3r.viz import SceneViz
|
30 |
+
|
31 |
+
|
32 |
+
class SparseGA():
|
33 |
+
def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None):
|
34 |
+
def fetch_img(im):
|
35 |
+
def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.)
|
36 |
+
for im1, im2 in pairs_in:
|
37 |
+
if im1['instance'] == im:
|
38 |
+
return torgb(im1['img'])
|
39 |
+
if im2['instance'] == im:
|
40 |
+
return torgb(im2['img'])
|
41 |
+
self.canonical_paths = canonical_paths
|
42 |
+
self.img_paths = img_paths
|
43 |
+
self.imgs = [fetch_img(img) for img in img_paths]
|
44 |
+
self.intrinsics = res_fine['intrinsics']
|
45 |
+
self.cam2w = res_fine['cam2w']
|
46 |
+
self.depthmaps = res_fine['depthmaps']
|
47 |
+
self.pts3d = res_fine['pts3d']
|
48 |
+
self.pts3d_colors = []
|
49 |
+
self.working_device = self.cam2w.device
|
50 |
+
for i in range(len(self.imgs)):
|
51 |
+
im = self.imgs[i]
|
52 |
+
x, y = anchors[i][0][..., :2].detach().cpu().numpy().T
|
53 |
+
self.pts3d_colors.append(im[y, x])
|
54 |
+
assert self.pts3d_colors[-1].shape == self.pts3d[i].shape
|
55 |
+
self.n_imgs = len(self.imgs)
|
56 |
+
|
57 |
+
def get_focals(self):
|
58 |
+
return torch.tensor([ff[0, 0] for ff in self.intrinsics]).to(self.working_device)
|
59 |
+
|
60 |
+
def get_principal_points(self):
|
61 |
+
return torch.stack([ff[:2, -1] for ff in self.intrinsics]).to(self.working_device)
|
62 |
+
|
63 |
+
def get_im_poses(self):
|
64 |
+
return self.cam2w
|
65 |
+
|
66 |
+
def get_sparse_pts3d(self):
|
67 |
+
return self.pts3d
|
68 |
+
|
69 |
+
def get_dense_pts3d(self, clean_depth=True, subsample=8):
|
70 |
+
assert self.canonical_paths, 'cache_path is required for dense 3d points'
|
71 |
+
device = self.cam2w.device
|
72 |
+
confs = []
|
73 |
+
base_focals = []
|
74 |
+
anchors = {}
|
75 |
+
for i, canon_path in enumerate(self.canonical_paths):
|
76 |
+
(canon, canon2, conf), focal = torch.load(canon_path, map_location=device)
|
77 |
+
confs.append(conf)
|
78 |
+
base_focals.append(focal)
|
79 |
+
|
80 |
+
H, W = conf.shape
|
81 |
+
pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device)
|
82 |
+
idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample)
|
83 |
+
anchors[i] = (pixels, idxs[i], offsets[i])
|
84 |
+
|
85 |
+
# densify sparse depthmaps
|
86 |
+
pts3d, depthmaps = make_pts3d(anchors, self.intrinsics, self.cam2w, [
|
87 |
+
d.ravel() for d in self.depthmaps], base_focals=base_focals, ret_depth=True)
|
88 |
+
|
89 |
+
return pts3d, depthmaps, confs
|
90 |
+
|
91 |
+
def get_pts3d_colors(self):
|
92 |
+
return self.pts3d_colors
|
93 |
+
|
94 |
+
def get_depthmaps(self):
|
95 |
+
return self.depthmaps
|
96 |
+
|
97 |
+
def get_masks(self):
|
98 |
+
return [slice(None, None) for _ in range(len(self.imgs))]
|
99 |
+
|
100 |
+
def show(self, show_cams=True):
|
101 |
+
pts3d, _, confs = self.get_dense_pts3d()
|
102 |
+
show_reconstruction(self.imgs, self.intrinsics if show_cams else None, self.cam2w,
|
103 |
+
[p.clip(min=-50, max=50) for p in pts3d],
|
104 |
+
masks=[c > 1 for c in confs])
|
105 |
+
|
106 |
+
|
107 |
+
def convert_dust3r_pairs_naming(imgs, pairs_in):
|
108 |
+
for pair_id in range(len(pairs_in)):
|
109 |
+
for i in range(2):
|
110 |
+
pairs_in[pair_id][i]['instance'] = imgs[pairs_in[pair_id][i]['idx']]
|
111 |
+
return pairs_in
|
112 |
+
|
113 |
+
|
114 |
+
def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf',
|
115 |
+
device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw):
|
116 |
+
""" Sparse alignment with MASt3R
|
117 |
+
imgs: list of image paths
|
118 |
+
cache_path: path where to dump temporary files (str)
|
119 |
+
|
120 |
+
lr1, niter1: learning rate and #iterations for coarse global alignment (3D matching)
|
121 |
+
lr2, niter2: learning rate and #iterations for refinement (2D reproj error)
|
122 |
+
|
123 |
+
lora_depth: smart dimensionality reduction with depthmaps
|
124 |
+
"""
|
125 |
+
# Convert pair naming convention from dust3r to mast3r
|
126 |
+
pairs_in = convert_dust3r_pairs_naming(imgs, pairs_in)
|
127 |
+
# forward pass
|
128 |
+
pairs, cache_path = forward_mast3r(pairs_in, model,
|
129 |
+
cache_path=cache_path, subsample=subsample,
|
130 |
+
desc_conf=desc_conf, device=device)
|
131 |
+
|
132 |
+
# extract canonical pointmaps
|
133 |
+
tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \
|
134 |
+
prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device)
|
135 |
+
|
136 |
+
# compute minimal spanning tree
|
137 |
+
mst = compute_min_spanning_tree(pairwise_scores)
|
138 |
+
|
139 |
+
# remove all edges not in the spanning tree?
|
140 |
+
# min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]}
|
141 |
+
# tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree}
|
142 |
+
|
143 |
+
# smartly combine all useful data
|
144 |
+
imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \
|
145 |
+
condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype)
|
146 |
+
|
147 |
+
imgs, res_coarse, res_fine = sparse_scene_optimizer(
|
148 |
+
imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths, mst,
|
149 |
+
shared_intrinsics=shared_intrinsics, cache_path=cache_path, device=device, dtype=dtype, **kw)
|
150 |
+
|
151 |
+
return SparseGA(imgs, pairs_in, res_fine or res_coarse, anchors, canonical_paths)
|
152 |
+
|
153 |
+
|
154 |
+
def sparse_scene_optimizer(imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d,
|
155 |
+
preds_21, canonical_paths, mst, cache_path,
|
156 |
+
lr1=0.2, niter1=500, loss1=gamma_loss(1.1),
|
157 |
+
lr2=0.02, niter2=500, loss2=gamma_loss(0.4),
|
158 |
+
lossd=gamma_loss(1.1),
|
159 |
+
opt_pp=True, opt_depth=True,
|
160 |
+
schedule=cosine_schedule, depth_mode='add', exp_depth=False,
|
161 |
+
lora_depth=False, # dict(k=96, gamma=15, min_norm=.5),
|
162 |
+
shared_intrinsics=False,
|
163 |
+
init={}, device='cuda', dtype=torch.float32,
|
164 |
+
matching_conf_thr=5., loss_dust3r_w=0.01,
|
165 |
+
verbose=True, dbg=()):
|
166 |
+
init = copy.deepcopy(init)
|
167 |
+
# extrinsic parameters
|
168 |
+
vec0001 = torch.tensor((0, 0, 0, 1), dtype=dtype, device=device)
|
169 |
+
quats = [nn.Parameter(vec0001.clone()) for _ in range(len(imgs))]
|
170 |
+
trans = [nn.Parameter(torch.zeros(3, device=device, dtype=dtype)) for _ in range(len(imgs))]
|
171 |
+
|
172 |
+
# initialize
|
173 |
+
ones = torch.ones((len(imgs), 1), device=device, dtype=dtype)
|
174 |
+
median_depths = torch.ones(len(imgs), device=device, dtype=dtype)
|
175 |
+
for img in imgs:
|
176 |
+
idx = imgs.index(img)
|
177 |
+
init_values = init.setdefault(img, {})
|
178 |
+
if verbose and init_values:
|
179 |
+
print(f' >> initializing img=...{img[-25:]} [{idx}] for {set(init_values)}')
|
180 |
+
|
181 |
+
K = init_values.get('intrinsics')
|
182 |
+
if K is not None:
|
183 |
+
K = K.detach()
|
184 |
+
focal = K[:2, :2].diag().mean()
|
185 |
+
pp = K[:2, 2]
|
186 |
+
base_focals[idx] = focal
|
187 |
+
pps[idx] = pp
|
188 |
+
pps[idx] /= imsizes[idx] # default principal_point would be (0.5, 0.5)
|
189 |
+
|
190 |
+
depth = init_values.get('depthmap')
|
191 |
+
if depth is not None:
|
192 |
+
core_depth[idx] = depth.detach()
|
193 |
+
|
194 |
+
median_depths[idx] = med_depth = core_depth[idx].median()
|
195 |
+
core_depth[idx] /= med_depth
|
196 |
+
|
197 |
+
cam2w = init_values.get('cam2w')
|
198 |
+
if cam2w is not None:
|
199 |
+
rot = cam2w[:3, :3].detach()
|
200 |
+
cam_center = cam2w[:3, 3].detach()
|
201 |
+
quats[idx].data[:] = roma.rotmat_to_unitquat(rot)
|
202 |
+
trans_offset = med_depth * torch.cat((imsizes[idx] / base_focals[idx] * (0.5 - pps[idx]), ones[:1, 0]))
|
203 |
+
trans[idx].data[:] = cam_center + rot @ trans_offset
|
204 |
+
del rot
|
205 |
+
assert False, 'inverse kinematic chain not yet implemented'
|
206 |
+
|
207 |
+
# intrinsics parameters
|
208 |
+
if shared_intrinsics:
|
209 |
+
# Optimize a single set of intrinsics for all cameras. Use averages as init.
|
210 |
+
confs = torch.stack([torch.load(pth)[0][2].mean() for pth in canonical_paths]).to(pps)
|
211 |
+
weighting = confs / confs.sum()
|
212 |
+
pp = nn.Parameter((weighting @ pps).to(dtype))
|
213 |
+
pps = [pp for _ in range(len(imgs))]
|
214 |
+
focal_m = weighting @ base_focals
|
215 |
+
log_focal = nn.Parameter(focal_m.view(1).log().to(dtype))
|
216 |
+
log_focals = [log_focal for _ in range(len(imgs))]
|
217 |
+
else:
|
218 |
+
pps = [nn.Parameter(pp.to(dtype)) for pp in pps]
|
219 |
+
log_focals = [nn.Parameter(f.view(1).log().to(dtype)) for f in base_focals]
|
220 |
+
|
221 |
+
diags = imsizes.float().norm(dim=1)
|
222 |
+
min_focals = 0.25 * diags # diag = 1.2~1.4*max(W,H) => beta >= 1/(2*1.2*tan(fov/2)) ~= 0.26
|
223 |
+
max_focals = 10 * diags
|
224 |
+
|
225 |
+
assert len(mst[1]) == len(pps) - 1
|
226 |
+
|
227 |
+
def make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth):
|
228 |
+
# make intrinsics
|
229 |
+
focals = torch.cat(log_focals).exp().clip(min=min_focals, max=max_focals)
|
230 |
+
pps = torch.stack(pps)
|
231 |
+
K = torch.eye(3, dtype=dtype, device=device)[None].expand(len(imgs), 3, 3).clone()
|
232 |
+
K[:, 0, 0] = K[:, 1, 1] = focals
|
233 |
+
K[:, 0:2, 2] = pps * imsizes
|
234 |
+
if trans is None:
|
235 |
+
return K
|
236 |
+
|
237 |
+
# security! optimization is always trying to crush the scale down
|
238 |
+
sizes = torch.cat(log_sizes).exp()
|
239 |
+
global_scaling = 1 / sizes.min()
|
240 |
+
|
241 |
+
# compute distance of camera to focal plane
|
242 |
+
# tan(fov) = W/2 / focal
|
243 |
+
z_cameras = sizes * median_depths * focals / base_focals
|
244 |
+
|
245 |
+
# make extrinsic
|
246 |
+
rel_cam2cam = torch.eye(4, dtype=dtype, device=device)[None].expand(len(imgs), 4, 4).clone()
|
247 |
+
rel_cam2cam[:, :3, :3] = roma.unitquat_to_rotmat(F.normalize(torch.stack(quats), dim=1))
|
248 |
+
rel_cam2cam[:, :3, 3] = torch.stack(trans)
|
249 |
+
|
250 |
+
# camera are defined as a kinematic chain
|
251 |
+
tmp_cam2w = [None] * len(K)
|
252 |
+
tmp_cam2w[mst[0]] = rel_cam2cam[mst[0]]
|
253 |
+
for i, j in mst[1]:
|
254 |
+
# i is the cam_i_to_world reference, j is the relative pose = cam_j_to_cam_i
|
255 |
+
tmp_cam2w[j] = tmp_cam2w[i] @ rel_cam2cam[j]
|
256 |
+
tmp_cam2w = torch.stack(tmp_cam2w)
|
257 |
+
|
258 |
+
# smart reparameterizaton of cameras
|
259 |
+
trans_offset = z_cameras.unsqueeze(1) * torch.cat((imsizes / focals.unsqueeze(1) * (0.5 - pps), ones), dim=-1)
|
260 |
+
new_trans = global_scaling * (tmp_cam2w[:, :3, 3:4] - tmp_cam2w[:, :3, :3] @ trans_offset.unsqueeze(-1))
|
261 |
+
cam2w = torch.cat((torch.cat((tmp_cam2w[:, :3, :3], new_trans), dim=2),
|
262 |
+
vec0001.view(1, 1, 4).expand(len(K), 1, 4)), dim=1)
|
263 |
+
|
264 |
+
depthmaps = []
|
265 |
+
for i in range(len(imgs)):
|
266 |
+
core_depth_img = core_depth[i]
|
267 |
+
if exp_depth:
|
268 |
+
core_depth_img = core_depth_img.exp()
|
269 |
+
if lora_depth: # compute core_depth as a low-rank decomposition of 3d points
|
270 |
+
core_depth_img = lora_depth_proj[i] @ core_depth_img
|
271 |
+
if depth_mode == 'add':
|
272 |
+
core_depth_img = z_cameras[i] + (core_depth_img - 1) * (median_depths[i] * sizes[i])
|
273 |
+
elif depth_mode == 'mul':
|
274 |
+
core_depth_img = z_cameras[i] * core_depth_img
|
275 |
+
else:
|
276 |
+
raise ValueError(f'Bad {depth_mode=}')
|
277 |
+
depthmaps.append(global_scaling * core_depth_img)
|
278 |
+
|
279 |
+
return K, (inv(cam2w), cam2w), depthmaps
|
280 |
+
|
281 |
+
K = make_K_cam_depth(log_focals, pps, None, None, None, None)
|
282 |
+
|
283 |
+
if shared_intrinsics:
|
284 |
+
print('init focal (shared) = ', to_numpy(K[0, 0, 0]).round(2))
|
285 |
+
else:
|
286 |
+
print('init focals =', to_numpy(K[:, 0, 0]))
|
287 |
+
|
288 |
+
# spectral low-rank projection of depthmaps
|
289 |
+
if lora_depth:
|
290 |
+
core_depth, lora_depth_proj = spectral_projection_of_depthmaps(
|
291 |
+
imgs, K, core_depth, subsample, cache_path=cache_path, **lora_depth)
|
292 |
+
if exp_depth:
|
293 |
+
core_depth = [d.clip(min=1e-4).log() for d in core_depth]
|
294 |
+
core_depth = [nn.Parameter(d.ravel().to(dtype)) for d in core_depth]
|
295 |
+
log_sizes = [nn.Parameter(torch.zeros(1, dtype=dtype, device=device)) for _ in range(len(imgs))]
|
296 |
+
|
297 |
+
# Fetch img slices
|
298 |
+
_, confs_sum, imgs_slices = corres
|
299 |
+
|
300 |
+
# Define which pairs are fine to use with matching
|
301 |
+
def matching_check(x): return x.max() > matching_conf_thr
|
302 |
+
is_matching_ok = {}
|
303 |
+
for s in imgs_slices:
|
304 |
+
is_matching_ok[s.img1, s.img2] = matching_check(s.confs)
|
305 |
+
|
306 |
+
# Prepare slices and corres for losses
|
307 |
+
dust3r_slices = [s for s in imgs_slices if not is_matching_ok[s.img1, s.img2]]
|
308 |
+
loss3d_slices = [s for s in imgs_slices if is_matching_ok[s.img1, s.img2]]
|
309 |
+
cleaned_corres2d = []
|
310 |
+
for cci, (img1, pix1, confs, confsum, imgs_slices) in enumerate(corres2d):
|
311 |
+
cf_sum = 0
|
312 |
+
pix1_filtered = []
|
313 |
+
confs_filtered = []
|
314 |
+
curstep = 0
|
315 |
+
cleaned_slices = []
|
316 |
+
for img2, slice2 in imgs_slices:
|
317 |
+
if is_matching_ok[img1, img2]:
|
318 |
+
tslice = slice(curstep, curstep + slice2.stop - slice2.start, slice2.step)
|
319 |
+
pix1_filtered.append(pix1[tslice])
|
320 |
+
confs_filtered.append(confs[tslice])
|
321 |
+
cleaned_slices.append((img2, slice2))
|
322 |
+
curstep += slice2.stop - slice2.start
|
323 |
+
if pix1_filtered != []:
|
324 |
+
pix1_filtered = torch.cat(pix1_filtered)
|
325 |
+
confs_filtered = torch.cat(confs_filtered)
|
326 |
+
cf_sum = confs_filtered.sum()
|
327 |
+
cleaned_corres2d.append((img1, pix1_filtered, confs_filtered, cf_sum, cleaned_slices))
|
328 |
+
|
329 |
+
def loss_dust3r(cam2w, pts3d, pix_loss):
|
330 |
+
# In the case no correspondence could be established, fallback to DUSt3R GA regression loss formulation (sparsified)
|
331 |
+
loss = 0.
|
332 |
+
cf_sum = 0.
|
333 |
+
for s in dust3r_slices:
|
334 |
+
if init[imgs[s.img1]].get('freeze') and init[imgs[s.img2]].get('freeze'):
|
335 |
+
continue
|
336 |
+
# fallback to dust3r regression
|
337 |
+
tgt_pts, tgt_confs = preds_21[imgs[s.img2]][imgs[s.img1]]
|
338 |
+
tgt_pts = geotrf(cam2w[s.img2], tgt_pts)
|
339 |
+
cf_sum += tgt_confs.sum()
|
340 |
+
loss += tgt_confs @ pix_loss(pts3d[s.img1], tgt_pts)
|
341 |
+
return loss / cf_sum if cf_sum != 0. else 0.
|
342 |
+
|
343 |
+
def loss_3d(K, w2cam, pts3d, pix_loss):
|
344 |
+
# For each correspondence, we have two 3D points (one for each image of the pair).
|
345 |
+
# For each 3D point, we have 2 reproj errors
|
346 |
+
if any(v.get('freeze') for v in init.values()):
|
347 |
+
pts3d_1 = []
|
348 |
+
pts3d_2 = []
|
349 |
+
confs = []
|
350 |
+
for s in loss3d_slices:
|
351 |
+
if init[imgs[s.img1]].get('freeze') and init[imgs[s.img2]].get('freeze'):
|
352 |
+
continue
|
353 |
+
pts3d_1.append(pts3d[s.img1][s.slice1])
|
354 |
+
pts3d_2.append(pts3d[s.img2][s.slice2])
|
355 |
+
confs.append(s.confs)
|
356 |
+
else:
|
357 |
+
pts3d_1 = [pts3d[s.img1][s.slice1] for s in loss3d_slices]
|
358 |
+
pts3d_2 = [pts3d[s.img2][s.slice2] for s in loss3d_slices]
|
359 |
+
confs = [s.confs for s in loss3d_slices]
|
360 |
+
|
361 |
+
if pts3d_1 != []:
|
362 |
+
confs = torch.cat(confs)
|
363 |
+
pts3d_1 = torch.cat(pts3d_1)
|
364 |
+
pts3d_2 = torch.cat(pts3d_2)
|
365 |
+
loss = confs @ pix_loss(pts3d_1, pts3d_2)
|
366 |
+
cf_sum = confs.sum()
|
367 |
+
else:
|
368 |
+
loss = 0.
|
369 |
+
cf_sum = 1.
|
370 |
+
|
371 |
+
return loss / cf_sum
|
372 |
+
|
373 |
+
def loss_2d(K, w2cam, pts3d, pix_loss):
|
374 |
+
# For each correspondence, we have two 3D points (one for each image of the pair).
|
375 |
+
# For each 3D point, we have 2 reproj errors
|
376 |
+
proj_matrix = K @ w2cam[:, :3]
|
377 |
+
loss = npix = 0
|
378 |
+
for img1, pix1_filtered, confs_filtered, cf_sum, cleaned_slices in cleaned_corres2d:
|
379 |
+
if init[imgs[img1]].get('freeze', 0) >= 1:
|
380 |
+
continue # no need
|
381 |
+
pts3d_in_img1 = [pts3d[img2][slice2] for img2, slice2 in cleaned_slices]
|
382 |
+
if pts3d_in_img1 != []:
|
383 |
+
pts3d_in_img1 = torch.cat(pts3d_in_img1)
|
384 |
+
loss += confs_filtered @ pix_loss(pix1_filtered, reproj2d(proj_matrix[img1], pts3d_in_img1))
|
385 |
+
npix += confs_filtered.sum()
|
386 |
+
|
387 |
+
return loss / npix if npix != 0 else 0.
|
388 |
+
|
389 |
+
def optimize_loop(loss_func, lr_base, niter, pix_loss, lr_end=0):
|
390 |
+
# create optimizer
|
391 |
+
params = pps + log_focals + quats + trans + log_sizes + core_depth
|
392 |
+
optimizer = torch.optim.Adam(params, lr=1, weight_decay=0, betas=(0.9, 0.9))
|
393 |
+
ploss = pix_loss if 'meta' in repr(pix_loss) else (lambda a: pix_loss)
|
394 |
+
|
395 |
+
with tqdm(total=niter) as bar:
|
396 |
+
for iter in range(niter or 1):
|
397 |
+
K, (w2cam, cam2w), depthmaps = make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth)
|
398 |
+
pts3d = make_pts3d(anchors, K, cam2w, depthmaps, base_focals=base_focals)
|
399 |
+
if niter == 0:
|
400 |
+
break
|
401 |
+
|
402 |
+
alpha = (iter / niter)
|
403 |
+
lr = schedule(alpha, lr_base, lr_end)
|
404 |
+
adjust_learning_rate_by_lr(optimizer, lr)
|
405 |
+
pix_loss = ploss(1 - alpha)
|
406 |
+
optimizer.zero_grad()
|
407 |
+
loss = loss_func(K, w2cam, pts3d, pix_loss) + loss_dust3r_w * loss_dust3r(cam2w, pts3d, lossd)
|
408 |
+
loss.backward()
|
409 |
+
optimizer.step()
|
410 |
+
|
411 |
+
# make sure the pose remains well optimizable
|
412 |
+
for i in range(len(imgs)):
|
413 |
+
quats[i].data[:] /= quats[i].data.norm()
|
414 |
+
|
415 |
+
loss = float(loss)
|
416 |
+
if loss != loss:
|
417 |
+
break # NaN loss
|
418 |
+
bar.set_postfix_str(f'{lr=:.4f}, {loss=:.3f}')
|
419 |
+
bar.update(1)
|
420 |
+
|
421 |
+
if niter:
|
422 |
+
print(f'>> final loss = {loss}')
|
423 |
+
return dict(intrinsics=K.detach(), cam2w=cam2w.detach(),
|
424 |
+
depthmaps=[d.detach() for d in depthmaps], pts3d=[p.detach() for p in pts3d])
|
425 |
+
|
426 |
+
# at start, don't optimize 3d points
|
427 |
+
for i, img in enumerate(imgs):
|
428 |
+
trainable = not (init[img].get('freeze'))
|
429 |
+
pps[i].requires_grad_(False)
|
430 |
+
log_focals[i].requires_grad_(False)
|
431 |
+
quats[i].requires_grad_(trainable)
|
432 |
+
trans[i].requires_grad_(trainable)
|
433 |
+
log_sizes[i].requires_grad_(trainable)
|
434 |
+
core_depth[i].requires_grad_(False)
|
435 |
+
|
436 |
+
res_coarse = optimize_loop(loss_3d, lr_base=lr1, niter=niter1, pix_loss=loss1)
|
437 |
+
|
438 |
+
res_fine = None
|
439 |
+
if niter2:
|
440 |
+
# now we can optimize 3d points
|
441 |
+
for i, img in enumerate(imgs):
|
442 |
+
if init[img].get('freeze', 0) >= 1:
|
443 |
+
continue
|
444 |
+
pps[i].requires_grad_(bool(opt_pp))
|
445 |
+
log_focals[i].requires_grad_(True)
|
446 |
+
core_depth[i].requires_grad_(opt_depth)
|
447 |
+
|
448 |
+
# refinement with 2d reproj
|
449 |
+
res_fine = optimize_loop(loss_2d, lr_base=lr2, niter=niter2, pix_loss=loss2)
|
450 |
+
|
451 |
+
K = make_K_cam_depth(log_focals, pps, None, None, None, None)
|
452 |
+
if shared_intrinsics:
|
453 |
+
print('Final focal (shared) = ', to_numpy(K[0, 0, 0]).round(2))
|
454 |
+
else:
|
455 |
+
print('Final focals =', to_numpy(K[:, 0, 0]))
|
456 |
+
|
457 |
+
return imgs, res_coarse, res_fine
|
458 |
+
|
459 |
+
|
460 |
+
@lru_cache
|
461 |
+
def mask110(device, dtype):
|
462 |
+
return torch.tensor((1, 1, 0), device=device, dtype=dtype)
|
463 |
+
|
464 |
+
|
465 |
+
def proj3d(inv_K, pixels, z):
|
466 |
+
if pixels.shape[-1] == 2:
|
467 |
+
pixels = torch.cat((pixels, torch.ones_like(pixels[..., :1])), dim=-1)
|
468 |
+
return z.unsqueeze(-1) * (pixels * inv_K.diag() + inv_K[:, 2] * mask110(z.device, z.dtype))
|
469 |
+
|
470 |
+
|
471 |
+
def make_pts3d(anchors, K, cam2w, depthmaps, base_focals=None, ret_depth=False):
|
472 |
+
focals = K[:, 0, 0]
|
473 |
+
invK = inv(K)
|
474 |
+
all_pts3d = []
|
475 |
+
depth_out = []
|
476 |
+
|
477 |
+
for img, (pixels, idxs, offsets) in anchors.items():
|
478 |
+
# from depthmaps to 3d points
|
479 |
+
if base_focals is None:
|
480 |
+
pass
|
481 |
+
else:
|
482 |
+
# compensate for focal
|
483 |
+
# depth + depth * (offset - 1) * base_focal / focal
|
484 |
+
# = depth * (1 + (offset - 1) * (base_focal / focal))
|
485 |
+
offsets = 1 + (offsets - 1) * (base_focals[img] / focals[img])
|
486 |
+
|
487 |
+
pts3d = proj3d(invK[img], pixels, depthmaps[img][idxs] * offsets)
|
488 |
+
if ret_depth:
|
489 |
+
depth_out.append(pts3d[..., 2]) # before camera rotation
|
490 |
+
# rotate to world coordinate
|
491 |
+
pts3d = geotrf(cam2w[img], pts3d)
|
492 |
+
all_pts3d.append(pts3d)
|
493 |
+
|
494 |
+
if ret_depth:
|
495 |
+
return all_pts3d, depth_out
|
496 |
+
return all_pts3d
|
497 |
+
|
498 |
+
|
499 |
+
def make_dense_pts3d(intrinsics, cam2w, depthmaps, canonical_paths, subsample, device='cuda'):
|
500 |
+
base_focals = []
|
501 |
+
anchors = {}
|
502 |
+
confs = []
|
503 |
+
for i, canon_path in enumerate(canonical_paths):
|
504 |
+
(canon, canon2, conf), focal = torch.load(canon_path, map_location=device)
|
505 |
+
confs.append(conf)
|
506 |
+
base_focals.append(focal)
|
507 |
+
H, W = conf.shape
|
508 |
+
pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device)
|
509 |
+
idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample)
|
510 |
+
anchors[i] = (pixels, idxs[i], offsets[i])
|
511 |
+
|
512 |
+
# densify sparse depthmaps
|
513 |
+
pts3d, depthmaps_out = make_pts3d(anchors, intrinsics, cam2w, [
|
514 |
+
d.ravel() for d in depthmaps], base_focals=base_focals, ret_depth=True)
|
515 |
+
|
516 |
+
return pts3d, depthmaps_out, confs
|
517 |
+
|
518 |
+
|
519 |
+
@torch.no_grad()
|
520 |
+
def forward_mast3r(pairs, model, cache_path, desc_conf='desc_conf',
|
521 |
+
device='cuda', subsample=8, **matching_kw):
|
522 |
+
res_paths = {}
|
523 |
+
|
524 |
+
for img1, img2 in tqdm(pairs):
|
525 |
+
idx1 = hash_md5(img1['instance'])
|
526 |
+
idx2 = hash_md5(img2['instance'])
|
527 |
+
|
528 |
+
path1 = cache_path + f'/forward/{idx1}/{idx2}.pth'
|
529 |
+
path2 = cache_path + f'/forward/{idx2}/{idx1}.pth'
|
530 |
+
path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx1}-{idx2}.pth'
|
531 |
+
path_corres2 = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx2}-{idx1}.pth'
|
532 |
+
|
533 |
+
if os.path.isfile(path_corres2) and not os.path.isfile(path_corres):
|
534 |
+
score, (xy1, xy2, confs) = torch.load(path_corres2)
|
535 |
+
torch.save((score, (xy2, xy1, confs)), path_corres)
|
536 |
+
|
537 |
+
if not all(os.path.isfile(p) for p in (path1, path2, path_corres)):
|
538 |
+
if model is None:
|
539 |
+
continue
|
540 |
+
res = symmetric_inference(model, img1, img2, device=device)
|
541 |
+
X11, X21, X22, X12 = [r['pts3d'][0] for r in res]
|
542 |
+
C11, C21, C22, C12 = [r['conf'][0] for r in res]
|
543 |
+
descs = [r['desc'][0] for r in res]
|
544 |
+
qonfs = [r[desc_conf][0] for r in res]
|
545 |
+
|
546 |
+
# save
|
547 |
+
torch.save(to_cpu((X11, C11, X21, C21)), mkdir_for(path1))
|
548 |
+
torch.save(to_cpu((X22, C22, X12, C12)), mkdir_for(path2))
|
549 |
+
|
550 |
+
# perform reciprocal matching
|
551 |
+
corres = extract_correspondences(descs, qonfs, device=device, subsample=subsample)
|
552 |
+
|
553 |
+
conf_score = (C11.mean() * C12.mean() * C21.mean() * C22.mean()).sqrt().sqrt()
|
554 |
+
matching_score = (float(conf_score), float(corres[2].sum()), len(corres[2]))
|
555 |
+
if cache_path is not None:
|
556 |
+
torch.save((matching_score, corres), mkdir_for(path_corres))
|
557 |
+
|
558 |
+
res_paths[img1['instance'], img2['instance']] = (path1, path2), path_corres
|
559 |
+
|
560 |
+
del model
|
561 |
+
torch.cuda.empty_cache()
|
562 |
+
|
563 |
+
return res_paths, cache_path
|
564 |
+
|
565 |
+
|
566 |
+
def symmetric_inference(model, img1, img2, device):
|
567 |
+
shape1 = torch.from_numpy(img1['true_shape']).to(device, non_blocking=True)
|
568 |
+
shape2 = torch.from_numpy(img2['true_shape']).to(device, non_blocking=True)
|
569 |
+
img1 = img1['img'].to(device, non_blocking=True)
|
570 |
+
img2 = img2['img'].to(device, non_blocking=True)
|
571 |
+
|
572 |
+
# compute encoder only once
|
573 |
+
feat1, feat2, pos1, pos2 = model._encode_image_pairs(img1, img2, shape1, shape2)
|
574 |
+
|
575 |
+
def decoder(feat1, feat2, pos1, pos2, shape1, shape2):
|
576 |
+
dec1, dec2 = model._decoder(feat1, pos1, feat2, pos2)
|
577 |
+
with torch.cuda.amp.autocast(enabled=False):
|
578 |
+
res1 = model._downstream_head(1, [tok.float() for tok in dec1], shape1)
|
579 |
+
res2 = model._downstream_head(2, [tok.float() for tok in dec2], shape2)
|
580 |
+
return res1, res2
|
581 |
+
|
582 |
+
# decoder 1-2
|
583 |
+
res11, res21 = decoder(feat1, feat2, pos1, pos2, shape1, shape2)
|
584 |
+
# decoder 2-1
|
585 |
+
res22, res12 = decoder(feat2, feat1, pos2, pos1, shape2, shape1)
|
586 |
+
|
587 |
+
return (res11, res21, res22, res12)
|
588 |
+
|
589 |
+
|
590 |
+
def extract_correspondences(feats, qonfs, subsample=8, device=None, ptmap_key='pred_desc'):
|
591 |
+
feat11, feat21, feat22, feat12 = feats
|
592 |
+
qonf11, qonf21, qonf22, qonf12 = qonfs
|
593 |
+
assert feat11.shape[:2] == feat12.shape[:2] == qonf11.shape == qonf12.shape
|
594 |
+
assert feat21.shape[:2] == feat22.shape[:2] == qonf21.shape == qonf22.shape
|
595 |
+
|
596 |
+
if '3d' in ptmap_key:
|
597 |
+
opt = dict(device='cpu', workers=32)
|
598 |
+
else:
|
599 |
+
opt = dict(device=device, dist='dot', block_size=2**13)
|
600 |
+
|
601 |
+
# matching the two pairs
|
602 |
+
idx1 = []
|
603 |
+
idx2 = []
|
604 |
+
qonf1 = []
|
605 |
+
qonf2 = []
|
606 |
+
# TODO add non symmetric / pixel_tol options
|
607 |
+
for A, B, QA, QB in [(feat11, feat21, qonf11.cpu(), qonf21.cpu()),
|
608 |
+
(feat12, feat22, qonf12.cpu(), qonf22.cpu())]:
|
609 |
+
nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt)
|
610 |
+
nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt)
|
611 |
+
|
612 |
+
idx1.append(np.r_[nn1to2[0], nn2to1[1]])
|
613 |
+
idx2.append(np.r_[nn1to2[1], nn2to1[0]])
|
614 |
+
qonf1.append(QA.ravel()[idx1[-1]])
|
615 |
+
qonf2.append(QB.ravel()[idx2[-1]])
|
616 |
+
|
617 |
+
# merge corres from opposite pairs
|
618 |
+
H1, W1 = feat11.shape[:2]
|
619 |
+
H2, W2 = feat22.shape[:2]
|
620 |
+
cat = np.concatenate
|
621 |
+
|
622 |
+
xy1, xy2, idx = merge_corres(cat(idx1), cat(idx2), (H1, W1), (H2, W2), ret_xy=True, ret_index=True)
|
623 |
+
corres = (xy1.copy(), xy2.copy(), np.sqrt(cat(qonf1)[idx] * cat(qonf2)[idx]))
|
624 |
+
|
625 |
+
return todevice(corres, device)
|
626 |
+
|
627 |
+
|
628 |
+
@torch.no_grad()
|
629 |
+
def prepare_canonical_data(imgs, tmp_pairs, subsample, order_imgs=False, min_conf_thr=0,
|
630 |
+
cache_path=None, device='cuda', **kw):
|
631 |
+
canonical_views = {}
|
632 |
+
pairwise_scores = torch.zeros((len(imgs), len(imgs)), device=device)
|
633 |
+
canonical_paths = []
|
634 |
+
preds_21 = {}
|
635 |
+
|
636 |
+
for img in tqdm(imgs):
|
637 |
+
if cache_path:
|
638 |
+
cache = os.path.join(cache_path, 'canon_views', hash_md5(img) + f'_{subsample=}_{kw=}.pth')
|
639 |
+
canonical_paths.append(cache)
|
640 |
+
try:
|
641 |
+
(canon, canon2, cconf), focal = torch.load(cache, map_location=device)
|
642 |
+
except IOError:
|
643 |
+
# cache does not exist yet, we create it!
|
644 |
+
canon = focal = None
|
645 |
+
|
646 |
+
# collect all pred1
|
647 |
+
n_pairs = sum((img in pair) for pair in tmp_pairs)
|
648 |
+
|
649 |
+
ptmaps11 = None
|
650 |
+
pixels = {}
|
651 |
+
n = 0
|
652 |
+
for (img1, img2), ((path1, path2), path_corres) in tmp_pairs.items():
|
653 |
+
score = None
|
654 |
+
if img == img1:
|
655 |
+
X, C, X2, C2 = torch.load(path1, map_location=device)
|
656 |
+
score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr)
|
657 |
+
pixels[img2] = xy1, confs
|
658 |
+
if img not in preds_21:
|
659 |
+
preds_21[img] = {}
|
660 |
+
# Subsample preds_21
|
661 |
+
preds_21[img][img2] = X2[::subsample, ::subsample].reshape(-1, 3), C2[::subsample, ::subsample].ravel()
|
662 |
+
|
663 |
+
if img == img2:
|
664 |
+
X, C, X2, C2 = torch.load(path2, map_location=device)
|
665 |
+
score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr)
|
666 |
+
pixels[img1] = xy2, confs
|
667 |
+
if img not in preds_21:
|
668 |
+
preds_21[img] = {}
|
669 |
+
preds_21[img][img1] = X2[::subsample, ::subsample].reshape(-1, 3), C2[::subsample, ::subsample].ravel()
|
670 |
+
|
671 |
+
if score is not None:
|
672 |
+
i, j = imgs.index(img1), imgs.index(img2)
|
673 |
+
# score = score[0]
|
674 |
+
# score = np.log1p(score[2])
|
675 |
+
score = score[2]
|
676 |
+
pairwise_scores[i, j] = score
|
677 |
+
pairwise_scores[j, i] = score
|
678 |
+
|
679 |
+
if canon is not None:
|
680 |
+
continue
|
681 |
+
if ptmaps11 is None:
|
682 |
+
H, W = C.shape
|
683 |
+
ptmaps11 = torch.empty((n_pairs, H, W, 3), device=device)
|
684 |
+
confs11 = torch.empty((n_pairs, H, W), device=device)
|
685 |
+
|
686 |
+
ptmaps11[n] = X
|
687 |
+
confs11[n] = C
|
688 |
+
n += 1
|
689 |
+
|
690 |
+
if canon is None:
|
691 |
+
canon, canon2, cconf = canonical_view(ptmaps11, confs11, subsample, **kw)
|
692 |
+
del ptmaps11
|
693 |
+
del confs11
|
694 |
+
|
695 |
+
# compute focals
|
696 |
+
H, W = canon.shape[:2]
|
697 |
+
pp = torch.tensor([W / 2, H / 2], device=device)
|
698 |
+
if focal is None:
|
699 |
+
focal = estimate_focal_knowing_depth(canon[None], pp, focal_mode='weiszfeld', min_focal=0.5, max_focal=3.5)
|
700 |
+
if cache:
|
701 |
+
torch.save(to_cpu(((canon, canon2, cconf), focal)), mkdir_for(cache))
|
702 |
+
|
703 |
+
# extract depth offsets with correspondences
|
704 |
+
core_depth = canon[subsample // 2::subsample, subsample // 2::subsample, 2]
|
705 |
+
idxs, offsets = anchor_depth_offsets(canon2, pixels, subsample=subsample)
|
706 |
+
|
707 |
+
canonical_views[img] = (pp, (H, W), focal.view(1), core_depth, pixels, idxs, offsets)
|
708 |
+
|
709 |
+
return tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21
|
710 |
+
|
711 |
+
|
712 |
+
def load_corres(path_corres, device, min_conf_thr):
|
713 |
+
score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device)
|
714 |
+
valid = confs > min_conf_thr if min_conf_thr else slice(None)
|
715 |
+
# valid = (xy1 > 0).all(dim=1) & (xy2 > 0).all(dim=1) & (xy1 < 512).all(dim=1) & (xy2 < 512).all(dim=1)
|
716 |
+
# print(f'keeping {valid.sum()} / {len(valid)} correspondences')
|
717 |
+
return score, (xy1[valid], xy2[valid], confs[valid])
|
718 |
+
|
719 |
+
|
720 |
+
PairOfSlices = namedtuple(
|
721 |
+
'ImgPair', 'img1, slice1, pix1, anchor_idxs1, img2, slice2, pix2, anchor_idxs2, confs, confs_sum')
|
722 |
+
|
723 |
+
|
724 |
+
def condense_data(imgs, tmp_paths, canonical_views, preds_21, dtype=torch.float32):
|
725 |
+
# aggregate all data properly
|
726 |
+
set_imgs = set(imgs)
|
727 |
+
|
728 |
+
principal_points = []
|
729 |
+
shapes = []
|
730 |
+
focals = []
|
731 |
+
core_depth = []
|
732 |
+
img_anchors = {}
|
733 |
+
tmp_pixels = {}
|
734 |
+
|
735 |
+
for idx1, img1 in enumerate(imgs):
|
736 |
+
# load stuff
|
737 |
+
pp, shape, focal, anchors, pixels_confs, idxs, offsets = canonical_views[img1]
|
738 |
+
|
739 |
+
principal_points.append(pp)
|
740 |
+
shapes.append(shape)
|
741 |
+
focals.append(focal)
|
742 |
+
core_depth.append(anchors)
|
743 |
+
|
744 |
+
img_uv1 = []
|
745 |
+
img_idxs = []
|
746 |
+
img_offs = []
|
747 |
+
cur_n = [0]
|
748 |
+
|
749 |
+
for img2, (pixels, match_confs) in pixels_confs.items():
|
750 |
+
if img2 not in set_imgs:
|
751 |
+
continue
|
752 |
+
assert len(pixels) == len(idxs[img2]) == len(offsets[img2])
|
753 |
+
img_uv1.append(torch.cat((pixels, torch.ones_like(pixels[:, :1])), dim=-1))
|
754 |
+
img_idxs.append(idxs[img2])
|
755 |
+
img_offs.append(offsets[img2])
|
756 |
+
cur_n.append(cur_n[-1] + len(pixels))
|
757 |
+
# store the position of 3d points
|
758 |
+
tmp_pixels[img1, img2] = pixels.to(dtype), match_confs.to(dtype), slice(*cur_n[-2:])
|
759 |
+
img_anchors[idx1] = (torch.cat(img_uv1), torch.cat(img_idxs), torch.cat(img_offs))
|
760 |
+
|
761 |
+
all_confs = []
|
762 |
+
imgs_slices = []
|
763 |
+
corres2d = {img: [] for img in range(len(imgs))}
|
764 |
+
|
765 |
+
for img1, img2 in tmp_paths:
|
766 |
+
try:
|
767 |
+
pix1, confs1, slice1 = tmp_pixels[img1, img2]
|
768 |
+
pix2, confs2, slice2 = tmp_pixels[img2, img1]
|
769 |
+
except KeyError:
|
770 |
+
continue
|
771 |
+
img1 = imgs.index(img1)
|
772 |
+
img2 = imgs.index(img2)
|
773 |
+
confs = (confs1 * confs2).sqrt()
|
774 |
+
|
775 |
+
# prepare for loss_3d
|
776 |
+
all_confs.append(confs)
|
777 |
+
anchor_idxs1 = canonical_views[imgs[img1]][5][imgs[img2]]
|
778 |
+
anchor_idxs2 = canonical_views[imgs[img2]][5][imgs[img1]]
|
779 |
+
imgs_slices.append(PairOfSlices(img1, slice1, pix1, anchor_idxs1,
|
780 |
+
img2, slice2, pix2, anchor_idxs2,
|
781 |
+
confs, float(confs.sum())))
|
782 |
+
|
783 |
+
# prepare for loss_2d
|
784 |
+
corres2d[img1].append((pix1, confs, img2, slice2))
|
785 |
+
corres2d[img2].append((pix2, confs, img1, slice1))
|
786 |
+
|
787 |
+
all_confs = torch.cat(all_confs)
|
788 |
+
corres = (all_confs, float(all_confs.sum()), imgs_slices)
|
789 |
+
|
790 |
+
def aggreg_matches(img1, list_matches):
|
791 |
+
pix1, confs, img2, slice2 = zip(*list_matches)
|
792 |
+
all_pix1 = torch.cat(pix1).to(dtype)
|
793 |
+
all_confs = torch.cat(confs).to(dtype)
|
794 |
+
return img1, all_pix1, all_confs, float(all_confs.sum()), [(j, sl2) for j, sl2 in zip(img2, slice2)]
|
795 |
+
corres2d = [aggreg_matches(img, m) for img, m in corres2d.items()]
|
796 |
+
|
797 |
+
imsizes = torch.tensor([(W, H) for H, W in shapes], device=pp.device) # (W,H)
|
798 |
+
principal_points = torch.stack(principal_points)
|
799 |
+
focals = torch.cat(focals)
|
800 |
+
|
801 |
+
# Subsample preds_21
|
802 |
+
subsamp_preds_21 = {}
|
803 |
+
for imk, imv in preds_21.items():
|
804 |
+
subsamp_preds_21[imk] = {}
|
805 |
+
for im2k, (pred, conf) in preds_21[imk].items():
|
806 |
+
idxs = img_anchors[imgs.index(im2k)][1]
|
807 |
+
subsamp_preds_21[imk][im2k] = (pred[idxs], conf[idxs]) # anchors subsample
|
808 |
+
|
809 |
+
return imsizes, principal_points, focals, core_depth, img_anchors, corres, corres2d, subsamp_preds_21
|
810 |
+
|
811 |
+
|
812 |
+
def canonical_view(ptmaps11, confs11, subsample, mode='avg-angle'):
|
813 |
+
assert len(ptmaps11) == len(confs11) > 0, 'not a single view1 for img={i}'
|
814 |
+
|
815 |
+
# canonical pointmap is just a weighted average
|
816 |
+
confs11 = confs11.unsqueeze(-1) - 0.999
|
817 |
+
canon = (confs11 * ptmaps11).sum(0) / confs11.sum(0)
|
818 |
+
|
819 |
+
canon_depth = ptmaps11[..., 2].unsqueeze(1)
|
820 |
+
S = slice(subsample // 2, None, subsample)
|
821 |
+
center_depth = canon_depth[:, :, S, S]
|
822 |
+
center_depth = torch.clip(center_depth, min=torch.finfo(center_depth.dtype).eps)
|
823 |
+
|
824 |
+
stacked_depth = F.pixel_unshuffle(canon_depth, subsample)
|
825 |
+
stacked_confs = F.pixel_unshuffle(confs11[:, None, :, :, 0], subsample)
|
826 |
+
|
827 |
+
if mode == 'avg-reldepth':
|
828 |
+
rel_depth = stacked_depth / center_depth
|
829 |
+
stacked_canon = (stacked_confs * rel_depth).sum(dim=0) / stacked_confs.sum(dim=0)
|
830 |
+
canon2 = F.pixel_shuffle(stacked_canon.unsqueeze(0), subsample).squeeze()
|
831 |
+
|
832 |
+
elif mode == 'avg-angle':
|
833 |
+
xy = ptmaps11[..., 0:2].permute(0, 3, 1, 2)
|
834 |
+
stacked_xy = F.pixel_unshuffle(xy, subsample)
|
835 |
+
B, _, H, W = stacked_xy.shape
|
836 |
+
stacked_radius = (stacked_xy.view(B, 2, -1, H, W) - xy[:, :, None, S, S]).norm(dim=1)
|
837 |
+
stacked_radius.clip_(min=1e-8)
|
838 |
+
|
839 |
+
stacked_angle = torch.arctan((stacked_depth - center_depth) / stacked_radius)
|
840 |
+
avg_angle = (stacked_confs * stacked_angle).sum(dim=0) / stacked_confs.sum(dim=0)
|
841 |
+
|
842 |
+
# back to depth
|
843 |
+
stacked_depth = stacked_radius.mean(dim=0) * torch.tan(avg_angle)
|
844 |
+
|
845 |
+
canon2 = F.pixel_shuffle((1 + stacked_depth / canon[S, S, 2]).unsqueeze(0), subsample).squeeze()
|
846 |
+
else:
|
847 |
+
raise ValueError(f'bad {mode=}')
|
848 |
+
|
849 |
+
confs = (confs11.square().sum(dim=0) / confs11.sum(dim=0)).squeeze()
|
850 |
+
return canon, canon2, confs
|
851 |
+
|
852 |
+
|
853 |
+
def anchor_depth_offsets(canon_depth, pixels, subsample=8):
|
854 |
+
device = canon_depth.device
|
855 |
+
|
856 |
+
# create a 2D grid of anchor 3D points
|
857 |
+
H1, W1 = canon_depth.shape
|
858 |
+
yx = np.mgrid[subsample // 2:H1:subsample, subsample // 2:W1:subsample]
|
859 |
+
H2, W2 = yx.shape[1:]
|
860 |
+
cy, cx = yx.reshape(2, -1)
|
861 |
+
core_depth = canon_depth[cy, cx]
|
862 |
+
assert (core_depth > 0).all()
|
863 |
+
|
864 |
+
# slave 3d points (attached to core 3d points)
|
865 |
+
core_idxs = {} # core_idxs[img2] = {corr_idx:core_idx}
|
866 |
+
core_offs = {} # core_offs[img2] = {corr_idx:3d_offset}
|
867 |
+
|
868 |
+
for img2, (xy1, _confs) in pixels.items():
|
869 |
+
px, py = xy1.long().T
|
870 |
+
|
871 |
+
# find nearest anchor == block quantization
|
872 |
+
core_idx = (py // subsample) * W2 + (px // subsample)
|
873 |
+
core_idxs[img2] = core_idx.to(device)
|
874 |
+
|
875 |
+
# compute relative depth offsets w.r.t. anchors
|
876 |
+
ref_z = core_depth[core_idx]
|
877 |
+
pts_z = canon_depth[py, px]
|
878 |
+
offset = pts_z / ref_z
|
879 |
+
core_offs[img2] = offset.detach().to(device)
|
880 |
+
|
881 |
+
return core_idxs, core_offs
|
882 |
+
|
883 |
+
|
884 |
+
def spectral_clustering(graph, k=None, normalized_cuts=False):
|
885 |
+
graph.fill_diagonal_(0)
|
886 |
+
|
887 |
+
# graph laplacian
|
888 |
+
degrees = graph.sum(dim=-1)
|
889 |
+
laplacian = torch.diag(degrees) - graph
|
890 |
+
if normalized_cuts:
|
891 |
+
i_inv = torch.diag(degrees.sqrt().reciprocal())
|
892 |
+
laplacian = i_inv @ laplacian @ i_inv
|
893 |
+
|
894 |
+
# compute eigenvectors!
|
895 |
+
eigval, eigvec = torch.linalg.eigh(laplacian)
|
896 |
+
return eigval[:k], eigvec[:, :k]
|
897 |
+
|
898 |
+
|
899 |
+
def sim_func(p1, p2, gamma):
|
900 |
+
diff = (p1 - p2).norm(dim=-1)
|
901 |
+
avg_depth = (p1[:, :, 2] + p2[:, :, 2])
|
902 |
+
rel_distance = diff / avg_depth
|
903 |
+
sim = torch.exp(-gamma * rel_distance.square())
|
904 |
+
return sim
|
905 |
+
|
906 |
+
|
907 |
+
def backproj(K, depthmap, subsample):
|
908 |
+
H, W = depthmap.shape
|
909 |
+
uv = np.mgrid[subsample // 2:subsample * W:subsample, subsample // 2:subsample * H:subsample].T.reshape(H, W, 2)
|
910 |
+
xyz = depthmap.unsqueeze(-1) * geotrf(inv(K), todevice(uv, K.device), ncol=3)
|
911 |
+
return xyz
|
912 |
+
|
913 |
+
|
914 |
+
def spectral_projection_depth(K, depthmap, subsample, k=64, cache_path='',
|
915 |
+
normalized_cuts=True, gamma=7, min_norm=5):
|
916 |
+
try:
|
917 |
+
if cache_path:
|
918 |
+
cache_path = cache_path + f'_{k=}_norm={normalized_cuts}_{gamma=}.pth'
|
919 |
+
lora_proj = torch.load(cache_path, map_location=K.device)
|
920 |
+
|
921 |
+
except IOError:
|
922 |
+
# reconstruct 3d points in camera coordinates
|
923 |
+
xyz = backproj(K, depthmap, subsample)
|
924 |
+
|
925 |
+
# compute all distances
|
926 |
+
xyz = xyz.reshape(-1, 3)
|
927 |
+
graph = sim_func(xyz[:, None], xyz[None, :], gamma=gamma)
|
928 |
+
_, lora_proj = spectral_clustering(graph, k, normalized_cuts=normalized_cuts)
|
929 |
+
|
930 |
+
if cache_path:
|
931 |
+
torch.save(lora_proj.cpu(), mkdir_for(cache_path))
|
932 |
+
|
933 |
+
lora_proj, coeffs = lora_encode_normed(lora_proj, depthmap.ravel(), min_norm=min_norm)
|
934 |
+
|
935 |
+
# depthmap ~= lora_proj @ coeffs
|
936 |
+
return coeffs, lora_proj
|
937 |
+
|
938 |
+
|
939 |
+
def lora_encode_normed(lora_proj, x, min_norm, global_norm=False):
|
940 |
+
# encode the pointmap
|
941 |
+
coeffs = torch.linalg.pinv(lora_proj) @ x
|
942 |
+
|
943 |
+
# rectify the norm of basis vector to be ~ equal
|
944 |
+
if coeffs.ndim == 1:
|
945 |
+
coeffs = coeffs[:, None]
|
946 |
+
if global_norm:
|
947 |
+
lora_proj *= coeffs[1:].norm() * min_norm / coeffs.shape[1]
|
948 |
+
elif min_norm:
|
949 |
+
lora_proj *= coeffs.norm(dim=1).clip(min=min_norm)
|
950 |
+
# can have rounding errors here!
|
951 |
+
coeffs = (torch.linalg.pinv(lora_proj.double()) @ x.double()).float()
|
952 |
+
|
953 |
+
return lora_proj.detach(), coeffs.detach()
|
954 |
+
|
955 |
+
|
956 |
+
@torch.no_grad()
|
957 |
+
def spectral_projection_of_depthmaps(imgs, intrinsics, depthmaps, subsample, cache_path=None, **kw):
|
958 |
+
# recover 3d points
|
959 |
+
core_depth = []
|
960 |
+
lora_proj = []
|
961 |
+
|
962 |
+
for i, img in enumerate(tqdm(imgs)):
|
963 |
+
cache = os.path.join(cache_path, 'lora_depth', hash_md5(img)) if cache_path else None
|
964 |
+
depth, proj = spectral_projection_depth(intrinsics[i], depthmaps[i], subsample,
|
965 |
+
cache_path=cache, **kw)
|
966 |
+
core_depth.append(depth)
|
967 |
+
lora_proj.append(proj)
|
968 |
+
|
969 |
+
return core_depth, lora_proj
|
970 |
+
|
971 |
+
|
972 |
+
def reproj2d(Trf, pts3d):
|
973 |
+
res = (pts3d @ Trf[:3, :3].transpose(-1, -2)) + Trf[:3, 3]
|
974 |
+
clipped_z = res[:, 2:3].clip(min=1e-3) # make sure we don't have nans!
|
975 |
+
uv = res[:, 0:2] / clipped_z
|
976 |
+
return uv.clip(min=-1000, max=2000)
|
977 |
+
|
978 |
+
|
979 |
+
def bfs(tree, start_node):
|
980 |
+
order, predecessors = sp.csgraph.breadth_first_order(tree, start_node, directed=False)
|
981 |
+
ranks = np.arange(len(order))
|
982 |
+
ranks[order] = ranks.copy()
|
983 |
+
return ranks, predecessors
|
984 |
+
|
985 |
+
|
986 |
+
def compute_min_spanning_tree(pws):
|
987 |
+
sparse_graph = sp.dok_array(pws.shape)
|
988 |
+
for i, j in pws.nonzero().cpu().tolist():
|
989 |
+
sparse_graph[i, j] = -float(pws[i, j])
|
990 |
+
msp = sp.csgraph.minimum_spanning_tree(sparse_graph)
|
991 |
+
|
992 |
+
# now reorder the oriented edges, starting from the central point
|
993 |
+
ranks1, _ = bfs(msp, 0)
|
994 |
+
ranks2, _ = bfs(msp, ranks1.argmax())
|
995 |
+
ranks1, _ = bfs(msp, ranks2.argmax())
|
996 |
+
# this is the point farthest from any leaf
|
997 |
+
root = np.minimum(ranks1, ranks2).argmax()
|
998 |
+
|
999 |
+
# find the ordered list of edges that describe the tree
|
1000 |
+
order, predecessors = sp.csgraph.breadth_first_order(msp, root, directed=False)
|
1001 |
+
order = order[1:] # the root does not have a predecessor
|
1002 |
+
edges = [(predecessors[i], i) for i in order]
|
1003 |
+
|
1004 |
+
return root, edges
|
1005 |
+
|
1006 |
+
|
1007 |
+
def show_reconstruction(shapes_or_imgs, K, cam2w, pts3d, gt_cam2w=None, gt_K=None, cam_size=None, masks=None, **kw):
|
1008 |
+
viz = SceneViz()
|
1009 |
+
|
1010 |
+
cc = cam2w[:, :3, 3]
|
1011 |
+
cs = cam_size or float(torch.cdist(cc, cc).fill_diagonal_(np.inf).min(dim=0).values.median())
|
1012 |
+
colors = 64 + np.random.randint(255 - 64, size=(len(cam2w), 3))
|
1013 |
+
|
1014 |
+
if isinstance(shapes_or_imgs, np.ndarray) and shapes_or_imgs.ndim == 2:
|
1015 |
+
cam_kws = dict(imsizes=shapes_or_imgs[:, ::-1], cam_size=cs)
|
1016 |
+
else:
|
1017 |
+
imgs = shapes_or_imgs
|
1018 |
+
cam_kws = dict(images=imgs, cam_size=cs)
|
1019 |
+
if K is not None:
|
1020 |
+
viz.add_cameras(to_numpy(cam2w), to_numpy(K), colors=colors, **cam_kws)
|
1021 |
+
|
1022 |
+
if gt_cam2w is not None:
|
1023 |
+
if gt_K is None:
|
1024 |
+
gt_K = K
|
1025 |
+
viz.add_cameras(to_numpy(gt_cam2w), to_numpy(gt_K), colors=colors, marker='o', **cam_kws)
|
1026 |
+
|
1027 |
+
if pts3d is not None:
|
1028 |
+
for i, p in enumerate(pts3d):
|
1029 |
+
if not len(p):
|
1030 |
+
continue
|
1031 |
+
if masks is None:
|
1032 |
+
viz.add_pointcloud(to_numpy(p), color=tuple(colors[i].tolist()))
|
1033 |
+
else:
|
1034 |
+
viz.add_pointcloud(to_numpy(p), mask=masks[i], color=imgs[i])
|
1035 |
+
viz.show(**kw)
|
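For readers skimming the diff: the low-rank depth encoding above boils down to approximating each depthmap as `lora_proj @ coeffs`, with coefficients obtained from the pseudo-inverse of the basis (see `lora_encode_normed`). A minimal standalone sketch with toy data, not part of the repository:

import torch

N, k = 256, 16                                  # toy signal length and basis size
lora_proj = torch.randn(N, k)                   # stand-in for the spectral basis
depth = torch.rand(N)                           # stand-in for a flattened depthmap

coeffs = torch.linalg.pinv(lora_proj) @ depth   # encode (least-squares fit)
recon = lora_proj @ coeffs                      # decode: depthmap ~= lora_proj @ coeffs
print(float((recon - depth).abs().mean()))      # residual of the low-rank approximation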
mast3r/cloud_opt/triangulation.py
ADDED
@@ -0,0 +1,80 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# Matches Triangulation Utils
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
|
11 |
+
# Batched Matches Triangulation
|
12 |
+
def batched_triangulate(pts2d, # [B, Ncams, Npts, 2]
|
13 |
+
proj_mats): # [B, Ncams, 3, 4] I@E projection matrix
|
14 |
+
B, Ncams, Npts, two = pts2d.shape
|
15 |
+
assert two==2
|
16 |
+
assert proj_mats.shape == (B, Ncams, 3, 4)
|
17 |
+
# P - xP
|
18 |
+
x = proj_mats[...,0,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,0], proj_mats[...,2,:]) # [B, Ncams, Npts, 4]
|
19 |
+
y = proj_mats[...,1,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,1], proj_mats[...,2,:]) # [B, Ncams, Npts, 4]
|
20 |
+
eq = torch.cat([x, y], dim=1).transpose(1, 2) # [B, Npts, 2xNcams, 4]
|
21 |
+
return torch.linalg.lstsq(eq[...,:3], -eq[...,3]).solution
|
22 |
+
|
23 |
+
def matches_to_depths(intrinsics, # input camera intrinsics [B, Ncams, 3, 3]
|
24 |
+
extrinsics, # input camera extrinsics [B, Ncams, 3, 4]
|
25 |
+
matches, # input correspondences [B, Ncams, Npts, 2]
|
26 |
+
batchsize=16, # bs for batched processing
|
27 |
+
min_num_valids_ratio=.3 # at least this ratio of image pairs need to predict a match for a given pixel of img1
|
28 |
+
):
|
29 |
+
B, Nv, H, W, five = matches.shape
|
30 |
+
min_num_valids = np.floor(Nv*min_num_valids_ratio)
|
31 |
+
out_aggregated_points, out_depths, out_confs = [], [], []
|
32 |
+
for b in range(B//batchsize+1): # batched processing
|
33 |
+
start, stop = b*batchsize,min(B,(b+1)*batchsize)
|
34 |
+
sub_batch=slice(start,stop)
|
35 |
+
sub_batchsize = stop-start
|
36 |
+
if sub_batchsize==0:continue
|
37 |
+
points1, points2, confs = matches[sub_batch, ..., :2], matches[sub_batch, ..., 2:4], matches[sub_batch, ..., -1]
|
38 |
+
allpoints = torch.cat([points1.view([sub_batchsize*Nv,1,H*W,2]), points2.view([sub_batchsize*Nv,1,H*W,2])],dim=1) # [BxNv, 2, HxW, 2]
|
39 |
+
|
40 |
+
allcam_Ps = intrinsics[sub_batch] @ extrinsics[sub_batch,:,:3,:]
|
41 |
+
cam_Ps1, cam_Ps2 = allcam_Ps[:,[0]].repeat([1,Nv,1,1]), allcam_Ps[:,1:] # [B, Nv, 3, 4]
|
42 |
+
formatted_camPs = torch.cat([cam_Ps1.reshape([sub_batchsize*Nv,1,3,4]), cam_Ps2.reshape([sub_batchsize*Nv,1,3,4])],dim=1) # [BxNv, 2, 3, 4]
|
43 |
+
|
44 |
+
# Triangulate matches to 3D
|
45 |
+
points_3d_world = batched_triangulate(allpoints, formatted_camPs) # [BxNv, HxW, three]
|
46 |
+
|
47 |
+
# Aggregate pairwise predictions
|
48 |
+
points_3d_world = points_3d_world.view([sub_batchsize,Nv,H,W,3])
|
49 |
+
valids = points_3d_world.isfinite()
|
50 |
+
valids_sum = valids.sum(dim=-1)
|
51 |
+
validsuni=valids_sum.unique()
|
52 |
+
assert torch.all(torch.logical_or(validsuni == 0 , validsuni == 3)), "Error, can only be nan for none or all XYZ values, not a subset"
|
53 |
+
confs[valids_sum==0] = 0.
|
54 |
+
points_3d_world = points_3d_world*confs[...,None]
|
55 |
+
|
56 |
+
# Take care of NaNs
|
57 |
+
normalization = confs.sum(dim=1)[:,None].repeat(1,Nv,1,1)
|
58 |
+
normalization[normalization <= 1e-5] = 1.
|
59 |
+
points_3d_world[valids] /= normalization[valids_sum==3][:,None].repeat(1,3).view(-1)
|
60 |
+
points_3d_world[~valids] = 0.
|
61 |
+
aggregated_points = points_3d_world.sum(dim=1) # weighted average (by confidence value) ignoring nans
|
62 |
+
|
63 |
+
# Reset invalid values to nans, with a min visibility threshold
|
64 |
+
aggregated_points[valids_sum.sum(dim=1)/3 <= min_num_valids] = torch.nan
|
65 |
+
|
66 |
+
# From 3D to depths
|
67 |
+
refcamE = extrinsics[sub_batch, 0]
|
68 |
+
points_3d_camera = (refcamE[:,:3, :3] @ aggregated_points.view(sub_batchsize,-1,3).transpose(-2,-1) + refcamE[:,:3,[3]]).transpose(-2,-1) # [B,HxW,3]
|
69 |
+
depths = points_3d_camera.view(sub_batchsize,H,W,3)[..., 2] # [B,H,W]
|
70 |
+
|
71 |
+
# Cat results
|
72 |
+
out_aggregated_points.append(aggregated_points.cpu())
|
73 |
+
out_depths.append(depths.cpu())
|
74 |
+
out_confs.append(confs.sum(dim=1).cpu())
|
75 |
+
|
76 |
+
out_aggregated_points = torch.cat(out_aggregated_points,dim=0)
|
77 |
+
out_depths = torch.cat(out_depths,dim=0)
|
78 |
+
out_confs = torch.cat(out_confs,dim=0)
|
79 |
+
|
80 |
+
return out_aggregated_points, out_depths, out_confs
|
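As a quick sanity check of `batched_triangulate` (toy data, not from the repository; the camera layout and point range are arbitrary, and the function above is assumed to be in scope), one can project synthetic 3D points into two cameras and verify that the least-squares solve recovers them:

import torch

B, Ncams, Npts = 1, 2, 5
pts3d = torch.rand(B, Npts, 3) + torch.tensor([0., 0., 2.])   # points in front of the cameras
proj = torch.zeros(B, Ncams, 3, 4)
proj[:, :, :3, :3] = torch.eye(3)                             # K = I, R = I for both cameras
proj[:, 1, 0, 3] = 0.5                                        # second camera translated along x

homog = torch.cat([pts3d, torch.ones(B, Npts, 1)], dim=-1)    # [B, Npts, 4]
uvw = torch.einsum('bcij,bnj->bcni', proj, homog)             # project into each camera
pts2d = uvw[..., :2] / uvw[..., 2:3]                          # [B, Ncams, Npts, 2]

rec = batched_triangulate(pts2d, proj)                        # [B, Npts, 3], should match pts3d
print(float((rec - pts3d).abs().max()))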
mast3r/cloud_opt/tsdf_optimizer.py
ADDED
@@ -0,0 +1,269 @@
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import numpy as np
|
4 |
+
from tqdm import tqdm
|
5 |
+
from matplotlib import pyplot as pl
|
6 |
+
|
7 |
+
import mast3r.utils.path_to_dust3r # noqa
|
8 |
+
from dust3r.utils.geometry import depthmap_to_pts3d, geotrf, inv
|
9 |
+
|
10 |
+
|
11 |
+
class TSDFPostProcess:
|
12 |
+
""" Optimizes a signed distance-function to improve depthmaps.
|
13 |
+
"""
|
14 |
+
|
15 |
+
def __init__(self, optimizer, subsample=8, TSDF_thresh=0., TSDF_batchsize=int(1e7)):
|
16 |
+
self.TSDF_thresh = TSDF_thresh # None -> no TSDF
|
17 |
+
self.TSDF_batchsize = TSDF_batchsize
|
18 |
+
self.optimizer = optimizer
|
19 |
+
|
20 |
+
pts3d, depthmaps, confs = optimizer.get_dense_pts3d(clean_depth=False, subsample=subsample)
|
21 |
+
pts3d, depthmaps = self._TSDF_postprocess_or_not(pts3d, depthmaps, confs)
|
22 |
+
self.pts3d = pts3d
|
23 |
+
self.depthmaps = depthmaps
|
24 |
+
self.confs = confs
|
25 |
+
|
26 |
+
def _get_depthmaps(self, TSDF_filtering_thresh=None):
|
27 |
+
if TSDF_filtering_thresh:
|
28 |
+
self._refine_depths_with_TSDF(self.optimizer, TSDF_filtering_thresh) # compute refined depths if needed
|
29 |
+
dms = self.TSDF_im_depthmaps if TSDF_filtering_thresh else self.im_depthmaps
|
30 |
+
return [d.exp() for d in dms]
|
31 |
+
|
32 |
+
@torch.no_grad()
|
33 |
+
def _refine_depths_with_TSDF(self, TSDF_filtering_thresh, niter=1, nsamples=1000):
|
34 |
+
"""
|
35 |
+
Leverage TSDF to post-process estimated depths
|
36 |
+
for each pixel, find zero level of TSDF along ray (or closest to 0)
|
37 |
+
"""
|
38 |
+
print("Post-Processing Depths with TSDF fusion.")
|
39 |
+
self.TSDF_im_depthmaps = []
|
40 |
+
alldepths, allposes, allfocals, allpps, allimshapes = self._get_depthmaps(), self.optimizer.get_im_poses(
|
41 |
+
), self.optimizer.get_focals(), self.optimizer.get_principal_points(), self.imshapes
|
42 |
+
for vi in tqdm(range(self.optimizer.n_imgs)):
|
43 |
+
dm, pose, focal, pp, imshape = alldepths[vi], allposes[vi], allfocals[vi], allpps[vi], allimshapes[vi]
|
44 |
+
minvals = torch.full(dm.shape, 1e20)
|
45 |
+
|
46 |
+
for it in range(niter):
|
47 |
+
H, W = dm.shape
|
48 |
+
curthresh = (niter - it) * TSDF_filtering_thresh
|
49 |
+
dm_offsets = (torch.randn(H, W, nsamples).to(dm) - 1.) * \
|
50 |
+
curthresh # decreasing search std along with iterations
|
51 |
+
newdm = dm[..., None] + dm_offsets # [H,W,Nsamp]
|
52 |
+
curproj = self._backproj_pts3d(in_depths=[newdm], in_im_poses=pose[None], in_focals=focal[None], in_pps=pp[None], in_imshapes=[
|
53 |
+
imshape])[0] # [H,W,Nsamp,3]
|
54 |
+
# Batched TSDF eval
|
55 |
+
curproj = curproj.view(-1, 3)
|
56 |
+
tsdf_vals = []
|
57 |
+
valids = []
|
58 |
+
for batch in range(0, len(curproj), self.TSDF_batchsize):
|
59 |
+
values, valid = self._TSDF_query(
|
60 |
+
curproj[batch:min(batch + self.TSDF_batchsize, len(curproj))], curthresh)
|
61 |
+
tsdf_vals.append(values)
|
62 |
+
valids.append(valid)
|
63 |
+
tsdf_vals = torch.cat(tsdf_vals, dim=0)
|
64 |
+
valids = torch.cat(valids, dim=0)
|
65 |
+
|
66 |
+
tsdf_vals = tsdf_vals.view([H, W, nsamples])
|
67 |
+
valids = valids.view([H, W, nsamples])
|
68 |
+
|
69 |
+
# keep depth value that got us the closest to 0
|
70 |
+
tsdf_vals[~valids] = torch.inf # ignore invalid values
|
71 |
+
tsdf_vals = tsdf_vals.abs()
|
72 |
+
mins = torch.argmin(tsdf_vals, dim=-1, keepdim=True)
|
73 |
+
# when all samples live on a very flat zone, do nothing
|
74 |
+
allbad = (tsdf_vals == curthresh).sum(dim=-1) == nsamples
|
75 |
+
dm[~allbad] = torch.gather(newdm, -1, mins)[..., 0][~allbad]
|
76 |
+
|
77 |
+
# Save refined depth map
|
78 |
+
self.TSDF_im_depthmaps.append(dm.log())
|
79 |
+
|
80 |
+
def _TSDF_query(self, qpoints, TSDF_filtering_thresh, weighted=True):
|
81 |
+
"""
|
82 |
+
TSDF query call: returns the weighted TSDF value for each query point [N, 3]
|
83 |
+
"""
|
84 |
+
N, three = qpoints.shape
|
85 |
+
assert three == 3
|
86 |
+
qpoints = qpoints[None].repeat(self.optimizer.n_imgs, 1, 1) # [B,N,3]
|
87 |
+
# get projection coordinates and depths onto images
|
88 |
+
coords_and_depth = self._proj_pts3d(pts3d=qpoints, cam2worlds=self.optimizer.get_im_poses(
|
89 |
+
), focals=self.optimizer.get_focals(), pps=self.optimizer.get_principal_points())
|
90 |
+
image_coords = coords_and_depth[..., :2].round().to(int) # for now, there's no interpolation...
|
91 |
+
proj_depths = coords_and_depth[..., -1]
|
92 |
+
# recover depth values after scene optim
|
93 |
+
pred_depths, pred_confs, valids = self._get_pixel_depths(image_coords)
|
94 |
+
# Gather TSDF scores
|
95 |
+
all_SDF_scores = pred_depths - proj_depths # SDF
|
96 |
+
unseen = all_SDF_scores < -TSDF_filtering_thresh # handle visibility
|
97 |
+
# all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh,TSDF_filtering_thresh) # SDF -> TSDF
|
98 |
+
all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh, 1e20) # SDF -> TSDF
|
99 |
+
# Gather TSDF confidences and ignore points that are unseen, either OOB during reproj or too far behind seen depth
|
100 |
+
all_TSDF_weights = (~unseen).float() * valids.float()
|
101 |
+
if weighted:
|
102 |
+
all_TSDF_weights = pred_confs.exp() * all_TSDF_weights
|
103 |
+
# Aggregate all votes, ignoring zeros
|
104 |
+
TSDF_weights = all_TSDF_weights.sum(dim=0)
|
105 |
+
valids = TSDF_weights != 0.
|
106 |
+
TSDF_wsum = (all_TSDF_weights * all_TSDF_scores).sum(dim=0)
|
107 |
+
TSDF_wsum[valids] /= TSDF_weights[valids]
|
108 |
+
return TSDF_wsum, valids
|
109 |
+
|
110 |
+
def _get_pixel_depths(self, image_coords, TSDF_filtering_thresh=None, with_normals_conf=False):
|
111 |
+
""" Recover depth value for each input pixel coordinate, along with OOB validity mask
|
112 |
+
"""
|
113 |
+
B, N, two = image_coords.shape
|
114 |
+
assert B == self.optimizer.n_imgs and two == 2
|
115 |
+
depths = torch.zeros([B, N], device=image_coords.device)
|
116 |
+
valids = torch.zeros([B, N], dtype=bool, device=image_coords.device)
|
117 |
+
confs = torch.zeros([B, N], device=image_coords.device)
|
118 |
+
curconfs = self._get_confs_with_normals() if with_normals_conf else self.im_conf
|
119 |
+
for ni, (imc, depth, conf) in enumerate(zip(image_coords, self._get_depthmaps(TSDF_filtering_thresh), curconfs)):
|
120 |
+
H, W = depth.shape
|
121 |
+
valids[ni] = torch.logical_and(0 <= imc[:, 1], imc[:, 1] <
|
122 |
+
H) & torch.logical_and(0 <= imc[:, 0], imc[:, 0] < W)
|
123 |
+
imc[~valids[ni]] = 0
|
124 |
+
depths[ni] = depth[imc[:, 1], imc[:, 0]]
|
125 |
+
confs[ni] = conf.cuda()[imc[:, 1], imc[:, 0]]
|
126 |
+
return depths, confs, valids
|
127 |
+
|
128 |
+
def _get_confs_with_normals(self):
|
129 |
+
outconfs = []
|
130 |
+
# Confidence based on depth gradient
|
131 |
+
|
132 |
+
class Sobel(nn.Module):
|
133 |
+
def __init__(self):
|
134 |
+
super().__init__()
|
135 |
+
self.filter = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=1, bias=False)
|
136 |
+
Gx = torch.tensor([[2.0, 0.0, -2.0], [4.0, 0.0, -4.0], [2.0, 0.0, -2.0]])
|
137 |
+
Gy = torch.tensor([[2.0, 4.0, 2.0], [0.0, 0.0, 0.0], [-2.0, -4.0, -2.0]])
|
138 |
+
G = torch.cat([Gx.unsqueeze(0), Gy.unsqueeze(0)], 0)
|
139 |
+
G = G.unsqueeze(1)
|
140 |
+
self.filter.weight = nn.Parameter(G, requires_grad=False)
|
141 |
+
|
142 |
+
def forward(self, img):
|
143 |
+
x = self.filter(img)
|
144 |
+
x = torch.mul(x, x)
|
145 |
+
x = torch.sum(x, dim=1, keepdim=True)
|
146 |
+
x = torch.sqrt(x)
|
147 |
+
return x
|
148 |
+
|
149 |
+
grad_op = Sobel().to(self.im_depthmaps[0].device)
|
150 |
+
for conf, depth in zip(self.im_conf, self.im_depthmaps):
|
151 |
+
grad_confs = (1. - grad_op(depth[None, None])[0, 0]).clip(0)
|
152 |
+
if not 'dbg show':
|
153 |
+
pl.imshow(grad_confs.cpu())
|
154 |
+
pl.show()
|
155 |
+
outconfs.append(conf * grad_confs.to(conf))
|
156 |
+
return outconfs
|
157 |
+
|
158 |
+
def _proj_pts3d(self, pts3d, cam2worlds, focals, pps):
|
159 |
+
"""
|
160 |
+
Projection operation: from 3D points to 2D coordinates + depths
|
161 |
+
"""
|
162 |
+
B = pts3d.shape[0]
|
163 |
+
assert pts3d.shape[0] == cam2worlds.shape[0]
|
164 |
+
# prepare Extrinsics
|
165 |
+
R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1]
|
166 |
+
Rinv = R.transpose(-2, -1)
|
167 |
+
tinv = -Rinv @ t[..., None]
|
168 |
+
|
169 |
+
# prepare intrinsics
|
170 |
+
intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(focals.shape[0], 1, 1)
|
171 |
+
if len(focals.shape) == 1:
|
172 |
+
focals = torch.stack([focals, focals], dim=-1)
|
173 |
+
intrinsics[:, 0, 0] = focals[:, 0]
|
174 |
+
intrinsics[:, 1, 1] = focals[:, 1]
|
175 |
+
intrinsics[:, :2, -1] = pps
|
176 |
+
# Project
|
177 |
+
projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N]
|
178 |
+
projpts = projpts.transpose(-2, -1) # [B,N,3]
|
179 |
+
projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z)
|
180 |
+
return projpts
|
181 |
+
|
182 |
+
def _backproj_pts3d(self, in_depths=None, in_im_poses=None,
|
183 |
+
in_focals=None, in_pps=None, in_imshapes=None):
|
184 |
+
"""
|
185 |
+
Backprojection operation: from image depths to 3D points
|
186 |
+
"""
|
187 |
+
# Get depths and projection params if not provided
|
188 |
+
focals = self.optimizer.get_focals() if in_focals is None else in_focals
|
189 |
+
im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses
|
190 |
+
depth = self._get_depthmaps() if in_depths is None else in_depths
|
191 |
+
pp = self.optimizer.get_principal_points() if in_pps is None else in_pps
|
192 |
+
imshapes = self.imshapes if in_imshapes is None else in_imshapes
|
193 |
+
def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i])
|
194 |
+
dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[[i]]) for i in range(im_poses.shape[0])]
|
195 |
+
|
196 |
+
def autoprocess(x):
|
197 |
+
x = x[0]
|
198 |
+
return x.transpose(-2, -1) if len(x.shape) == 4 else x
|
199 |
+
return [geotrf(pose, autoprocess(pt)) for pose, pt in zip(im_poses, dm_to_3d)]
|
200 |
+
|
201 |
+
def _pts3d_to_depth(self, pts3d, cam2worlds, focals, pps):
|
202 |
+
"""
|
203 |
+
Projection operation: from 3D points to 2D coordinates + depths
|
204 |
+
"""
|
205 |
+
B = pts3d.shape[0]
|
206 |
+
assert pts3d.shape[0] == cam2worlds.shape[0]
|
207 |
+
# prepare Extrinsics
|
208 |
+
R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1]
|
209 |
+
Rinv = R.transpose(-2, -1)
|
210 |
+
tinv = -Rinv @ t[..., None]
|
211 |
+
|
212 |
+
# prepare intrinsics
|
213 |
+
intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(self.optimizer.n_imgs, 1, 1)
|
214 |
+
if len(focals.shape) == 1:
|
215 |
+
focals = torch.stack([focals, focals], dim=-1)
|
216 |
+
intrinsics[:, 0, 0] = focals[:, 0]
|
217 |
+
intrinsics[:, 1, 1] = focals[:, 1]
|
218 |
+
intrinsics[:, :2, -1] = pps
|
219 |
+
# Project
|
220 |
+
projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N]
|
221 |
+
projpts = projpts.transpose(-2, -1) # [B,N,3]
|
222 |
+
projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z)
|
223 |
+
return projpts
|
224 |
+
|
225 |
+
def _depth_to_pts3d(self, in_depths=None, in_im_poses=None, in_focals=None, in_pps=None, in_imshapes=None):
|
226 |
+
"""
|
227 |
+
Backprojection operation: from image depths to 3D points
|
228 |
+
"""
|
229 |
+
# Get depths and projection params if not provided
|
230 |
+
focals = self.optimizer.get_focals() if in_focals is None else in_focals
|
231 |
+
im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses
|
232 |
+
depth = self._get_depthmaps() if in_depths is None else in_depths
|
233 |
+
pp = self.optimizer.get_principal_points() if in_pps is None else in_pps
|
234 |
+
imshapes = self.imshapes if in_imshapes is None else in_imshapes
|
235 |
+
|
236 |
+
def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i])
|
237 |
+
|
238 |
+
dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i + 1]) for i in range(im_poses.shape[0])]
|
239 |
+
|
240 |
+
def autoprocess(x):
|
241 |
+
x = x[0]
|
242 |
+
H, W, three = x.shape[:3]
|
243 |
+
return x.transpose(-2, -1) if len(x.shape) == 4 else x
|
244 |
+
return [geotrf(pp, autoprocess(pt)) for pp, pt in zip(im_poses, dm_to_3d)]
|
245 |
+
|
246 |
+
def _get_pts3d(self, TSDF_filtering_thresh=None, **kw):
|
247 |
+
"""
|
248 |
+
return 3D points (possibly filtering depths with TSDF)
|
249 |
+
"""
|
250 |
+
return self._backproj_pts3d(in_depths=self._get_depthmaps(TSDF_filtering_thresh=TSDF_filtering_thresh), **kw)
|
251 |
+
|
252 |
+
def _TSDF_postprocess_or_not(self, pts3d, depthmaps, confs, niter=1):
|
253 |
+
# Setup inner variables
|
254 |
+
self.imshapes = [im.shape[:2] for im in self.optimizer.imgs]
|
255 |
+
self.im_depthmaps = [dd.log().view(imshape) for dd, imshape in zip(depthmaps, self.imshapes)]
|
256 |
+
self.im_conf = confs
|
257 |
+
|
258 |
+
if self.TSDF_thresh > 0.:
|
259 |
+
# Create or update self.TSDF_im_depthmaps that contain logdepths filtered with TSDF
|
260 |
+
self._refine_depths_with_TSDF(self.TSDF_thresh, niter=niter)
|
261 |
+
depthmaps = [dd.exp() for dd in self.TSDF_im_depthmaps]
|
262 |
+
# Turn them into 3D points
|
263 |
+
pts3d = self._backproj_pts3d(in_depths=depthmaps)
|
264 |
+
depthmaps = [dd.flatten() for dd in depthmaps]
|
265 |
+
pts3d = [pp.view(-1, 3) for pp in pts3d]
|
266 |
+
return pts3d, depthmaps
|
267 |
+
|
268 |
+
def get_dense_pts3d(self, clean_depth=True):
|
269 |
+
return self.pts3d, self.depthmaps, self.confs
|
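The refinement loop in `_refine_depths_with_TSDF` essentially samples candidate depths around the current estimate along each viewing ray and keeps the one whose truncated signed distance is closest to zero. A one-dimensional toy illustration of that idea (not repository code; the fused TSDF is replaced by a simple closed-form stand-in):

import torch

true_depth = 2.0
tsdf = lambda d: (true_depth - d).clip(-0.05, 0.05)   # stand-in for the fused, truncated SDF

d0 = torch.tensor(2.3)                                # noisy initial depth estimate
candidates = d0 + 0.3 * torch.randn(1000)             # random samples along the viewing ray
best = candidates[tsdf(candidates).abs().argmin()]    # keep the sample closest to the zero level
print(float(best))                                    # close to 2.0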
mast3r/cloud_opt/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
mast3r/cloud_opt/utils/losses.py
ADDED
@@ -0,0 +1,32 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# losses for sparse ga
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import torch
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
|
11 |
+
def l05_loss(x, y):
|
12 |
+
return torch.linalg.norm(x - y, dim=-1).sqrt()
|
13 |
+
|
14 |
+
|
15 |
+
def l1_loss(x, y):
|
16 |
+
return torch.linalg.norm(x - y, dim=-1)
|
17 |
+
|
18 |
+
|
19 |
+
def gamma_loss(gamma, mul=1, offset=None, clip=np.inf):
|
20 |
+
if offset is None:
|
21 |
+
if gamma == 1:
|
22 |
+
return l1_loss
|
23 |
+
# d(x**p)/dx = 1 ==> p * x**(p-1) == 1 ==> x = (1/p)**(1/(p-1))
|
24 |
+
offset = (1 / gamma)**(1 / (gamma - 1))
|
25 |
+
|
26 |
+
def loss_func(x, y):
|
27 |
+
return (mul * l1_loss(x, y).clip(max=clip) + offset) ** gamma - offset ** gamma
|
28 |
+
return loss_func
|
29 |
+
|
30 |
+
|
31 |
+
def meta_gamma_loss():
|
32 |
+
return lambda alpha: gamma_loss(alpha)
|
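The offset derived in `gamma_loss` (see the derivative comment above) makes the robust loss behave like an L1 loss for small residuals: its slope at zero error is exactly one. A small numerical check, assuming the functions from this file are in scope:

import torch

loss = gamma_loss(0.5)           # gamma < 1: saturating, outlier-robust loss
x = torch.zeros(1, 3)
y = x.clone()
y[0, 0] = 1e-4                   # tiny residual along one coordinate

print(float(loss(x, y) / 1e-4))  # ~1.0: locally equivalent to l1_loss
print(float(loss(x, x)))         # exactly 0 at zero residual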
mast3r/cloud_opt/utils/schedules.py
ADDED
@@ -0,0 +1,17 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# lr schedules for sparse ga
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
def linear_schedule(alpha, lr_base, lr_end=0):
|
11 |
+
lr = (1 - alpha) * lr_base + alpha * lr_end
|
12 |
+
return lr
|
13 |
+
|
14 |
+
|
15 |
+
def cosine_schedule(alpha, lr_base, lr_end=0):
|
16 |
+
lr = lr_end + (lr_base - lr_end) * (1 + np.cos(alpha * np.pi)) / 2
|
17 |
+
return lr
|
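Both schedules map a progress value `alpha` in [0, 1] from `lr_base` down to `lr_end`; the cosine variant decays more slowly at the start and end. A small illustration, assuming the two functions above are in scope:

for alpha in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(f'{alpha:.2f}  linear={linear_schedule(alpha, 1e-2):.6f}  cosine={cosine_schedule(alpha, 1e-2):.6f}')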
mast3r/colmap/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
mast3r/colmap/database.py
ADDED
@@ -0,0 +1,383 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# MASt3R to colmap export functions
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import os
|
8 |
+
import torch
|
9 |
+
import copy
|
10 |
+
import numpy as np
|
11 |
+
import torchvision
|
12 |
+
import numpy as np
|
13 |
+
from tqdm import tqdm
|
14 |
+
from scipy.cluster.hierarchy import DisjointSet
|
15 |
+
from scipy.spatial.transform import Rotation as R
|
16 |
+
|
17 |
+
from mast3r.utils.misc import hash_md5
|
18 |
+
|
19 |
+
from mast3r.fast_nn import extract_correspondences_nonsym, bruteforce_reciprocal_nns
|
20 |
+
|
21 |
+
import mast3r.utils.path_to_dust3r # noqa
|
22 |
+
from dust3r.utils.geometry import find_reciprocal_matches, xy_grid, geotrf # noqa
|
23 |
+
|
24 |
+
|
25 |
+
def convert_im_matches_pairs(img0, img1, image_to_colmap, im_keypoints, matches_im0, matches_im1, viz):
|
26 |
+
if viz:
|
27 |
+
from matplotlib import pyplot as pl
|
28 |
+
|
29 |
+
image_mean = torch.as_tensor(
|
30 |
+
[0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
|
31 |
+
image_std = torch.as_tensor(
|
32 |
+
[0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
|
33 |
+
rgb0 = img0['img'] * image_std + image_mean
|
34 |
+
rgb0 = torchvision.transforms.functional.to_pil_image(rgb0[0])
|
35 |
+
rgb0 = np.array(rgb0)
|
36 |
+
|
37 |
+
rgb1 = img1['img'] * image_std + image_mean
|
38 |
+
rgb1 = torchvision.transforms.functional.to_pil_image(rgb1[0])
|
39 |
+
rgb1 = np.array(rgb1)
|
40 |
+
|
41 |
+
imgs = [rgb0, rgb1]
|
42 |
+
# visualize a few matches
|
43 |
+
n_viz = 100
|
44 |
+
num_matches = matches_im0.shape[0]
|
45 |
+
match_idx_to_viz = np.round(np.linspace(
|
46 |
+
0, num_matches - 1, n_viz)).astype(int)
|
47 |
+
viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]
|
48 |
+
|
49 |
+
H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2]
|
50 |
+
rgb0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)),
|
51 |
+
(0, 0), (0, 0)), 'constant', constant_values=0)
|
52 |
+
rgb1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)),
|
53 |
+
(0, 0), (0, 0)), 'constant', constant_values=0)
|
54 |
+
img = np.concatenate((rgb0, rgb1), axis=1)
|
55 |
+
pl.figure()
|
56 |
+
pl.imshow(img)
|
57 |
+
cmap = pl.get_cmap('jet')
|
58 |
+
for ii in range(n_viz):
|
59 |
+
(x0, y0), (x1,
|
60 |
+
y1) = viz_matches_im0[ii].T, viz_matches_im1[ii].T
|
61 |
+
pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(ii /
|
62 |
+
(n_viz - 1)), scalex=False, scaley=False)
|
63 |
+
pl.show(block=True)
|
64 |
+
|
65 |
+
matches = [matches_im0.astype(np.float64), matches_im1.astype(np.float64)]
|
66 |
+
imgs = [img0, img1]
|
67 |
+
imidx0 = img0['idx']
|
68 |
+
imidx1 = img1['idx']
|
69 |
+
ravel_matches = []
|
70 |
+
for j in range(2):
|
71 |
+
H, W = imgs[j]['true_shape'][0]
|
72 |
+
with np.errstate(invalid='ignore'):
|
73 |
+
qx, qy = matches[j].round().astype(np.int32).T
|
74 |
+
ravel_matches_j = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy)
|
75 |
+
ravel_matches.append(ravel_matches_j)
|
76 |
+
imidxj = imgs[j]['idx']
|
77 |
+
for m in ravel_matches_j:
|
78 |
+
if m not in im_keypoints[imidxj]:
|
79 |
+
im_keypoints[imidxj][m] = 0
|
80 |
+
im_keypoints[imidxj][m] += 1
|
81 |
+
imid0 = copy.deepcopy(image_to_colmap[imidx0]['colmap_imid'])
|
82 |
+
imid1 = copy.deepcopy(image_to_colmap[imidx1]['colmap_imid'])
|
83 |
+
if imid0 > imid1:
|
84 |
+
colmap_matches = np.stack([ravel_matches[1], ravel_matches[0]], axis=-1)
|
85 |
+
imid0, imid1 = imid1, imid0
|
86 |
+
imidx0, imidx1 = imidx1, imidx0
|
87 |
+
else:
|
88 |
+
colmap_matches = np.stack([ravel_matches[0], ravel_matches[1]], axis=-1)
|
89 |
+
colmap_matches = np.unique(colmap_matches, axis=0)
|
90 |
+
return imidx0, imidx1, colmap_matches
|
91 |
+
|
92 |
+
|
93 |
+
def get_im_matches(pred1, pred2, pairs, image_to_colmap, im_keypoints, conf_thr,
|
94 |
+
is_sparse=True, subsample=8, pixel_tol=0, viz=False, device='cuda'):
|
95 |
+
im_matches = {}
|
96 |
+
for i in range(len(pred1['pts3d'])):
|
97 |
+
imidx0 = pairs[i][0]['idx']
|
98 |
+
imidx1 = pairs[i][1]['idx']
|
99 |
+
if 'desc' in pred1: # mast3r
|
100 |
+
descs = [pred1['desc'][i], pred2['desc'][i]]
|
101 |
+
confidences = [pred1['desc_conf'][i], pred2['desc_conf'][i]]
|
102 |
+
desc_dim = descs[0].shape[-1]
|
103 |
+
|
104 |
+
if is_sparse:
|
105 |
+
corres = extract_correspondences_nonsym(descs[0], descs[1], confidences[0], confidences[1],
|
106 |
+
device=device, subsample=subsample, pixel_tol=pixel_tol)
|
107 |
+
conf = corres[2]
|
108 |
+
mask = conf >= conf_thr
|
109 |
+
matches_im0 = corres[0][mask].cpu().numpy()
|
110 |
+
matches_im1 = corres[1][mask].cpu().numpy()
|
111 |
+
else:
|
112 |
+
confidence_masks = [confidences[0] >=
|
113 |
+
conf_thr, confidences[1] >= conf_thr]
|
114 |
+
pts2d_list, desc_list = [], []
|
115 |
+
for j in range(2):
|
116 |
+
conf_j = confidence_masks[j].cpu().numpy().flatten()
|
117 |
+
true_shape_j = pairs[i][j]['true_shape'][0]
|
118 |
+
pts2d_j = xy_grid(
|
119 |
+
true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j]
|
120 |
+
desc_j = descs[j].detach().cpu(
|
121 |
+
).numpy().reshape(-1, desc_dim)[conf_j]
|
122 |
+
pts2d_list.append(pts2d_j)
|
123 |
+
desc_list.append(desc_j)
|
124 |
+
if len(desc_list[0]) == 0 or len(desc_list[1]) == 0:
|
125 |
+
continue
|
126 |
+
|
127 |
+
nn0, nn1 = bruteforce_reciprocal_nns(desc_list[0], desc_list[1],
|
128 |
+
device=device, dist='dot', block_size=2**13)
|
129 |
+
reciprocal_in_P0 = (nn1[nn0] == np.arange(len(nn0)))
|
130 |
+
|
131 |
+
matches_im1 = pts2d_list[1][nn0][reciprocal_in_P0]
|
132 |
+
matches_im0 = pts2d_list[0][reciprocal_in_P0]
|
133 |
+
else:
|
134 |
+
pts3d = [pred1['pts3d'][i], pred2['pts3d_in_other_view'][i]]
|
135 |
+
confidences = [pred1['conf'][i], pred2['conf'][i]]
|
136 |
+
|
137 |
+
if is_sparse:
|
138 |
+
corres = extract_correspondences_nonsym(pts3d[0], pts3d[1], confidences[0], confidences[1],
|
139 |
+
device=device, subsample=subsample, pixel_tol=pixel_tol,
|
140 |
+
ptmap_key='3d')
|
141 |
+
conf = corres[2]
|
142 |
+
mask = conf >= conf_thr
|
143 |
+
matches_im0 = corres[0][mask].cpu().numpy()
|
144 |
+
matches_im1 = corres[1][mask].cpu().numpy()
|
145 |
+
else:
|
146 |
+
confidence_masks = [confidences[0] >=
|
147 |
+
conf_thr, confidences[1] >= conf_thr]
|
148 |
+
# find 2D-2D matches between the two images
|
149 |
+
pts2d_list, pts3d_list = [], []
|
150 |
+
for j in range(2):
|
151 |
+
conf_j = confidence_masks[j].cpu().numpy().flatten()
|
152 |
+
true_shape_j = pairs[i][j]['true_shape'][0]
|
153 |
+
pts2d_j = xy_grid(true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j]
|
154 |
+
pts3d_j = pts3d[j].detach().cpu().numpy().reshape(-1, 3)[conf_j]
|
155 |
+
pts2d_list.append(pts2d_j)
|
156 |
+
pts3d_list.append(pts3d_j)
|
157 |
+
|
158 |
+
PQ, PM = pts3d_list[0], pts3d_list[1]
|
159 |
+
if len(PQ) == 0 or len(PM) == 0:
|
160 |
+
continue
|
161 |
+
reciprocal_in_PM, nnM_in_PQ, num_matches = find_reciprocal_matches(
|
162 |
+
PQ, PM)
|
163 |
+
|
164 |
+
matches_im1 = pts2d_list[1][reciprocal_in_PM]
|
165 |
+
matches_im0 = pts2d_list[0][nnM_in_PQ][reciprocal_in_PM]
|
166 |
+
|
167 |
+
if len(matches_im0) == 0:
|
168 |
+
continue
|
169 |
+
imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1],
|
170 |
+
image_to_colmap, im_keypoints,
|
171 |
+
matches_im0, matches_im1, viz)
|
172 |
+
im_matches[(imidx0, imidx1)] = colmap_matches
|
173 |
+
return im_matches
|
174 |
+
|
175 |
+
|
176 |
+
def get_im_matches_from_cache(pairs, cache_path, desc_conf, subsample,
|
177 |
+
image_to_colmap, im_keypoints, conf_thr,
|
178 |
+
viz=False, device='cuda'):
|
179 |
+
im_matches = {}
|
180 |
+
for i in range(len(pairs)):
|
181 |
+
imidx0 = pairs[i][0]['idx']
|
182 |
+
imidx1 = pairs[i][1]['idx']
|
183 |
+
|
184 |
+
corres_idx1 = hash_md5(pairs[i][0]['instance'])
|
185 |
+
corres_idx2 = hash_md5(pairs[i][1]['instance'])
|
186 |
+
|
187 |
+
path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx1}-{corres_idx2}.pth'
|
188 |
+
if os.path.isfile(path_corres):
|
189 |
+
score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device)
|
190 |
+
else:
|
191 |
+
path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx2}-{corres_idx1}.pth'
|
192 |
+
score, (xy2, xy1, confs) = torch.load(path_corres, map_location=device)
|
193 |
+
mask = confs >= conf_thr
|
194 |
+
matches_im0 = xy1[mask].cpu().numpy()
|
195 |
+
matches_im1 = xy2[mask].cpu().numpy()
|
196 |
+
|
197 |
+
if len(matches_im0) == 0:
|
198 |
+
continue
|
199 |
+
imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1],
|
200 |
+
image_to_colmap, im_keypoints,
|
201 |
+
matches_im0, matches_im1, viz)
|
202 |
+
im_matches[(imidx0, imidx1)] = colmap_matches
|
203 |
+
return im_matches
|
204 |
+
|
205 |
+
|
206 |
+
def export_images(db, images, image_paths, focals, ga_world_to_cam, camera_model):
|
207 |
+
# add cameras/images to the db
|
208 |
+
# with the output of ga as prior
|
209 |
+
image_to_colmap = {}
|
210 |
+
im_keypoints = {}
|
211 |
+
for idx in range(len(image_paths)):
|
212 |
+
im_keypoints[idx] = {}
|
213 |
+
H, W = images[idx]["orig_shape"]
|
214 |
+
if focals is None:
|
215 |
+
focal_x = focal_y = 1.2 * max(W, H)
|
216 |
+
prior_focal_length = False
|
217 |
+
cx = W / 2.0
|
218 |
+
cy = H / 2.0
|
219 |
+
elif isinstance(focals[idx], np.ndarray) and len(focals[idx].shape) == 2:
|
220 |
+
# intrinsics
|
221 |
+
focal_x = focals[idx][0, 0]
|
222 |
+
focal_y = focals[idx][1, 1]
|
223 |
+
cx = focals[idx][0, 2] * images[idx]["to_orig"][0, 0]
|
224 |
+
cy = focals[idx][1, 2] * images[idx]["to_orig"][1, 1]
|
225 |
+
prior_focal_length = True
|
226 |
+
else:
|
227 |
+
focal_x = focal_y = float(focals[idx])
|
228 |
+
prior_focal_length = True
|
229 |
+
cx = W / 2.0
|
230 |
+
cy = H / 2.0
|
231 |
+
focal_x = focal_x * images[idx]["to_orig"][0, 0]
|
232 |
+
focal_y = focal_y * images[idx]["to_orig"][1, 1]
|
233 |
+
|
234 |
+
if camera_model == "SIMPLE_PINHOLE":
|
235 |
+
model_id = 0
|
236 |
+
focal = (focal_x + focal_y) / 2.0
|
237 |
+
params = np.asarray([focal, cx, cy], np.float64)
|
238 |
+
elif camera_model == "PINHOLE":
|
239 |
+
model_id = 1
|
240 |
+
params = np.asarray([focal_x, focal_y, cx, cy], np.float64)
|
241 |
+
elif camera_model == "SIMPLE_RADIAL":
|
242 |
+
model_id = 2
|
243 |
+
focal = (focal_x + focal_y) / 2.0
|
244 |
+
params = np.asarray([focal, cx, cy, 0.0], np.float64)
|
245 |
+
elif camera_model == "OPENCV":
|
246 |
+
model_id = 4
|
247 |
+
params = np.asarray([focal_x, focal_y, cx, cy, 0.0, 0.0, 0.0, 0.0], np.float64)
|
248 |
+
else:
|
249 |
+
raise ValueError(f"invalid camera model {camera_model}")
|
250 |
+
|
251 |
+
H, W = int(H), int(W)
|
252 |
+
# OPENCV camera model
|
253 |
+
camid = db.add_camera(
|
254 |
+
model_id, W, H, params, prior_focal_length=prior_focal_length)
|
255 |
+
if ga_world_to_cam is None:
|
256 |
+
prior_t = np.zeros(3)
|
257 |
+
prior_q = np.zeros(4)
|
258 |
+
else:
|
259 |
+
q = R.from_matrix(ga_world_to_cam[idx][:3, :3]).as_quat()
|
260 |
+
prior_t = ga_world_to_cam[idx][:3, 3]
|
261 |
+
prior_q = np.array([q[-1], q[0], q[1], q[2]])
|
262 |
+
imid = db.add_image(
|
263 |
+
image_paths[idx], camid, prior_q=prior_q, prior_t=prior_t)
|
264 |
+
image_to_colmap[idx] = {
|
265 |
+
'colmap_imid': imid,
|
266 |
+
'colmap_camid': camid
|
267 |
+
}
|
268 |
+
return image_to_colmap, im_keypoints
|
269 |
+
|
270 |
+
|
271 |
+
def export_matches(db, images, image_to_colmap, im_keypoints, im_matches, min_len_track, skip_geometric_verification):
|
272 |
+
colmap_image_pairs = []
|
273 |
+
# 2D-2D are quite dense
|
274 |
+
# we want to remove the very small tracks
|
275 |
+
# and export only kpt for which we have values
|
276 |
+
# build tracks
|
277 |
+
print("building tracks")
|
278 |
+
keypoints_to_track_id = {}
|
279 |
+
track_id_to_kpt_list = []
|
280 |
+
to_merge = []
|
281 |
+
for (imidx0, imidx1), colmap_matches in tqdm(im_matches.items()):
|
282 |
+
if imidx0 not in keypoints_to_track_id:
|
283 |
+
keypoints_to_track_id[imidx0] = {}
|
284 |
+
if imidx1 not in keypoints_to_track_id:
|
285 |
+
keypoints_to_track_id[imidx1] = {}
|
286 |
+
|
287 |
+
for m in colmap_matches:
|
288 |
+
if m[0] not in keypoints_to_track_id[imidx0] and m[1] not in keypoints_to_track_id[imidx1]:
|
289 |
+
# new pair of kpts never seen before
|
290 |
+
track_idx = len(track_id_to_kpt_list)
|
291 |
+
keypoints_to_track_id[imidx0][m[0]] = track_idx
|
292 |
+
keypoints_to_track_id[imidx1][m[1]] = track_idx
|
293 |
+
track_id_to_kpt_list.append(
|
294 |
+
[(imidx0, m[0]), (imidx1, m[1])])
|
295 |
+
elif m[1] not in keypoints_to_track_id[imidx1]:
|
296 |
+
# 0 has a track, not 1
|
297 |
+
track_idx = keypoints_to_track_id[imidx0][m[0]]
|
298 |
+
keypoints_to_track_id[imidx1][m[1]] = track_idx
|
299 |
+
track_id_to_kpt_list[track_idx].append((imidx1, m[1]))
|
300 |
+
elif m[0] not in keypoints_to_track_id[imidx0]:
|
301 |
+
# 1 has a track, not 0
|
302 |
+
track_idx = keypoints_to_track_id[imidx1][m[1]]
|
303 |
+
keypoints_to_track_id[imidx0][m[0]] = track_idx
|
304 |
+
track_id_to_kpt_list[track_idx].append((imidx0, m[0]))
|
305 |
+
else:
|
306 |
+
# both have tracks, merge them
|
307 |
+
track_idx0 = keypoints_to_track_id[imidx0][m[0]]
|
308 |
+
track_idx1 = keypoints_to_track_id[imidx1][m[1]]
|
309 |
+
if track_idx0 != track_idx1:
|
310 |
+
# let's deal with them later
|
311 |
+
to_merge.append((track_idx0, track_idx1))
|
312 |
+
|
313 |
+
# regroup merge targets
|
314 |
+
print("merging tracks")
|
315 |
+
unique = np.unique(to_merge)
|
316 |
+
tree = DisjointSet(unique)
|
317 |
+
for track_idx0, track_idx1 in tqdm(to_merge):
|
318 |
+
tree.merge(track_idx0, track_idx1)
|
319 |
+
|
320 |
+
subsets = tree.subsets()
|
321 |
+
print("applying merge")
|
322 |
+
for setvals in tqdm(subsets):
|
323 |
+
new_trackid = len(track_id_to_kpt_list)
|
324 |
+
kpt_list = []
|
325 |
+
for track_idx in setvals:
|
326 |
+
kpt_list.extend(track_id_to_kpt_list[track_idx])
|
327 |
+
for imidx, kpid in track_id_to_kpt_list[track_idx]:
|
328 |
+
keypoints_to_track_id[imidx][kpid] = new_trackid
|
329 |
+
track_id_to_kpt_list.append(kpt_list)
|
330 |
+
|
331 |
+
# binc = np.bincount([len(v) for v in track_id_to_kpt_list])
|
332 |
+
# nonzero = np.nonzero(binc)
|
333 |
+
# nonzerobinc = binc[nonzero[0]]
|
334 |
+
# print(nonzero[0].tolist())
|
335 |
+
# print(nonzerobinc)
|
336 |
+
num_valid_tracks = sum(
|
337 |
+
[1 for v in track_id_to_kpt_list if len(v) >= min_len_track])
|
338 |
+
|
339 |
+
keypoints_to_idx = {}
|
340 |
+
print(f"squashing keypoints - {num_valid_tracks} valid tracks")
|
341 |
+
for imidx, keypoints_imid in tqdm(im_keypoints.items()):
|
342 |
+
imid = image_to_colmap[imidx]['colmap_imid']
|
343 |
+
keypoints_kept = []
|
344 |
+
keypoints_to_idx[imidx] = {}
|
345 |
+
for kp in keypoints_imid.keys():
|
346 |
+
if kp not in keypoints_to_track_id[imidx]:
|
347 |
+
continue
|
348 |
+
track_idx = keypoints_to_track_id[imidx][kp]
|
349 |
+
track_length = len(track_id_to_kpt_list[track_idx])
|
350 |
+
if track_length < min_len_track:
|
351 |
+
continue
|
352 |
+
keypoints_to_idx[imidx][kp] = len(keypoints_kept)
|
353 |
+
keypoints_kept.append(kp)
|
354 |
+
if len(keypoints_kept) == 0:
|
355 |
+
continue
|
356 |
+
keypoints_kept = np.array(keypoints_kept)
|
357 |
+
keypoints_kept = np.unravel_index(keypoints_kept, images[imidx]['true_shape'][0])[
|
358 |
+
0].base[:, ::-1].copy().astype(np.float32)
|
359 |
+
# rescale coordinates
|
360 |
+
keypoints_kept[:, 0] += 0.5
|
361 |
+
keypoints_kept[:, 1] += 0.5
|
362 |
+
keypoints_kept = geotrf(images[imidx]['to_orig'], keypoints_kept, norm=True)
|
363 |
+
|
364 |
+
H, W = images[imidx]['orig_shape']
|
365 |
+
keypoints_kept[:, 0] = keypoints_kept[:, 0].clip(min=0, max=W - 0.01)
|
366 |
+
keypoints_kept[:, 1] = keypoints_kept[:, 1].clip(min=0, max=H - 0.01)
|
367 |
+
|
368 |
+
db.add_keypoints(imid, keypoints_kept)
|
369 |
+
|
370 |
+
print("exporting im_matches")
|
371 |
+
for (imidx0, imidx1), colmap_matches in im_matches.items():
|
372 |
+
imid0, imid1 = image_to_colmap[imidx0]['colmap_imid'], image_to_colmap[imidx1]['colmap_imid']
|
373 |
+
assert imid0 < imid1
|
374 |
+
final_matches = np.array([[keypoints_to_idx[imidx0][m[0]], keypoints_to_idx[imidx1][m[1]]]
|
375 |
+
for m in colmap_matches
|
376 |
+
if m[0] in keypoints_to_idx[imidx0] and m[1] in keypoints_to_idx[imidx1]])
|
377 |
+
if len(final_matches) > 0:
|
378 |
+
colmap_image_pairs.append(
|
379 |
+
(images[imidx0]['instance'], images[imidx1]['instance']))
|
380 |
+
db.add_matches(imid0, imid1, final_matches)
|
381 |
+
if skip_geometric_verification:
|
382 |
+
db.add_two_view_geometry(imid0, imid1, final_matches)
|
383 |
+
return colmap_image_pairs
|
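The track-merging step in `export_matches` relies on scipy's `DisjointSet` to fuse track ids that turn out to describe the same 3D point. A toy illustration of that union-find step (made-up track ids, not repository data):

from scipy.cluster.hierarchy import DisjointSet

to_merge = [(0, 2), (2, 5), (3, 4)]                      # pairs of track ids to fuse
tree = DisjointSet(sorted({i for pair in to_merge for i in pair}))
for a, b in to_merge:
    tree.merge(a, b)
print([sorted(s) for s in tree.subsets()])               # e.g. [[0, 2, 5], [3, 4]]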
mast3r/datasets/__init__.py
ADDED
@@ -0,0 +1,62 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
|
4 |
+
from .base.mast3r_base_stereo_view_dataset import MASt3RBaseStereoViewDataset
|
5 |
+
|
6 |
+
import mast3r.utils.path_to_dust3r # noqa
|
7 |
+
from dust3r.datasets.arkitscenes import ARKitScenes as DUSt3R_ARKitScenes # noqa
|
8 |
+
from dust3r.datasets.blendedmvs import BlendedMVS as DUSt3R_BlendedMVS # noqa
|
9 |
+
from dust3r.datasets.co3d import Co3d as DUSt3R_Co3d # noqa
|
10 |
+
from dust3r.datasets.megadepth import MegaDepth as DUSt3R_MegaDepth # noqa
|
11 |
+
from dust3r.datasets.scannetpp import ScanNetpp as DUSt3R_ScanNetpp # noqa
|
12 |
+
from dust3r.datasets.staticthings3d import StaticThings3D as DUSt3R_StaticThings3D # noqa
|
13 |
+
from dust3r.datasets.waymo import Waymo as DUSt3R_Waymo # noqa
|
14 |
+
from dust3r.datasets.wildrgbd import WildRGBD as DUSt3R_WildRGBD # noqa
|
15 |
+
|
16 |
+
|
17 |
+
class ARKitScenes(DUSt3R_ARKitScenes, MASt3RBaseStereoViewDataset):
|
18 |
+
def __init__(self, *args, split, ROOT, **kwargs):
|
19 |
+
super().__init__(*args, split=split, ROOT=ROOT, **kwargs)
|
20 |
+
self.is_metric_scale = True
|
21 |
+
|
22 |
+
|
23 |
+
class BlendedMVS(DUSt3R_BlendedMVS, MASt3RBaseStereoViewDataset):
|
24 |
+
def __init__(self, *args, ROOT, split=None, **kwargs):
|
25 |
+
super().__init__(*args, ROOT=ROOT, split=split, **kwargs)
|
26 |
+
self.is_metric_scale = False
|
27 |
+
|
28 |
+
|
29 |
+
class Co3d(DUSt3R_Co3d, MASt3RBaseStereoViewDataset):
|
30 |
+
def __init__(self, mask_bg=True, *args, ROOT, **kwargs):
|
31 |
+
super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
|
32 |
+
self.is_metric_scale = False
|
33 |
+
|
34 |
+
|
35 |
+
class MegaDepth(DUSt3R_MegaDepth, MASt3RBaseStereoViewDataset):
|
36 |
+
def __init__(self, *args, split, ROOT, **kwargs):
|
37 |
+
super().__init__(*args, split=split, ROOT=ROOT, **kwargs)
|
38 |
+
self.is_metric_scale = False
|
39 |
+
|
40 |
+
|
41 |
+
class ScanNetpp(DUSt3R_ScanNetpp, MASt3RBaseStereoViewDataset):
|
42 |
+
def __init__(self, *args, ROOT, **kwargs):
|
43 |
+
super().__init__(*args, ROOT=ROOT, **kwargs)
|
44 |
+
self.is_metric_scale = True
|
45 |
+
|
46 |
+
|
47 |
+
class StaticThings3D(DUSt3R_StaticThings3D, MASt3RBaseStereoViewDataset):
|
48 |
+
def __init__(self, ROOT, *args, mask_bg='rand', **kwargs):
|
49 |
+
super().__init__(ROOT, *args, mask_bg=mask_bg, **kwargs)
|
50 |
+
self.is_metric_scale = False
|
51 |
+
|
52 |
+
|
53 |
+
class Waymo(DUSt3R_Waymo, MASt3RBaseStereoViewDataset):
|
54 |
+
def __init__(self, *args, ROOT, **kwargs):
|
55 |
+
super().__init__(*args, ROOT=ROOT, **kwargs)
|
56 |
+
self.is_metric_scale = True
|
57 |
+
|
58 |
+
|
59 |
+
class WildRGBD(DUSt3R_WildRGBD, MASt3RBaseStereoViewDataset):
|
60 |
+
def __init__(self, mask_bg=True, *args, ROOT, **kwargs):
|
61 |
+
super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
|
62 |
+
self.is_metric_scale = True
|
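Each wrapper above subclasses the corresponding DUSt3R dataset together with `MASt3RBaseStereoViewDataset` and only overrides the metric-scale flag after `super().__init__()`. A minimal toy sketch of that pattern (stand-in classes, not the repository's):

class _ToyBase:
    def __init__(self, **kw):
        self.is_metric_scale = False       # default, as in MASt3RBaseStereoViewDataset

class _ToyARKitScenes(_ToyBase):
    def __init__(self, **kw):
        super().__init__(**kw)
        self.is_metric_scale = True        # e.g. ARKitScenes depths are metric

print(_ToyARKitScenes().is_metric_scale)   # True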
mast3r/datasets/base/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
mast3r/datasets/base/mast3r_base_stereo_view_dataset.py
ADDED
@@ -0,0 +1,355 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# base class for implementing datasets
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import PIL.Image
|
8 |
+
import PIL.Image as Image
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import copy
|
12 |
+
|
13 |
+
from mast3r.datasets.utils.cropping import (extract_correspondences_from_pts3d,
|
14 |
+
gen_random_crops, in2d_rect, crop_to_homography)
|
15 |
+
|
16 |
+
import mast3r.utils.path_to_dust3r # noqa
|
17 |
+
from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset, view_name, is_good_type # noqa
|
18 |
+
from dust3r.datasets.utils.transforms import ImgNorm
|
19 |
+
from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates, geotrf, depthmap_to_camera_coordinates
|
20 |
+
import dust3r.datasets.utils.cropping as cropping
|
21 |
+
|
22 |
+
|
23 |
+
class MASt3RBaseStereoViewDataset(BaseStereoViewDataset):
|
24 |
+
def __init__(self, *, # only keyword arguments
|
25 |
+
split=None,
|
26 |
+
resolution=None, # square_size or (width, height) or list of [(width,height), ...]
|
27 |
+
transform=ImgNorm,
|
28 |
+
aug_crop=False,
|
29 |
+
aug_swap=False,
|
30 |
+
aug_monocular=False,
|
31 |
+
aug_portrait_or_landscape=True, # automatic choice between landscape/portrait when possible
|
32 |
+
aug_rot90=False,
|
33 |
+
n_corres=0,
|
34 |
+
nneg=0,
|
35 |
+
n_tentative_crops=4,
|
36 |
+
seed=None):
|
37 |
+
super().__init__(split=split, resolution=resolution, transform=transform, aug_crop=aug_crop, seed=seed)
|
38 |
+
self.is_metric_scale = False # by default a dataset is not metric scale, subclasses can overwrite this
|
39 |
+
|
40 |
+
self.aug_swap = aug_swap
|
41 |
+
self.aug_monocular = aug_monocular
|
42 |
+
self.aug_portrait_or_landscape = aug_portrait_or_landscape
|
43 |
+
self.aug_rot90 = aug_rot90
|
44 |
+
|
45 |
+
self.n_corres = n_corres
|
46 |
+
self.nneg = nneg
|
47 |
+
assert self.n_corres == 'all' or isinstance(self.n_corres, int) or (isinstance(self.n_corres, list) and len(
|
48 |
+
self.n_corres) == self.num_views), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}"
|
49 |
+
assert self.nneg == 0 or self.n_corres != 'all'
|
50 |
+
self.n_tentative_crops = n_tentative_crops
|
51 |
+
|
52 |
+
def _swap_view_aug(self, views):
|
53 |
+
if self._rng.random() < 0.5:
|
54 |
+
views.reverse()
|
55 |
+
|
56 |
+
def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None):
|
57 |
+
""" This function:
|
58 |
+
- first downsizes the image with LANCZOS interpolation,
|
59 |
+
which is better than bilinear interpolation in terms of aliasing artefacts.
|
60 |
+
"""
|
61 |
+
if not isinstance(image, PIL.Image.Image):
|
62 |
+
image = PIL.Image.fromarray(image)
|
63 |
+
|
64 |
+
# transpose the resolution if necessary
|
65 |
+
W, H = image.size # new size
|
66 |
+
assert resolution[0] >= resolution[1]
|
67 |
+
if H > 1.1 * W:
|
68 |
+
# image is portrait mode
|
69 |
+
resolution = resolution[::-1]
|
70 |
+
elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
|
71 |
+
# image is square, so we choose (portrait, landscape) randomly
|
72 |
+
if rng.integers(2) and self.aug_portrait_or_landscape:
|
73 |
+
resolution = resolution[::-1]
|
74 |
+
|
75 |
+
# high-quality Lanczos down-scaling
|
76 |
+
target_resolution = np.array(resolution)
|
77 |
+
image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution)
|
78 |
+
|
79 |
+
# actual cropping (if necessary) with bilinear interpolation
|
80 |
+
offset_factor = 0.5
|
81 |
+
intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=offset_factor)
|
82 |
+
crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution)
|
83 |
+
image, depthmap, intrinsics2 = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox)
|
84 |
+
|
85 |
+
return image, depthmap, intrinsics2
|
86 |
+
|
87 |
+
def generate_crops_from_pair(self, view1, view2, resolution, aug_crop_arg, n_crops=4, rng=np.random):
|
88 |
+
views = [view1, view2]
|
89 |
+
|
90 |
+
if aug_crop_arg is False:
|
91 |
+
# compatibility
|
92 |
+
for i in range(2):
|
93 |
+
view = views[i]
|
94 |
+
view['img'], view['depthmap'], view['camera_intrinsics'] = self._crop_resize_if_necessary(view['img'],
|
95 |
+
view['depthmap'],
|
96 |
+
view['camera_intrinsics'],
|
97 |
+
resolution,
|
98 |
+
rng=rng)
|
99 |
+
view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'],
|
100 |
+
view['camera_intrinsics'],
|
101 |
+
view['camera_pose'])
|
102 |
+
return
|
103 |
+
|
104 |
+
# extract correspondences
|
105 |
+
corres = extract_correspondences_from_pts3d(*views, target_n_corres=None, rng=rng)
|
106 |
+
|
107 |
+
# generate 4 random crops in each view
|
108 |
+
view_crops = []
|
109 |
+
crops_resolution = []
|
110 |
+
corres_msks = []
|
111 |
+
for i in range(2):
|
112 |
+
|
113 |
+
if aug_crop_arg == 'auto':
|
114 |
+
S = min(views[i]['img'].size)
|
115 |
+
R = min(resolution)
|
116 |
+
aug_crop = S * (S - R) // R
|
117 |
+
aug_crop = max(.1 * S, aug_crop) # for cropping: augment scale of at least 10%, and more if possible
|
118 |
+
else:
|
119 |
+
aug_crop = aug_crop_arg
|
120 |
+
|
121 |
+
# transpose the target resolution if necessary
|
122 |
+
assert resolution[0] >= resolution[1]
|
123 |
+
W, H = imsize = views[i]['img'].size
|
124 |
+
crop_resolution = resolution
|
125 |
+
if H > 1.1 * W:
|
126 |
+
# image is portrait mode
|
127 |
+
crop_resolution = resolution[::-1]
|
128 |
+
elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
|
129 |
+
# image is square, so we choose (portrait, landscape) randomly
|
130 |
+
if rng.integers(2):
|
131 |
+
crop_resolution = resolution[::-1]
|
132 |
+
|
133 |
+
crops = gen_random_crops(imsize, n_crops, crop_resolution, aug_crop=aug_crop, rng=rng)
|
134 |
+
view_crops.append(crops)
|
135 |
+
crops_resolution.append(crop_resolution)
|
136 |
+
|
137 |
+
# compute correspondences
|
138 |
+
corres_msks.append(in2d_rect(corres[i], crops))
|
139 |
+
|
140 |
+
# compute IoU for each
|
141 |
+
intersection = np.float32(corres_msks[0]).T @ np.float32(corres_msks[1])
|
142 |
+
# select best pair of crops
|
143 |
+
best = np.unravel_index(intersection.argmax(), (n_crops, n_crops))
|
144 |
+
crops = [view_crops[i][c] for i, c in enumerate(best)]
|
145 |
+
|
146 |
+
# crop with the homography
|
147 |
+
for i in range(2):
|
148 |
+
view = views[i]
|
149 |
+
imsize, K_new, R, H = crop_to_homography(view['camera_intrinsics'], crops[i], crops_resolution[i])
|
150 |
+
# imsize, K_new, H = upscale_homography(imsize, resolution, K_new, H)
|
151 |
+
|
152 |
+
# update camera params
|
153 |
+
K_old = view['camera_intrinsics']
|
154 |
+
view['camera_intrinsics'] = K_new
|
155 |
+
view['camera_pose'] = view['camera_pose'].copy()
|
156 |
+
view['camera_pose'][:3, :3] = view['camera_pose'][:3, :3] @ R
|
157 |
+
|
158 |
+
# apply homography to image and depthmap
|
159 |
+
homo8 = (H / H[2, 2]).ravel().tolist()[:8]
|
160 |
+
view['img'] = view['img'].transform(imsize, Image.Transform.PERSPECTIVE,
|
161 |
+
homo8,
|
162 |
+
resample=Image.Resampling.BICUBIC)
|
163 |
+
|
164 |
+
depthmap2 = depthmap_to_camera_coordinates(view['depthmap'], K_old)[0] @ R[:, 2]
|
165 |
+
view['depthmap'] = np.array(Image.fromarray(depthmap2).transform(
|
166 |
+
imsize, Image.Transform.PERSPECTIVE, homo8))
|
167 |
+
|
168 |
+
if 'track_labels' in view:
|
169 |
+
# convert from uint64 --> uint32, because PIL.Image cannot handle uint64
|
170 |
+
mapping, track_labels = np.unique(view['track_labels'], return_inverse=True)
|
171 |
+
track_labels = track_labels.astype(np.uint32).reshape(view['track_labels'].shape)
|
172 |
+
|
173 |
+
# homography transformation
|
174 |
+
res = np.array(Image.fromarray(track_labels).transform(imsize, Image.Transform.PERSPECTIVE, homo8))
|
175 |
+
view['track_labels'] = mapping[res] # mapping back to uint64
|
176 |
+
|
177 |
+
# recompute 3d points from scratch
|
178 |
+
view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'],
|
179 |
+
view['camera_intrinsics'],
|
180 |
+
view['camera_pose'])
|
181 |
+
|
182 |
+
def __getitem__(self, idx):
|
183 |
+
if isinstance(idx, tuple):
|
184 |
+
# the idx is specifying the aspect-ratio
|
185 |
+
idx, ar_idx = idx
|
186 |
+
else:
|
187 |
+
assert len(self._resolutions) == 1
|
188 |
+
ar_idx = 0
|
189 |
+
|
190 |
+
# set-up the rng
|
191 |
+
if self.seed: # reseed for each __getitem__
|
192 |
+
self._rng = np.random.default_rng(seed=self.seed + idx)
|
193 |
+
elif not hasattr(self, '_rng'):
|
194 |
+
seed = torch.initial_seed() # this is different for each dataloader process
|
195 |
+
self._rng = np.random.default_rng(seed=seed)
|
196 |
+
|
197 |
+
# over-loaded code
|
198 |
+
resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
|
199 |
+
views = self._get_views(idx, resolution, self._rng)
|
200 |
+
assert len(views) == self.num_views
|
201 |
+
|
202 |
+
for v, view in enumerate(views):
|
203 |
+
assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
|
204 |
+
view['idx'] = (idx, ar_idx, v)
|
205 |
+
view['is_metric_scale'] = self.is_metric_scale
|
206 |
+
|
207 |
+
assert 'camera_intrinsics' in view
|
208 |
+
if 'camera_pose' not in view:
|
209 |
+
view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32)
|
210 |
+
else:
|
211 |
+
assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}'
|
212 |
+
assert 'pts3d' not in view
|
213 |
+
assert 'valid_mask' not in view
|
214 |
+
assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}'
|
215 |
+
|
216 |
+
pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
|
217 |
+
|
218 |
+
view['pts3d'] = pts3d
|
219 |
+
view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1)
|
220 |
+
|
221 |
+
self.generate_crops_from_pair(views[0], views[1], resolution=resolution,
|
222 |
+
aug_crop_arg=self.aug_crop,
|
223 |
+
n_crops=self.n_tentative_crops,
|
224 |
+
rng=self._rng)
|
225 |
+
for v, view in enumerate(views):
|
226 |
+
# encode the image
|
227 |
+
width, height = view['img'].size
|
228 |
+
view['true_shape'] = np.int32((height, width))
|
229 |
+
view['img'] = self.transform(view['img'])
|
230 |
+
# Pixels for which depth is fundamentally undefined
|
231 |
+
view['sky_mask'] = (view['depthmap'] < 0)
|
232 |
+
|
233 |
+
if self.aug_swap:
|
234 |
+
self._swap_view_aug(views)
|
235 |
+
|
236 |
+
if self.aug_monocular:
|
237 |
+
if self._rng.random() < self.aug_monocular:
|
238 |
+
views = [copy.deepcopy(views[0]) for _ in range(len(views))]
|
239 |
+
|
240 |
+
# automatic extraction of correspondences from pts3d + pose
|
241 |
+
if self.n_corres > 0 and ('corres' not in view):
|
242 |
+
corres1, corres2, valid = extract_correspondences_from_pts3d(*views, self.n_corres,
|
243 |
+
self._rng, nneg=self.nneg)
|
244 |
+
views[0]['corres'] = corres1
|
245 |
+
views[1]['corres'] = corres2
|
246 |
+
views[0]['valid_corres'] = valid
|
247 |
+
views[1]['valid_corres'] = valid
|
248 |
+
|
249 |
+
if self.aug_rot90 is False:
|
250 |
+
pass
|
251 |
+
elif self.aug_rot90 == 'same':
|
252 |
+
rotate_90(views, k=self._rng.choice(4))
|
253 |
+
elif self.aug_rot90 == 'diff':
|
254 |
+
rotate_90(views[:1], k=self._rng.choice(4))
|
255 |
+
rotate_90(views[1:], k=self._rng.choice(4))
|
256 |
+
else:
|
257 |
+
raise ValueError(f'Bad value for {self.aug_rot90=}')
|
258 |
+
|
259 |
+
# check data-types metric_scale
|
260 |
+
for v, view in enumerate(views):
|
261 |
+
if 'corres' not in view:
|
262 |
+
view['corres'] = np.full((self.n_corres, 2), np.nan, dtype=np.float32)
|
263 |
+
|
264 |
+
# check all datatypes
|
265 |
+
for key, val in view.items():
|
266 |
+
res, err_msg = is_good_type(key, val)
|
267 |
+
assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
|
268 |
+
K = view['camera_intrinsics']
|
269 |
+
|
270 |
+
# check shapes
|
271 |
+
assert view['depthmap'].shape == view['img'].shape[1:]
|
272 |
+
assert view['depthmap'].shape == view['pts3d'].shape[:2]
|
273 |
+
assert view['depthmap'].shape == view['valid_mask'].shape
|
274 |
+
|
275 |
+
# last thing done!
|
276 |
+
for view in views:
|
277 |
+
# transpose to make sure all views are the same size
|
278 |
+
transpose_to_landscape(view)
|
279 |
+
# this allows to check whether the RNG is is the same state each time
|
280 |
+
view['rng'] = int.from_bytes(self._rng.bytes(4), 'big')
|
281 |
+
|
282 |
+
return views
|
283 |
+
|
284 |
+
|
285 |
+
def transpose_to_landscape(view, revert=False):
|
286 |
+
height, width = view['true_shape']
|
287 |
+
|
288 |
+
if width < height:
|
289 |
+
if revert:
|
290 |
+
height, width = width, height
|
291 |
+
|
292 |
+
# rectify portrait to landscape
|
293 |
+
assert view['img'].shape == (3, height, width)
|
294 |
+
view['img'] = view['img'].swapaxes(1, 2)
|
295 |
+
|
296 |
+
assert view['valid_mask'].shape == (height, width)
|
297 |
+
view['valid_mask'] = view['valid_mask'].swapaxes(0, 1)
|
298 |
+
|
299 |
+
assert view['sky_mask'].shape == (height, width)
|
300 |
+
view['sky_mask'] = view['sky_mask'].swapaxes(0, 1)
|
301 |
+
|
302 |
+
assert view['depthmap'].shape == (height, width)
|
303 |
+
view['depthmap'] = view['depthmap'].swapaxes(0, 1)
|
304 |
+
|
305 |
+
assert view['pts3d'].shape == (height, width, 3)
|
306 |
+
view['pts3d'] = view['pts3d'].swapaxes(0, 1)
|
307 |
+
|
308 |
+
# transpose x and y pixels
|
309 |
+
view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]]
|
310 |
+
|
311 |
+
# transpose correspondences x and y
|
312 |
+
view['corres'] = view['corres'][:, [1, 0]]
|
313 |
+
|
314 |
+
|
315 |
+
def rotate_90(views, k=1):
|
316 |
+
from scipy.spatial.transform import Rotation
|
317 |
+
# print('rotation =', k)
|
318 |
+
|
319 |
+
RT = np.eye(4, dtype=np.float32)
|
320 |
+
RT[:3, :3] = Rotation.from_euler('z', 90 * k, degrees=True).as_matrix()
|
321 |
+
|
322 |
+
for view in views:
|
323 |
+
view['img'] = torch.rot90(view['img'], k=k, dims=(-2, -1)) # WARNING!! dims=(-1,-2) != dims=(-2,-1)
|
324 |
+
view['depthmap'] = np.rot90(view['depthmap'], k=k).copy()
|
325 |
+
view['camera_pose'] = view['camera_pose'] @ RT
|
326 |
+
|
327 |
+
RT2 = np.eye(3, dtype=np.float32)
|
328 |
+
RT2[:2, :2] = RT[:2, :2] * ((1, -1), (-1, 1))
|
329 |
+
H, W = view['depthmap'].shape
|
330 |
+
if k % 4 == 0:
|
331 |
+
pass
|
332 |
+
elif k % 4 == 1:
|
333 |
+
# top-left (0,0) pixel becomes (0,H-1)
|
334 |
+
RT2[:2, 2] = (0, H - 1)
|
335 |
+
elif k % 4 == 2:
|
336 |
+
# top-left (0,0) pixel becomes (W-1,H-1)
|
337 |
+
RT2[:2, 2] = (W - 1, H - 1)
|
338 |
+
elif k % 4 == 3:
|
339 |
+
# top-left (0,0) pixel becomes (W-1,0)
|
340 |
+
RT2[:2, 2] = (W - 1, 0)
|
341 |
+
else:
|
342 |
+
raise ValueError(f'Bad value for {k=}')
|
343 |
+
|
344 |
+
view['camera_intrinsics'][:2, 2] = geotrf(RT2, view['camera_intrinsics'][:2, 2])
|
345 |
+
if k % 2 == 1:
|
346 |
+
K = view['camera_intrinsics']
|
347 |
+
np.fill_diagonal(K, K.diagonal()[[1, 0, 2]])
|
348 |
+
|
349 |
+
pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
|
350 |
+
view['pts3d'] = pts3d
|
351 |
+
view['valid_mask'] = np.rot90(view['valid_mask'], k=k).copy()
|
352 |
+
view['sky_mask'] = np.rot90(view['sky_mask'], k=k).copy()
|
353 |
+
|
354 |
+
view['corres'] = geotrf(RT2, view['corres']).round().astype(view['corres'].dtype)
|
355 |
+
view['true_shape'] = np.int32((H, W))
|
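For reference, a minimal sketch of how a dataset derived from this base class is typically indexed; the class name and constructor arguments below are illustrative placeholders (not part of this commit), but the tuple index follows the (idx, ar_idx) convention handled by __getitem__ above:

# illustrative usage sketch only -- SomeMASt3RDataset and its arguments are hypothetical
dataset = SomeMASt3RDataset(split='train', resolution=[(512, 384)], n_corres=8192, seed=777)
views = dataset[(0, 0)]        # sample index 0, aspect-ratio index 0
view1, view2 = views           # dicts with 'img', 'depthmap', 'pts3d', 'valid_mask', 'corres', ...
print(view1['true_shape'], view1['corres'].shape)
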
mast3r/datasets/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/datasets/utils/cropping.py
ADDED
@@ -0,0 +1,219 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# cropping/match extraction
# --------------------------------------------------------
import numpy as np
import mast3r.utils.path_to_dust3r  # noqa
from dust3r.utils.device import to_numpy
from dust3r.utils.geometry import inv, geotrf


def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
    is_reciprocal1 = (corres_2_to_1[corres_1_to_2] == np.arange(len(corres_1_to_2)))
    pos1 = is_reciprocal1.nonzero()[0]
    pos2 = corres_1_to_2[pos1]
    if ret_recip:
        return is_reciprocal1, pos1, pos2
    return pos1, pos2


def extract_correspondences_from_pts3d(view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0):
    view1, view2 = to_numpy((view1, view2))
    # project pixels from image1 --> 3d points --> image2 pixels
    shape1, corres1_to_2 = reproject_view(view1['pts3d'], view2)
    shape2, corres2_to_1 = reproject_view(view2['pts3d'], view1)

    # compute reciprocal correspondences:
    # pos1 == valid pixels (correspondences) in image1
    is_reciprocal1, pos1, pos2 = reciprocal_1d(corres1_to_2, corres2_to_1, ret_recip=True)
    is_reciprocal2 = (corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1)))

    if target_n_corres is None:
        if ret_xy:
            pos1 = unravel_xy(pos1, shape1)
            pos2 = unravel_xy(pos2, shape2)
        return pos1, pos2

    available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
    target_n_positives = int(target_n_corres * (1 - nneg))
    n_positives = min(len(pos1), target_n_positives)
    n_negatives = min(target_n_corres - n_positives, available_negatives)

    if n_negatives + n_positives != target_n_corres:
        # should be really rare => when there are not enough negatives
        # in that case, break nneg and add a few more positives ?
        n_positives = target_n_corres - n_negatives
        assert n_positives <= len(pos1)

    assert n_positives <= len(pos1)
    assert n_positives <= len(pos2)
    assert n_negatives <= (~is_reciprocal1).sum()
    assert n_negatives <= (~is_reciprocal2).sum()
    assert n_positives + n_negatives == target_n_corres

    valid = np.ones(n_positives, dtype=bool)
    if n_positives < len(pos1):
        # random sub-sampling of valid correspondences
        perm = rng.permutation(len(pos1))[:n_positives]
        pos1 = pos1[perm]
        pos2 = pos2[perm]

    if n_negatives > 0:
        # add false correspondences if not enough
        def norm(p): return p / p.sum()
        pos1 = np.r_[pos1, rng.choice(shape1[0] * shape1[1], size=n_negatives, replace=False, p=norm(~is_reciprocal1))]
        pos2 = np.r_[pos2, rng.choice(shape2[0] * shape2[1], size=n_negatives, replace=False, p=norm(~is_reciprocal2))]
        valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]

    # convert (x+W*y) back to 2d (x,y) coordinates
    if ret_xy:
        pos1 = unravel_xy(pos1, shape1)
        pos2 = unravel_xy(pos2, shape2)
    return pos1, pos2, valid


def reproject_view(pts3d, view2):
    shape = view2['pts3d'].shape[:2]
    return reproject(pts3d, view2['camera_intrinsics'], inv(view2['camera_pose']), shape)


def reproject(pts3d, K, world2cam, shape):
    H, W, THREE = pts3d.shape
    assert THREE == 3

    # reproject in camera2 space
    with np.errstate(divide='ignore', invalid='ignore'):
        pos = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2)

    # quantize to pixel positions
    return (H, W), ravel_xy(pos, shape)


def ravel_xy(pos, shape):
    H, W = shape
    with np.errstate(invalid='ignore'):
        qx, qy = pos.reshape(-1, 2).round().astype(np.int32).T
    quantized_pos = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy)
    return quantized_pos


def unravel_xy(pos, shape):
    # convert (x+W*y) back to 2d (x,y) coordinates
    return np.unravel_index(pos, shape)[0].base[:, ::-1].copy()


def _rotation_origin_to_pt(target):
    """ Align the origin (0,0,1) with the target point (x,y,1) in projective space.
    Method: rotate z to put target on (x'+,0,1), then rotate on Y to get (0,0,1) and un-rotate z.
    """
    from scipy.spatial.transform import Rotation
    x, y = target
    rot_z = np.arctan2(y, x)
    rot_y = np.arctan(np.linalg.norm(target))
    R = Rotation.from_euler('ZYZ', [rot_z, rot_y, -rot_z]).as_matrix()
    return R


def _dotmv(Trf, pts, ncol=None, norm=False):
    assert Trf.ndim >= 2
    ncol = ncol or pts.shape[-1]

    # adapt shape if necessary
    output_reshape = pts.shape[:-1]
    if Trf.ndim >= 3:
        n = Trf.ndim - 2
        assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
        Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])

        if pts.ndim > Trf.ndim:
            # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
            pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
        elif pts.ndim == 2:
            # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
            pts = pts[:, None, :]

    if pts.shape[-1] + 1 == Trf.shape[-1]:
        Trf = Trf.swapaxes(-1, -2)  # transpose Trf
        pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]

    elif pts.shape[-1] == Trf.shape[-1]:
        Trf = Trf.swapaxes(-1, -2)  # transpose Trf
        pts = pts @ Trf
    else:
        pts = Trf @ pts.T
        if pts.ndim >= 2:
            pts = pts.swapaxes(-1, -2)

    if norm:
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    res = pts[..., :ncol].reshape(*output_reshape, ncol)
    return res


def crop_to_homography(K, crop, target_size=None):
    """ Given an image and its intrinsics,
    we want to replicate a rectangular crop with an homography,
    so that the principal point of the new 'crop' is centered.
    """
    # build intrinsics for the crop
    crop = np.round(crop)
    crop_size = crop[2:] - crop[:2]
    K2 = K.copy()  # same focal
    K2[:2, 2] = crop_size / 2  # new principal point is perfectly centered

    # find which corner is the most far-away from current principal point
    # so that the final homography does not go over the image borders
    corners = crop.reshape(-1, 2)
    corner_idx = np.abs(corners - K[:2, 2]).argmax(0)
    corner = corners[corner_idx, [0, 1]]
    # align with the corresponding corner from the target view
    corner2 = np.c_[[0, 0], crop_size][[0, 1], corner_idx]

    old_pt = _dotmv(np.linalg.inv(K), corner, norm=1)
    new_pt = _dotmv(np.linalg.inv(K2), corner2, norm=1)
    R = _rotation_origin_to_pt(old_pt) @ np.linalg.inv(_rotation_origin_to_pt(new_pt))

    if target_size is not None:
        imsize = target_size
        target_size = np.asarray(target_size)
        scaling = min(target_size / crop_size)
        K2[:2] *= scaling
        K2[:2, 2] = target_size / 2
    else:
        imsize = tuple(np.int32(crop_size).tolist())

    return imsize, K2, R, K @ R @ np.linalg.inv(K2)


def gen_random_crops(imsize, n_crops, resolution, aug_crop, rng=np.random):
    """ Generate random crops of size=resolution,
        for an input image upscaled to (imsize + randint(0 , aug_crop))
    """
    resolution_crop = np.array(resolution) * min(np.array(imsize) / resolution)

    # (virtually) upscale the input image
    # scaling = rng.uniform(1, 1+(aug_crop+1)/min(imsize))
    scaling = np.exp(rng.uniform(0, np.log(1 + aug_crop / min(imsize))))
    imsize2 = np.int32(np.array(imsize) * scaling)

    # generate some random crops
    topleft = rng.random((n_crops, 2)) * (imsize2 - resolution_crop)
    crops = np.c_[topleft, topleft + resolution_crop]
    # print(f"{scaling=}, {topleft=}")
    # reduce the resolution to come back to original size
    crops /= scaling
    return crops


def in2d_rect(corres, crops):
    # corres = (N,2)
    # crops = (M,4)
    # output = (N, M)
    is_sup = (corres[:, None] >= crops[None, :, 0:2])
    is_inf = (corres[:, None] < crops[None, :, 2:4])
    return (is_sup & is_inf).all(axis=-1)
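As a quick illustration of how these helpers fit together, the sketch below draws candidate crop boxes and tests which 2D correspondences fall inside each one; all values (image size, resolution, correspondences) are dummy numbers made up for the example:

# illustrative sketch only -- imsize, resolution and corres are dummy values
import numpy as np
rng = np.random.default_rng(0)
imsize, resolution = (640, 480), (512, 384)
crops = gen_random_crops(imsize, n_crops=4, resolution=resolution, aug_crop=256, rng=rng)  # (4, 4) boxes
corres = rng.random((100, 2)) * imsize        # fake (x, y) correspondences in this view
inside = in2d_rect(corres, crops)             # (100, 4) boolean mask, one column per crop
print(inside.sum(axis=0))                     # number of correspondences kept by each crop
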
mast3r/demo.py
ADDED
@@ -0,0 +1,321 @@
#!/usr/bin/env python3
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# sparse gradio demo functions
# --------------------------------------------------------
import math
import gradio
import os
import numpy as np
import functools
import trimesh
import copy
from scipy.spatial.transform import Rotation
import tempfile
import shutil

from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess

import mast3r.utils.path_to_dust3r  # noqa
from dust3r.image_pairs import make_pairs
from dust3r.utils.image import load_images
from dust3r.utils.device import to_numpy
from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes

import matplotlib.pyplot as pl


class SparseGAState():
    def __init__(self, sparse_ga, should_delete=False, cache_dir=None, outfile_name=None):
        self.sparse_ga = sparse_ga
        self.cache_dir = cache_dir
        self.outfile_name = outfile_name
        self.should_delete = should_delete

    def __del__(self):
        if not self.should_delete:
            return
        if self.cache_dir is not None and os.path.isdir(self.cache_dir):
            shutil.rmtree(self.cache_dir)
        self.cache_dir = None
        if self.outfile_name is not None and os.path.isfile(self.outfile_name):
            os.remove(self.outfile_name)
        self.outfile_name = None


def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
                                 cam_color=None, as_pointcloud=False,
                                 transparent_cams=False, silent=False):
    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
    pts3d = to_numpy(pts3d)
    imgs = to_numpy(imgs)
    focals = to_numpy(focals)
    cams2world = to_numpy(cams2world)

    scene = trimesh.Scene()

    # full pointcloud
    if as_pointcloud:
        pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)]).reshape(-1, 3)
        col = np.concatenate([p[m] for p, m in zip(imgs, mask)]).reshape(-1, 3)
        valid_msk = np.isfinite(pts.sum(axis=1))
        pct = trimesh.PointCloud(pts[valid_msk], colors=col[valid_msk])
        scene.add_geometry(pct)
    else:
        meshes = []
        for i in range(len(imgs)):
            pts3d_i = pts3d[i].reshape(imgs[i].shape)
            msk_i = mask[i] & np.isfinite(pts3d_i.sum(axis=-1))
            meshes.append(pts3d_to_trimesh(imgs[i], pts3d_i, msk_i))
        mesh = trimesh.Trimesh(**cat_meshes(meshes))
        scene.add_geometry(mesh)

    # add each camera
    for i, pose_c2w in enumerate(cams2world):
        if isinstance(cam_color, list):
            camera_edge_color = cam_color[i]
        else:
            camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
        add_scene_cam(scene, pose_c2w, camera_edge_color,
                      None if transparent_cams else imgs[i], focals[i],
                      imsize=imgs[i].shape[1::-1], screen_width=cam_size)

    rot = np.eye(4)
    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
    scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
    if not silent:
        print('(exporting 3D scene to', outfile, ')')
    scene.export(file_obj=outfile)
    return outfile


def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False,
                            clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0):
    """
    extract 3D_model (glb file) from a reconstructed scene
    """
    if scene_state is None:
        return None
    outfile = scene_state.outfile_name
    if outfile is None:
        return None

    # get optimized values from scene
    scene = scene_state.sparse_ga
    rgbimg = scene.imgs
    focals = scene.get_focals().cpu()
    cams2world = scene.get_im_poses().cpu()

    # 3D pointcloud from depthmap, poses and intrinsics
    if TSDF_thresh > 0:
        tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh)
        pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth))
    else:
        pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth))
    msk = to_numpy([c > min_conf_thr for c in confs])
    return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
                                        transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)


def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent, image_size, current_scene_state,
                            filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
                            as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, scenegraph_type, winsize,
                            win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw):
    """
    from a list of images, run mast3r inference, sparse global aligner.
    then run get_3D_model_from_scene
    """
    print(image_size, current_scene_state, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
          as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, scenegraph_type, winsize,
          win_cyclic, refid, TSDF_thresh, shared_intrinsics)
    # 512 None refine+depth 0.07 500 0.014 200 1.5 5 True False True False 0.2 logwin 6 False 0 0 True
    imgs = load_images(filelist, size=image_size, verbose=not silent)
    if len(imgs) == 1:
        imgs = [imgs[0], copy.deepcopy(imgs[0])]
        imgs[1]['idx'] = 1
        filelist = [filelist[0], filelist[0] + '_2']

    scene_graph_params = [scenegraph_type]
    if scenegraph_type in ["swin", "logwin"]:
        scene_graph_params.append(str(winsize))
    elif scenegraph_type == "oneref":
        scene_graph_params.append(str(refid))
    if scenegraph_type in ["swin", "logwin"] and not win_cyclic:
        scene_graph_params.append('noncyclic')
    scene_graph = '-'.join(scene_graph_params)
    pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True)
    print(pairs, len(imgs))
    if optim_level == 'coarse':
        niter2 = 0
    # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation)
    if current_scene_state is not None and \
            not current_scene_state.should_delete and \
            current_scene_state.cache_dir is not None:
        cache_dir = current_scene_state.cache_dir
    elif gradio_delete_cache:
        cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir)
    else:
        cache_dir = os.path.join(outdir, 'cache')
    os.makedirs(cache_dir, exist_ok=True)
    scene = sparse_global_alignment(filelist, pairs, cache_dir,
                                    model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device,
                                    opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics,
                                    matching_conf_thr=matching_conf_thr, **kw)
    if current_scene_state is not None and \
            not current_scene_state.should_delete and \
            current_scene_state.outfile_name is not None:
        outfile_name = current_scene_state.outfile_name
    else:
        outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir)

    scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name)
    outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky,
                                      clean_depth, transparent_cams, cam_size, TSDF_thresh)
    print(outfile)
    return scene_state, outfile


def set_scenegraph_options(inputfiles, win_cyclic, refid, scenegraph_type):
    num_files = len(inputfiles) if inputfiles is not None else 1
    show_win_controls = scenegraph_type in ["swin", "logwin"]
    show_winsize = scenegraph_type in ["swin", "logwin"]
    show_cyclic = scenegraph_type in ["swin", "logwin"]
    max_winsize, min_winsize = 1, 1
    if scenegraph_type == "swin":
        if win_cyclic:
            max_winsize = max(1, math.ceil((num_files - 1) / 2))
        else:
            max_winsize = num_files - 1
    elif scenegraph_type == "logwin":
        if win_cyclic:
            half_size = math.ceil((num_files - 1) / 2)
            max_winsize = max(1, math.ceil(math.log(half_size, 2)))
        else:
            max_winsize = max(1, math.ceil(math.log(num_files, 2)))
    winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
                            minimum=min_winsize, maximum=max_winsize, step=1, visible=show_winsize)
    win_cyclic = gradio.Checkbox(value=win_cyclic, label="Cyclic sequence", visible=show_cyclic)
    win_col = gradio.Column(visible=show_win_controls)
    refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
                          maximum=num_files - 1, step=1, visible=scenegraph_type == 'oneref')
    return win_col, winsize, win_cyclic, refid


def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False,
              share=False, gradio_delete_cache=False):
    if not silent:
        print('Outputing stuff in', tmpdirname)

    recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, gradio_delete_cache, model, device,
                                  silent, image_size)
    model_from_scene_fun = functools.partial(get_3D_model_from_scene, silent)

    def get_context(delete_cache):
        css = """.gradio-container {margin: 0 !important; min-width: 100%};"""
        title = "MASt3R Demo"
        if delete_cache:
            return gradio.Blocks(css=css, title=title, delete_cache=(delete_cache, delete_cache))
        else:
            return gradio.Blocks(css=css, title="MASt3R Demo")  # for compatibility with older versions

    with get_context(gradio_delete_cache) as demo:
        # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference
        scene = gradio.State(None)
        gradio.HTML('<h2 style="text-align: center;">MASt3R Demo</h2>')
        with gradio.Column():
            inputfiles = gradio.File(file_count="multiple")
            with gradio.Row():
                with gradio.Column():
                    with gradio.Row():
                        lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01)
                        niter1 = gradio.Number(value=500, precision=0, minimum=0, maximum=10_000,
                                               label="num_iterations", info="For coarse alignment!")
                        lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001)
                        niter2 = gradio.Number(value=200, precision=0, minimum=0, maximum=100_000,
                                               label="num_iterations", info="For refinement!")
                        optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"],
                                                      value='refine+depth', label="OptLevel",
                                                      info="Optimization level")
                    with gradio.Row():
                        matching_conf_thr = gradio.Slider(label="Matching Confidence Thr", value=5.,
                                                          minimum=0., maximum=30., step=0.1,
                                                          info="Before Fallback to Regr3D!")
                        shared_intrinsics = gradio.Checkbox(value=False, label="Shared intrinsics",
                                                            info="Only optimize one set of intrinsics for all views")
                        scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"),
                                                           ("swin: sliding window", "swin"),
                                                           ("logwin: sliding window with long range", "logwin"),
                                                           ("oneref: match one image with all", "oneref")],
                                                          value='complete', label="Scenegraph",
                                                          info="Define how to make pairs",
                                                          interactive=True)
                        with gradio.Column(visible=False) as win_col:
                            winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
                                                    minimum=1, maximum=1, step=1)
                            win_cyclic = gradio.Checkbox(value=False, label="Cyclic sequence")
                        refid = gradio.Slider(label="Scene Graph: Id", value=0,
                                              minimum=0, maximum=0, step=1, visible=False)
            run_btn = gradio.Button("Run")

            with gradio.Row():
                # adjust the confidence threshold
                min_conf_thr = gradio.Slider(label="min_conf_thr", value=1.5, minimum=0.0, maximum=10, step=0.1)
                # adjust the camera size in the output pointcloud
                cam_size = gradio.Slider(label="cam_size", value=0.2, minimum=0.001, maximum=1.0, step=0.001)
                TSDF_thresh = gradio.Slider(label="TSDF Threshold", value=0., minimum=0., maximum=1., step=0.01)
            with gradio.Row():
                as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud")
                # two post process implemented
                mask_sky = gradio.Checkbox(value=False, label="Mask sky")
                clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
                transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")

            outmodel = gradio.Model3D()

            # events
            scenegraph_type.change(set_scenegraph_options,
                                   inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
                                   outputs=[win_col, winsize, win_cyclic, refid])
            inputfiles.change(set_scenegraph_options,
                              inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
                              outputs=[win_col, winsize, win_cyclic, refid])
            win_cyclic.change(set_scenegraph_options,
                              inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
                              outputs=[win_col, winsize, win_cyclic, refid])
            run_btn.click(fn=recon_fun,
                          inputs=[scene, inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
                                  as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
                                  scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics],
                          outputs=[scene, outmodel])
            min_conf_thr.release(fn=model_from_scene_fun,
                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                         clean_depth, transparent_cams, cam_size, TSDF_thresh],
                                 outputs=outmodel)
            cam_size.change(fn=model_from_scene_fun,
                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                    clean_depth, transparent_cams, cam_size, TSDF_thresh],
                            outputs=outmodel)
            TSDF_thresh.change(fn=model_from_scene_fun,
                               inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                       clean_depth, transparent_cams, cam_size, TSDF_thresh],
                               outputs=outmodel)
            as_pointcloud.change(fn=model_from_scene_fun,
                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                         clean_depth, transparent_cams, cam_size, TSDF_thresh],
                                 outputs=outmodel)
            mask_sky.change(fn=model_from_scene_fun,
                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                    clean_depth, transparent_cams, cam_size, TSDF_thresh],
                            outputs=outmodel)
            clean_depth.change(fn=model_from_scene_fun,
                               inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                       clean_depth, transparent_cams, cam_size, TSDF_thresh],
                               outputs=outmodel)
            transparent_cams.change(model_from_scene_fun,
                                    inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
                                            clean_depth, transparent_cams, cam_size, TSDF_thresh],
                                    outputs=outmodel)
    demo.launch(share=share, server_name=server_name, server_port=server_port)
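A minimal sketch of how this entry point might be wired up; the model object, device and port values below are placeholders (model loading lives outside this file, e.g. in mast3r/model.py, which is not shown here):

# illustrative sketch only -- 'model' is assumed to be a MASt3R network loaded elsewhere
import tempfile
with tempfile.TemporaryDirectory(suffix='_mast3r_gradio_demo') as tmpdirname:
    main_demo(tmpdirname, model, device='cuda', image_size=512,
              server_name='0.0.0.0', server_port=7860, silent=False,
              share=False, gradio_delete_cache=False)
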
mast3r/fast_nn.py
ADDED
@@ -0,0 +1,223 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# MASt3R Fast Nearest Neighbor
# --------------------------------------------------------
import torch
import numpy as np
import math
from scipy.spatial import KDTree

import mast3r.utils.path_to_dust3r  # noqa
from dust3r.utils.device import to_numpy, todevice  # noqa


@torch.no_grad()
def bruteforce_reciprocal_nns(A, B, device='cuda', block_size=None, dist='l2'):
    if isinstance(A, np.ndarray):
        A = torch.from_numpy(A).to(device)
    if isinstance(B, np.ndarray):
        B = torch.from_numpy(B).to(device)

    A = A.to(device)
    B = B.to(device)

    if dist == 'l2':
        dist_func = torch.cdist
        argmin = torch.min
    elif dist == 'dot':
        def dist_func(A, B):
            return A @ B.T

        def argmin(X, dim):
            sim, nn = torch.max(X, dim=dim)
            return sim.neg_(), nn
    else:
        raise ValueError(f'Unknown {dist=}')

    if block_size is None or len(A) * len(B) <= block_size**2:
        dists = dist_func(A, B)
        _, nn_A = argmin(dists, dim=1)
        _, nn_B = argmin(dists, dim=0)
    else:
        dis_A = torch.full((A.shape[0],), float('inf'), device=device, dtype=A.dtype)
        dis_B = torch.full((B.shape[0],), float('inf'), device=device, dtype=B.dtype)
        nn_A = torch.full((A.shape[0],), -1, device=device, dtype=torch.int64)
        nn_B = torch.full((B.shape[0],), -1, device=device, dtype=torch.int64)
        number_of_iteration_A = math.ceil(A.shape[0] / block_size)
        number_of_iteration_B = math.ceil(B.shape[0] / block_size)

        for i in range(number_of_iteration_A):
            A_i = A[i * block_size:(i + 1) * block_size]
            for j in range(number_of_iteration_B):
                B_j = B[j * block_size:(j + 1) * block_size]
                dists_blk = dist_func(A_i, B_j)  # A, B, 1
                # dists_blk = dists[i * block_size:(i+1)*block_size, j * block_size:(j+1)*block_size]
                min_A_i, argmin_A_i = argmin(dists_blk, dim=1)
                min_B_j, argmin_B_j = argmin(dists_blk, dim=0)

                col_mask = min_A_i < dis_A[i * block_size:(i + 1) * block_size]
                line_mask = min_B_j < dis_B[j * block_size:(j + 1) * block_size]

                dis_A[i * block_size:(i + 1) * block_size][col_mask] = min_A_i[col_mask]
                dis_B[j * block_size:(j + 1) * block_size][line_mask] = min_B_j[line_mask]

                nn_A[i * block_size:(i + 1) * block_size][col_mask] = argmin_A_i[col_mask] + (j * block_size)
                nn_B[j * block_size:(j + 1) * block_size][line_mask] = argmin_B_j[line_mask] + (i * block_size)
    nn_A = nn_A.cpu().numpy()
    nn_B = nn_B.cpu().numpy()
    return nn_A, nn_B


class cdistMatcher:
    def __init__(self, db_pts, device='cuda'):
        self.db_pts = db_pts.to(device)
        self.device = device

    def query(self, queries, k=1, **kw):
        assert k == 1
        if queries.numel() == 0:
            return None, []
        nnA, nnB = bruteforce_reciprocal_nns(queries, self.db_pts, device=self.device, **kw)
        dis = None
        return dis, nnA


def merge_corres(idx1, idx2, shape1=None, shape2=None, ret_xy=True, ret_index=False):
    assert idx1.dtype == idx2.dtype == np.int32

    # unique and sort along idx1
    corres = np.unique(np.c_[idx2, idx1].view(np.int64), return_index=ret_index)
    if ret_index:
        corres, indices = corres
    xy2, xy1 = corres[:, None].view(np.int32).T

    if ret_xy:
        assert shape1 and shape2
        xy1 = np.unravel_index(xy1, shape1)
        xy2 = np.unravel_index(xy2, shape2)
        if ret_xy != 'y_x':
            xy1 = xy1[0].base[:, ::-1]
            xy2 = xy2[0].base[:, ::-1]

    if ret_index:
        return xy1, xy2, indices
    return xy1, xy2


def fast_reciprocal_NNs(pts1, pts2, subsample_or_initxy1=8, ret_xy=True, pixel_tol=0, ret_basin=False,
                        device='cuda', **matcher_kw):
    H1, W1, DIM1 = pts1.shape
    H2, W2, DIM2 = pts2.shape
    assert DIM1 == DIM2

    pts1 = pts1.reshape(-1, DIM1)
    pts2 = pts2.reshape(-1, DIM2)

    if isinstance(subsample_or_initxy1, int) and pixel_tol == 0:
        S = subsample_or_initxy1
        y1, x1 = np.mgrid[S // 2:H1:S, S // 2:W1:S].reshape(2, -1)
        max_iter = 10
    else:
        x1, y1 = subsample_or_initxy1
        if isinstance(x1, torch.Tensor):
            x1 = x1.cpu().numpy()
        if isinstance(y1, torch.Tensor):
            y1 = y1.cpu().numpy()
        max_iter = 1

    xy1 = np.int32(np.unique(x1 + W1 * y1))  # make sure there's no doublons
    xy2 = np.full_like(xy1, -1)
    old_xy1 = xy1.copy()
    old_xy2 = xy2.copy()

    if 'dist' in matcher_kw or 'block_size' in matcher_kw \
            or (isinstance(device, str) and device.startswith('cuda')) \
            or (isinstance(device, torch.device) and device.type.startswith('cuda')):
        pts1 = pts1.to(device)
        pts2 = pts2.to(device)
        tree1 = cdistMatcher(pts1, device=device)
        tree2 = cdistMatcher(pts2, device=device)
    else:
        pts1, pts2 = to_numpy((pts1, pts2))
        tree1 = KDTree(pts1)
        tree2 = KDTree(pts2)

    notyet = np.ones(len(xy1), dtype=bool)
    basin = np.full((H1 * W1 + 1,), -1, dtype=np.int32) if ret_basin else None

    niter = 0
    # n_notyet = [len(notyet)]
    while notyet.any():
        _, xy2[notyet] = to_numpy(tree2.query(pts1[xy1[notyet]], **matcher_kw))
        if not ret_basin:
            notyet &= (old_xy2 != xy2)  # remove points that have converged

        _, xy1[notyet] = to_numpy(tree1.query(pts2[xy2[notyet]], **matcher_kw))
        if ret_basin:
            basin[old_xy1[notyet]] = xy1[notyet]
        notyet &= (old_xy1 != xy1)  # remove points that have converged

        # n_notyet.append(notyet.sum())
        niter += 1
        if niter >= max_iter:
            break

        old_xy2[:] = xy2
        old_xy1[:] = xy1

    # print('notyet_stats:', ' '.join(map(str, (n_notyet+[0]*10)[:max_iter])))

    if pixel_tol > 0:
        # in case we only want to match some specific points
        # and still have some way of checking reciprocity
        old_yx1 = np.unravel_index(old_xy1, (H1, W1))[0].base
        new_yx1 = np.unravel_index(xy1, (H1, W1))[0].base
        dis = np.linalg.norm(old_yx1 - new_yx1, axis=-1)
        converged = dis < pixel_tol
        if not isinstance(subsample_or_initxy1, int):
            xy1 = old_xy1  # replace new points by old ones
    else:
        converged = ~notyet  # converged correspondences

    # keep only unique correspondences, and sort on xy1
    xy1, xy2 = merge_corres(xy1[converged], xy2[converged], (H1, W1), (H2, W2), ret_xy=ret_xy)
    if ret_basin:
        return xy1, xy2, basin
    return xy1, xy2


def extract_correspondences_nonsym(A, B, confA, confB, subsample=8, device=None, ptmap_key='pred_desc', pixel_tol=0):
    if '3d' in ptmap_key:
        opt = dict(device='cpu', workers=32)
    else:
        opt = dict(device=device, dist='dot', block_size=2**13)

    # matching the two pairs
    idx1 = []
    idx2 = []
    # merge corres from opposite pairs
    HA, WA = A.shape[:2]
    HB, WB = B.shape[:2]
    if pixel_tol == 0:
        nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt)
        nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt)
    else:
        S = subsample
        yA, xA = np.mgrid[S // 2:HA:S, S // 2:WA:S].reshape(2, -1)
        yB, xB = np.mgrid[S // 2:HB:S, S // 2:WB:S].reshape(2, -1)

        nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=(xA, yA), ret_xy=False, pixel_tol=pixel_tol, **opt)
        nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=(xB, yB), ret_xy=False, pixel_tol=pixel_tol, **opt)

    idx1 = np.r_[nn1to2[0], nn2to1[1]]
    idx2 = np.r_[nn1to2[1], nn2to1[0]]

    c1 = confA.ravel()[idx1]
    c2 = confB.ravel()[idx2]

    xy1, xy2, idx = merge_corres(idx1, idx2, (HA, WA), (HB, WB), ret_xy=True, ret_index=True)
    conf = np.minimum(c1[idx], c2[idx])
    corres = (xy1.copy(), xy2.copy(), conf)
    return todevice(corres, device)
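A short sketch of the intended use of fast_reciprocal_NNs for descriptor matching; the desc1/desc2 tensors below are random dummies standing in for the per-pixel descriptor maps that the network would normally produce:

# illustrative sketch only -- random (H, W, D) descriptor maps instead of real network outputs
import torch
desc1 = torch.randn(384, 512, 24)
desc2 = torch.randn(384, 512, 24)
xy1, xy2 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
                               device='cpu', dist='dot', block_size=2**13)
# xy1, xy2 are (N, 2) integer arrays of matching (x, y) pixel coordinates in each image
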
mast3r/losses.py
ADDED
@@ -0,0 +1,508 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# Implementation of MASt3R training losses
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
import numpy as np
|
10 |
+
from sklearn.metrics import average_precision_score
|
11 |
+
|
12 |
+
import mast3r.utils.path_to_dust3r # noqa
|
13 |
+
from dust3r.losses import BaseCriterion, Criterion, MultiLoss, Sum, ConfLoss
|
14 |
+
from dust3r.losses import Regr3D as Regr3D_dust3r
|
15 |
+
from dust3r.utils.geometry import (geotrf, inv, normalize_pointcloud)
|
16 |
+
from dust3r.inference import get_pred_pts3d
|
17 |
+
from dust3r.utils.geometry import get_joint_pointcloud_depth, get_joint_pointcloud_center_scale
|
18 |
+
|
19 |
+
|
20 |
+
def apply_log_to_norm(xyz):
|
21 |
+
d = xyz.norm(dim=-1, keepdim=True)
|
22 |
+
xyz = xyz / d.clip(min=1e-8)
|
23 |
+
xyz = xyz * torch.log1p(d)
|
24 |
+
return xyz
|
25 |
+
|
26 |
+
|
27 |
+
class Regr3D (Regr3D_dust3r):
|
28 |
+
def __init__(self, criterion, norm_mode='avg_dis', gt_scale=False, opt_fit_gt=False,
|
29 |
+
sky_loss_value=2, max_metric_scale=False, loss_in_log=False):
|
30 |
+
self.loss_in_log = loss_in_log
|
31 |
+
if norm_mode.startswith('?'):
|
32 |
+
# do no norm pts from metric scale datasets
|
33 |
+
self.norm_all = False
|
34 |
+
self.norm_mode = norm_mode[1:]
|
35 |
+
else:
|
36 |
+
self.norm_all = True
|
37 |
+
self.norm_mode = norm_mode
|
38 |
+
super().__init__(criterion, self.norm_mode, gt_scale)
|
39 |
+
|
40 |
+
self.sky_loss_value = sky_loss_value
|
41 |
+
self.max_metric_scale = max_metric_scale
|
42 |
+
|
43 |
+
def get_all_pts3d(self, gt1, gt2, pred1, pred2, dist_clip=None):
|
44 |
+
# everything is normalized w.r.t. camera of view1
|
45 |
+
in_camera1 = inv(gt1['camera_pose'])
|
46 |
+
gt_pts1 = geotrf(in_camera1, gt1['pts3d']) # B,H,W,3
|
47 |
+
gt_pts2 = geotrf(in_camera1, gt2['pts3d']) # B,H,W,3
|
48 |
+
|
49 |
+
valid1 = gt1['valid_mask'].clone()
|
50 |
+
valid2 = gt2['valid_mask'].clone()
|
51 |
+
|
52 |
+
if dist_clip is not None:
|
53 |
+
# points that are too far-away == invalid
|
54 |
+
dis1 = gt_pts1.norm(dim=-1) # (B, H, W)
|
55 |
+
dis2 = gt_pts2.norm(dim=-1) # (B, H, W)
|
56 |
+
valid1 = valid1 & (dis1 <= dist_clip)
|
57 |
+
valid2 = valid2 & (dis2 <= dist_clip)
|
58 |
+
|
59 |
+
if self.loss_in_log == 'before':
|
60 |
+
# this only make sense when depth_mode == 'linear'
|
61 |
+
gt_pts1 = apply_log_to_norm(gt_pts1)
|
62 |
+
gt_pts2 = apply_log_to_norm(gt_pts2)
|
63 |
+
|
64 |
+
pr_pts1 = get_pred_pts3d(gt1, pred1, use_pose=False).clone()
|
65 |
+
pr_pts2 = get_pred_pts3d(gt2, pred2, use_pose=True).clone()
|
66 |
+
|
67 |
+
if not self.norm_all:
|
68 |
+
if self.max_metric_scale:
|
69 |
+
B = valid1.shape[0]
|
70 |
+
# valid1: B, H, W
|
71 |
+
# torch.linalg.norm(gt_pts1, dim=-1) -> B, H, W
|
72 |
+
# dist1_to_cam1 -> reshape to B, H*W
|
73 |
+
dist1_to_cam1 = torch.where(valid1, torch.linalg.norm(gt_pts1, dim=-1), 0).view(B, -1)
|
74 |
+
dist2_to_cam1 = torch.where(valid2, torch.linalg.norm(gt_pts2, dim=-1), 0).view(B, -1)
|
75 |
+
|
76 |
+
# is_metric_scale: B
|
77 |
+
# dist1_to_cam1.max(dim=-1).values -> B
|
78 |
+
gt1['is_metric_scale'] = gt1['is_metric_scale'] \
|
79 |
+
& (dist1_to_cam1.max(dim=-1).values < self.max_metric_scale) \
|
80 |
+
& (dist2_to_cam1.max(dim=-1).values < self.max_metric_scale)
|
81 |
+
gt2['is_metric_scale'] = gt1['is_metric_scale']
|
82 |
+
|
83 |
+
mask = ~gt1['is_metric_scale']
|
84 |
+
else:
|
85 |
+
mask = torch.ones_like(gt1['is_metric_scale'])
|
86 |
+
# normalize 3d points
|
87 |
+
if self.norm_mode and mask.any():
|
88 |
+
pr_pts1[mask], pr_pts2[mask] = normalize_pointcloud(pr_pts1[mask], pr_pts2[mask], self.norm_mode,
|
89 |
+
valid1[mask], valid2[mask])
|
90 |
+
|
91 |
+
if self.norm_mode and not self.gt_scale:
|
92 |
+
gt_pts1, gt_pts2, norm_factor = normalize_pointcloud(gt_pts1, gt_pts2, self.norm_mode,
|
93 |
+
valid1, valid2, ret_factor=True)
|
94 |
+
# apply the same normalization to prediction
|
95 |
+
pr_pts1[~mask] = pr_pts1[~mask] / norm_factor[~mask]
|
96 |
+
pr_pts2[~mask] = pr_pts2[~mask] / norm_factor[~mask]
|
97 |
+
|
98 |
+
# return sky segmentation, making sure they don't include any labelled 3d points
|
99 |
+
sky1 = gt1['sky_mask'] & (~valid1)
|
100 |
+
sky2 = gt2['sky_mask'] & (~valid2)
|
101 |
+
return gt_pts1, gt_pts2, pr_pts1, pr_pts2, valid1, valid2, sky1, sky2, {}
|
102 |
+
|
103 |
+
def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
|
104 |
+
gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
|
105 |
+
self.get_all_pts3d(gt1, gt2, pred1, pred2, **kw)
|
106 |
+
|
107 |
+
if self.sky_loss_value > 0:
|
108 |
+
assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss'
|
109 |
+
# add the sky pixel as "valid" pixels...
|
110 |
+
mask1 = mask1 | sky1
|
111 |
+
mask2 = mask2 | sky2
|
112 |
+
|
113 |
+
# loss on img1 side
|
114 |
+
pred_pts1 = pred_pts1[mask1]
|
115 |
+
gt_pts1 = gt_pts1[mask1]
|
116 |
+
if self.loss_in_log and self.loss_in_log != 'before':
|
117 |
+
# this only make sense when depth_mode == 'exp'
|
118 |
+
pred_pts1 = apply_log_to_norm(pred_pts1)
|
119 |
+
gt_pts1 = apply_log_to_norm(gt_pts1)
|
120 |
+
l1 = self.criterion(pred_pts1, gt_pts1)
|
121 |
+
|
122 |
+
# loss on gt2 side
|
123 |
+
pred_pts2 = pred_pts2[mask2]
|
124 |
+
gt_pts2 = gt_pts2[mask2]
|
125 |
+
if self.loss_in_log and self.loss_in_log != 'before':
|
126 |
+
pred_pts2 = apply_log_to_norm(pred_pts2)
|
127 |
+
gt_pts2 = apply_log_to_norm(gt_pts2)
|
128 |
+
l2 = self.criterion(pred_pts2, gt_pts2)
|
129 |
+
|
130 |
+
if self.sky_loss_value > 0:
|
131 |
+
assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss'
|
132 |
+
# ... but force the loss to be high there
|
133 |
+
l1 = torch.where(sky1[mask1], self.sky_loss_value, l1)
|
134 |
+
l2 = torch.where(sky2[mask2], self.sky_loss_value, l2)
|
135 |
+
self_name = type(self).__name__
|
136 |
+
details = {self_name + '_pts3d_1': float(l1.mean()), self_name + '_pts3d_2': float(l2.mean())}
|
137 |
+
return Sum((l1, mask1), (l2, mask2)), (details | monitoring)
|
138 |
+
|
139 |
+
|
140 |
+
class Regr3D_ShiftInv (Regr3D):
|
141 |
+
""" Same than Regr3D but invariant to depth shift.
|
142 |
+
"""
|
143 |
+
|
144 |
+
def get_all_pts3d(self, gt1, gt2, pred1, pred2):
|
145 |
+
# compute unnormalized points
|
146 |
+
gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
|
147 |
+
super().get_all_pts3d(gt1, gt2, pred1, pred2)
|
148 |
+
|
149 |
+
# compute median depth
|
150 |
+
gt_z1, gt_z2 = gt_pts1[..., 2], gt_pts2[..., 2]
|
151 |
+
pred_z1, pred_z2 = pred_pts1[..., 2], pred_pts2[..., 2]
|
152 |
+
gt_shift_z = get_joint_pointcloud_depth(gt_z1, gt_z2, mask1, mask2)[:, None, None]
|
153 |
+
pred_shift_z = get_joint_pointcloud_depth(pred_z1, pred_z2, mask1, mask2)[:, None, None]
|
154 |
+
|
155 |
+
# subtract the median depth
|
156 |
+
gt_z1 -= gt_shift_z
|
157 |
+
gt_z2 -= gt_shift_z
|
158 |
+
pred_z1 -= pred_shift_z
|
159 |
+
pred_z2 -= pred_shift_z
|
160 |
+
|
161 |
+
# monitoring = dict(monitoring, gt_shift_z=gt_shift_z.mean().detach(), pred_shift_z=pred_shift_z.mean().detach())
|
162 |
+
return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring
|
163 |
+
|
164 |
+
|
165 |
+
class Regr3D_ScaleInv (Regr3D):
|
166 |
+
""" Same than Regr3D but invariant to depth scale.
|
167 |
+
if gt_scale == True: enforce the prediction to take the same scale than GT
|
168 |
+
"""
|
169 |
+
|
170 |
+
def get_all_pts3d(self, gt1, gt2, pred1, pred2):
|
171 |
+
# compute depth-normalized points
|
172 |
+
gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
|
173 |
+
super().get_all_pts3d(gt1, gt2, pred1, pred2)
|
174 |
+
|
175 |
+
# measure scene scale
|
176 |
+
_, gt_scale = get_joint_pointcloud_center_scale(gt_pts1, gt_pts2, mask1, mask2)
|
177 |
+
_, pred_scale = get_joint_pointcloud_center_scale(pred_pts1, pred_pts2, mask1, mask2)
|
178 |
+
|
179 |
+
# prevent predictions to be in a ridiculous range
|
180 |
+
pred_scale = pred_scale.clip(min=1e-3, max=1e3)
|
181 |
+
|
182 |
+
# subtract the median depth
|
183 |
+
if self.gt_scale:
|
184 |
+
pred_pts1 *= gt_scale / pred_scale
|
185 |
+
pred_pts2 *= gt_scale / pred_scale
|
186 |
+
# monitoring = dict(monitoring, pred_scale=(pred_scale/gt_scale).mean())
|
187 |
+
else:
|
188 |
+
gt_pts1 /= gt_scale
|
189 |
+
gt_pts2 /= gt_scale
|
190 |
+
pred_pts1 /= pred_scale
|
191 |
+
pred_pts2 /= pred_scale
|
192 |
+
# monitoring = dict(monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach())
|
193 |
+
|
194 |
+
return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring
|
195 |
+
|
196 |
+
|
197 |
+
class Regr3D_ScaleShiftInv (Regr3D_ScaleInv, Regr3D_ShiftInv):
|
198 |
+
# calls Regr3D_ShiftInv first, then Regr3D_ScaleInv
|
199 |
+
pass
|
200 |
+
|
201 |
+
|
202 |
+
def get_similarities(desc1, desc2, euc=False):
|
203 |
+
if euc: # euclidean distance in same range than similarities
|
204 |
+
dists = (desc1[:, :, None] - desc2[:, None]).norm(dim=-1)
|
205 |
+
sim = 1 / (1 + dists)
|
206 |
+
else:
|
207 |
+
# Compute similarities
|
208 |
+
sim = desc1 @ desc2.transpose(-2, -1)
|
209 |
+
return sim
|
210 |
+
|
211 |
+
|
212 |
+
class MatchingCriterion(BaseCriterion):
|
213 |
+
def __init__(self, reduction='mean', fp=torch.float32):
|
214 |
+
super().__init__(reduction)
|
215 |
+
self.fp = fp
|
216 |
+
|
217 |
+
def forward(self, a, b, valid_matches=None, euc=False):
|
218 |
+
assert a.ndim >= 2 and 1 <= a.shape[-1], f'Bad shape = {a.shape}'
|
219 |
+
dist = self.loss(a.to(self.fp), b.to(self.fp), valid_matches, euc=euc)
|
220 |
+
# one dimension less or reduction to single value
|
221 |
+
assert (valid_matches is None and dist.ndim == a.ndim -
|
222 |
+
1) or self.reduction in ['mean', 'sum', '1-mean', 'none']
|
223 |
+
if self.reduction == 'none':
|
224 |
+
return dist
|
225 |
+
if self.reduction == 'sum':
|
226 |
+
return dist.sum()
|
227 |
+
if self.reduction == 'mean':
|
228 |
+
return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
|
229 |
+
if self.reduction == '1-mean':
|
230 |
+
return 1. - dist.mean() if dist.numel() > 0 else dist.new_ones(())
|
231 |
+
raise ValueError(f'bad {self.reduction=} mode')
|
232 |
+
|
233 |
+
def loss(self, a, b, valid_matches=None):
|
234 |
+
raise NotImplementedError
|
235 |
+
|
236 |
+
|
237 |
+
class InfoNCE(MatchingCriterion):
|
238 |
+
def __init__(self, temperature=0.07, eps=1e-8, mode='all', **kwargs):
|
239 |
+
super().__init__(**kwargs)
|
240 |
+
self.temperature = temperature
|
241 |
+
self.eps = eps
|
242 |
+
assert mode in ['all', 'proper', 'dual']
|
243 |
+
self.mode = mode
|
244 |
+
|
245 |
+
def loss(self, desc1, desc2, valid_matches=None, euc=False):
|
246 |
+
# valid positives are along diagonals
|
247 |
+
B, N, D = desc1.shape
|
248 |
+
B2, N2, D2 = desc2.shape
|
249 |
+
assert B == B2 and D == D2
|
250 |
+
if valid_matches is None:
|
251 |
+
valid_matches = torch.ones([B, N], dtype=bool)
|
252 |
+
# note: some pairs may have no valid matches at all, so we cannot assert torch.all(valid_matches.sum(dim=-1) > 0)
|
253 |
+
assert valid_matches.shape == torch.Size([B, N]) and valid_matches.sum() > 0
|
254 |
+
|
255 |
+
# Tempered similarities
|
256 |
+
sim = get_similarities(desc1, desc2, euc) / self.temperature
|
257 |
+
sim[sim.isnan()] = -torch.inf # ignore nans
|
258 |
+
# Softmax of positives with temperature
|
259 |
+
sim = sim.exp_() # save peak memory
|
260 |
+
positives = sim.diagonal(dim1=-2, dim2=-1)
|
261 |
+
|
262 |
+
# Loss
|
263 |
+
if self.mode == 'all': # Previous InfoNCE
|
264 |
+
loss = -torch.log((positives / sim.sum(dim=-1).sum(dim=-1, keepdim=True)).clip(self.eps))
|
265 |
+
elif self.mode == 'proper': # Proper InfoNCE
|
266 |
+
loss = -(torch.log((positives / sim.sum(dim=-2)).clip(self.eps)) +
|
267 |
+
torch.log((positives / sim.sum(dim=-1)).clip(self.eps)))
|
268 |
+
elif self.mode == 'dual': # Dual Softmax
|
269 |
+
loss = -(torch.log((positives**2 / sim.sum(dim=-1) / sim.sum(dim=-2)).clip(self.eps)))
|
270 |
+
else:
|
271 |
+
raise ValueError("This should not happen...")
|
272 |
+
return loss[valid_matches]
|
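A usage sketch on random data, assuming the reduction behavior inherited from the dust3r BaseCriterion (toy shapes, not part of this commit):

import torch
import torch.nn.functional as F

crit = InfoNCE(temperature=0.07, mode='proper', reduction='mean')
desc1 = F.normalize(torch.randn(2, 64, 24), dim=-1)
desc2 = F.normalize(torch.randn(2, 64, 24), dim=-1)
valid = torch.ones(2, 64, dtype=bool)
print(crit(desc1, desc2, valid_matches=valid))  # scalar loss; positives sit on the diagonal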
273 |
+
|
274 |
+
|
275 |
+
class APLoss (MatchingCriterion):
|
276 |
+
""" AP loss.
|
277 |
+
"""
|
278 |
+
|
279 |
+
def __init__(self, nq='torch', min=0, max=1, euc=False, **kw):
|
280 |
+
super().__init__(**kw)
|
281 |
+
# Exact/True AP loss (not differentiable)
|
282 |
+
if nq == 0:
|
283 |
+
nq = 'sklearn' # special case
|
284 |
+
try:
|
285 |
+
self.compute_AP = eval('self.compute_true_AP_' + nq)
|
286 |
+
except Exception:
|
287 |
+
raise ValueError("Unknown mode %s for AP loss" % nq)
|
288 |
+
|
289 |
+
@staticmethod
|
290 |
+
def compute_true_AP_sklearn(scores, labels):
|
291 |
+
def compute_AP(label, score):
|
292 |
+
return average_precision_score(label, score)
|
293 |
+
|
294 |
+
aps = scores.new_zeros((scores.shape[0], scores.shape[1]))
|
295 |
+
label_np = labels.cpu().numpy().astype(bool)
|
296 |
+
scores_np = scores.cpu().numpy()
|
297 |
+
for bi in range(scores_np.shape[0]):
|
298 |
+
for i in range(scores_np.shape[1]):
|
299 |
+
labels = label_np[bi, i, :]
|
300 |
+
if labels.sum() < 1:
|
301 |
+
continue
|
302 |
+
aps[bi, i] = compute_AP(labels, scores_np[bi, i, :])
|
303 |
+
return aps
|
304 |
+
|
305 |
+
@staticmethod
|
306 |
+
def compute_true_AP_torch(scores, labels):
|
307 |
+
assert scores.shape == labels.shape
|
308 |
+
B, N, M = labels.shape
|
309 |
+
dev = labels.device
|
310 |
+
with torch.no_grad():
|
311 |
+
# sort scores
|
312 |
+
_, order = scores.sort(dim=-1, descending=True)
|
313 |
+
# sort labels accordingly
|
314 |
+
labels = labels[torch.arange(B, device=dev)[:, None, None].expand(order.shape),
|
315 |
+
torch.arange(N, device=dev)[None, :, None].expand(order.shape),
|
316 |
+
order]
|
317 |
+
# compute number of positives per query
|
318 |
+
npos = labels.sum(dim=-1)
|
319 |
+
assert torch.all(torch.isclose(npos, npos[0, 0])
|
320 |
+
), "only implemented for constant number of positives per query"
|
321 |
+
npos = int(npos[0, 0])
|
322 |
+
# compute precision at each recall point
|
323 |
+
posrank = labels.nonzero()[:, -1].view(B, N, npos)
|
324 |
+
recall = torch.arange(1, 1 + npos, dtype=torch.float32, device=dev)[None, None, :].expand(B, N, npos)
|
325 |
+
precision = recall / (1 + posrank).float()
|
326 |
+
# average precision values at all recall points
|
327 |
+
aps = precision.mean(dim=-1)
|
328 |
+
|
329 |
+
return aps
|
330 |
+
|
331 |
+
def loss(self, desc1, desc2, valid_matches=None, euc=False): # if matches is None, positives are the diagonal
|
332 |
+
B, N1, D = desc1.shape
|
333 |
+
B2, N2, D2 = desc2.shape
|
334 |
+
assert B == B2 and D == D2
|
335 |
+
|
336 |
+
scores = get_similarities(desc1, desc2, euc)
|
337 |
+
|
338 |
+
labels = torch.zeros([B, N1, N2], dtype=scores.dtype, device=scores.device)
|
339 |
+
|
340 |
+
# allow all diagonal positives and only mask afterwards
|
341 |
+
labels.diagonal(dim1=-2, dim2=-1)[...] = 1.
|
342 |
+
apscore = self.compute_AP(scores, labels)
|
343 |
+
if valid_matches is not None:
|
344 |
+
apscore = apscore[valid_matches]
|
345 |
+
return apscore
|
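A tiny worked example of the rank-based AP (one query, one positive ranked 2nd out of 4 candidates, hence AP = 0.5):

import torch

scores = torch.tensor([[[0.9, 0.8, 0.1, 0.0]]])   # (B=1, N=1, M=4) similarity scores
labels = torch.tensor([[[0., 1., 0., 0.]]])       # the positive has the 2nd highest score
print(APLoss.compute_true_AP_torch(scores, labels))  # tensor([[0.5000]])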
346 |
+
|
347 |
+
|
348 |
+
class MatchingLoss (Criterion, MultiLoss):
|
349 |
+
"""
|
350 |
+
Matching loss per image
|
351 |
+
only compares pixels within the same image, not across the whole batch as would usually be done
|
352 |
+
"""
|
353 |
+
|
354 |
+
def __init__(self, criterion, withconf=False, use_pts3d=False, negatives_padding=0, blocksize=4096):
|
355 |
+
super().__init__(criterion)
|
356 |
+
self.negatives_padding = negatives_padding
|
357 |
+
self.use_pts3d = use_pts3d
|
358 |
+
self.blocksize = blocksize
|
359 |
+
self.withconf = withconf
|
360 |
+
|
361 |
+
def add_negatives(self, outdesc2, desc2, batchid, x2, y2):
|
362 |
+
if self.negatives_padding:
|
363 |
+
B, H, W, D = desc2.shape
|
364 |
+
negatives = torch.ones([B, H, W], device=desc2.device, dtype=bool)
|
365 |
+
negatives[batchid, y2, x2] = False
|
366 |
+
sel = negatives & (negatives.view([B, -1]).cumsum(dim=-1).view(B, H, W)
|
367 |
+
<= self.negatives_padding) # take the N-first negatives
|
368 |
+
outdesc2 = torch.cat([outdesc2, desc2[sel].view([B, -1, D])], dim=1)
|
369 |
+
return outdesc2
|
370 |
+
|
371 |
+
def get_confs(self, pred1, pred2, sel1, sel2):
|
372 |
+
if self.withconf:
|
373 |
+
if self.use_pts3d:
|
374 |
+
outconfs1 = pred1['conf'][sel1]
|
375 |
+
outconfs2 = pred2['conf'][sel2]
|
376 |
+
else:
|
377 |
+
outconfs1 = pred1['desc_conf'][sel1]
|
378 |
+
outconfs2 = pred2['desc_conf'][sel2]
|
379 |
+
else:
|
380 |
+
outconfs1 = outconfs2 = None
|
381 |
+
return outconfs1, outconfs2
|
382 |
+
|
383 |
+
def get_descs(self, pred1, pred2):
|
384 |
+
if self.use_pts3d:
|
385 |
+
desc1, desc2 = pred1['pts3d'], pred2['pts3d_in_other_view']
|
386 |
+
else:
|
387 |
+
desc1, desc2 = pred1['desc'], pred2['desc']
|
388 |
+
return desc1, desc2
|
389 |
+
|
390 |
+
def get_matching_descs(self, gt1, gt2, pred1, pred2, **kw):
|
391 |
+
outdesc1 = outdesc2 = outconfs1 = outconfs2 = None
|
392 |
+
# Recover descs, GT corres and valid mask
|
393 |
+
desc1, desc2 = self.get_descs(pred1, pred2)
|
394 |
+
|
395 |
+
(x1, y1), (x2, y2) = gt1['corres'].unbind(-1), gt2['corres'].unbind(-1)
|
396 |
+
valid_matches = gt1['valid_corres']
|
397 |
+
|
398 |
+
# Select descs that have GT matches
|
399 |
+
B, N = x1.shape
|
400 |
+
batchid = torch.arange(B)[:, None].repeat(1, N) # B, N
|
401 |
+
outdesc1, outdesc2 = desc1[batchid, y1, x1], desc2[batchid, y2, x2] # B, N, D
|
402 |
+
|
403 |
+
# Pad with unused negatives
|
404 |
+
outdesc2 = self.add_negatives(outdesc2, desc2, batchid, x2, y2)
|
405 |
+
|
406 |
+
# Gather confs if needed
|
407 |
+
sel1 = batchid, y1, x1
|
408 |
+
sel2 = batchid, y2, x2
|
409 |
+
outconfs1, outconfs2 = self.get_confs(pred1, pred2, sel1, sel2)
|
410 |
+
|
411 |
+
return outdesc1, outdesc2, outconfs1, outconfs2, valid_matches, {'use_euclidean_dist': self.use_pts3d}
|
412 |
+
|
413 |
+
def blockwise_criterion(self, descs1, descs2, confs1, confs2, valid_matches, euc, rng=np.random, shuffle=True):
|
414 |
+
loss = None
|
415 |
+
details = {}
|
416 |
+
B, N, D = descs1.shape
|
417 |
+
|
418 |
+
if N <= self.blocksize: # Blocks are larger than provided descs, compute regular loss
|
419 |
+
loss = self.criterion(descs1, descs2, valid_matches, euc=euc)
|
420 |
+
else: # Compute criterion on the blockdiagonal only, after shuffling
|
421 |
+
# Shuffle if necessary
|
422 |
+
matches_perm = slice(None)
|
423 |
+
if shuffle:
|
424 |
+
matches_perm = np.stack([rng.choice(range(N), size=N, replace=False) for _ in range(B)])
|
425 |
+
batchid = torch.tile(torch.arange(B), (N, 1)).T
|
426 |
+
matches_perm = batchid, matches_perm
|
427 |
+
|
428 |
+
descs1 = descs1[matches_perm]
|
429 |
+
descs2 = descs2[matches_perm]
|
430 |
+
valid_matches = valid_matches[matches_perm]
|
431 |
+
|
432 |
+
assert N % self.blocksize == 0, "Error, can't chunk block-diagonal, please check blocksize"
|
433 |
+
n_chunks = N // self.blocksize
|
434 |
+
descs1 = descs1.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D]
|
435 |
+
descs2 = descs2.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D]
|
436 |
+
valid_matches = valid_matches.view([B * n_chunks, self.blocksize])
|
437 |
+
loss = self.criterion(descs1, descs2, valid_matches, euc=euc)
|
438 |
+
if self.withconf:
|
439 |
+
confs1, confs2 = map(lambda x: x[matches_perm], (confs1, confs2)) # apply perm to confidences if needed
|
440 |
+
|
441 |
+
if self.withconf:
|
442 |
+
# split confidences between positives/negatives for loss computation
|
443 |
+
details['conf_pos'] = map(lambda x: x[valid_matches.view(B, -1)], (confs1, confs2))
|
444 |
+
details['conf_neg'] = map(lambda x: x[~valid_matches.view(B, -1)], (confs1, confs2))
|
445 |
+
details['Conf1_std'] = confs1.std()
|
446 |
+
details['Conf2_std'] = confs2.std()
|
447 |
+
|
448 |
+
return loss, details
|
449 |
+
|
450 |
+
def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
|
451 |
+
# Gather preds and GT
|
452 |
+
descs1, descs2, confs1, confs2, valid_matches, monitoring = self.get_matching_descs(
|
453 |
+
gt1, gt2, pred1, pred2, **kw)
|
454 |
+
|
455 |
+
# loss on matches
|
456 |
+
loss, details = self.blockwise_criterion(descs1, descs2, confs1, confs2,
|
457 |
+
valid_matches, euc=monitoring.pop('use_euclidean_dist', False))
|
458 |
+
|
459 |
+
details[type(self).__name__] = float(loss.mean())
|
460 |
+
return loss, (details | monitoring)
|
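The block-diagonal trick above trades one large N x N similarity matrix for N // blocksize smaller problems; a shape-only sketch of the reshaping, assuming N is a multiple of blocksize:

import torch

B, N, D, blocksize = 2, 8192, 24, 4096
descs = torch.randn(B, N, D)
blocks = descs.reshape(B * (N // blocksize), blocksize, D)
print(blocks.shape)  # torch.Size([4, 4096, 24]): each chunk is only scored against itself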
461 |
+
|
462 |
+
|
463 |
+
class ConfMatchingLoss(ConfLoss):
|
464 |
+
""" Weight matching by learned confidence. Same as ConfLoss but for a matching criterion
|
465 |
+
Assuming the input matching_loss is a match-level loss.
|
466 |
+
"""
|
467 |
+
|
468 |
+
def __init__(self, pixel_loss, alpha=1., confmode='prod', neg_conf_loss_quantile=False):
|
469 |
+
super().__init__(pixel_loss, alpha)
|
470 |
+
self.pixel_loss.withconf = True
|
471 |
+
self.confmode = confmode
|
472 |
+
self.neg_conf_loss_quantile = neg_conf_loss_quantile
|
473 |
+
|
474 |
+
def aggregate_confs(self, confs1, confs2): # get the confidences resulting from the two view predictions
|
475 |
+
if self.confmode == 'prod':
|
476 |
+
confs = confs1 * confs2 if confs1 is not None and confs2 is not None else 1.
|
477 |
+
elif self.confmode == 'mean':
|
478 |
+
confs = .5 * (confs1 + confs2) if confs1 is not None and confs2 is not None else 1.
|
479 |
+
else:
|
480 |
+
raise ValueError(f"Unknown conf mode {self.confmode}")
|
481 |
+
return confs
|
482 |
+
|
483 |
+
def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
|
484 |
+
# compute per-pixel loss
|
485 |
+
loss, details = self.pixel_loss(gt1, gt2, pred1, pred2, **kw)
|
486 |
+
# Recover confidences for positive and negative samples
|
487 |
+
conf1_pos, conf2_pos = details.pop('conf_pos')
|
488 |
+
conf1_neg, conf2_neg = details.pop('conf_neg')
|
489 |
+
conf_pos = self.aggregate_confs(conf1_pos, conf2_pos)
|
490 |
+
|
491 |
+
# weight Matching loss by confidence on positives
|
492 |
+
conf_pos, log_conf_pos = self.get_conf_log(conf_pos)
|
493 |
+
conf_loss = loss * conf_pos - self.alpha * log_conf_pos
|
494 |
+
# average + nan protection (in case of no valid pixels at all)
|
495 |
+
conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else 0
|
496 |
+
# Add a loss on negative confidences, to give some supervision signal to confidences of pixels that have no GT match
|
497 |
+
if self.neg_conf_loss_quantile:
|
498 |
+
conf_neg = torch.cat([conf1_neg, conf2_neg])
|
499 |
+
conf_neg, log_conf_neg = self.get_conf_log(conf_neg)
|
500 |
+
|
501 |
+
# quantile of the positive losses, used as the loss value assigned to negatives
|
502 |
+
neg_loss_value = torch.quantile(loss, self.neg_conf_loss_quantile).detach()
|
503 |
+
neg_loss = neg_loss_value * conf_neg - self.alpha * log_conf_neg
|
504 |
+
|
505 |
+
neg_loss = neg_loss.mean() if neg_loss.numel() > 0 else 0
|
506 |
+
conf_loss = conf_loss + neg_loss
|
507 |
+
|
508 |
+
return conf_loss, dict(matching_conf_loss=float(conf_loss), **details)
|
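How these pieces might fit together; a hypothetical composition sketch (the alpha value and the InfoNCE mode are assumptions, the actual training configuration is not part of this commit):

matching_criterion = MatchingLoss(InfoNCE(mode='proper'), withconf=True)
matching_loss = ConfMatchingLoss(matching_criterion, alpha=0.2, confmode='prod')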
mast3r/model.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# MASt3R model class
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import torch
|
8 |
+
import torch.nn.functional as F
|
9 |
+
import os
|
10 |
+
|
11 |
+
from mast3r.catmlp_dpt_head import mast3r_head_factory
|
12 |
+
|
13 |
+
import mast3r.utils.path_to_dust3r # noqa
|
14 |
+
from dust3r.model import AsymmetricCroCo3DStereo # noqa
|
15 |
+
from dust3r.utils.misc import transpose_to_landscape # noqa
|
16 |
+
|
17 |
+
|
18 |
+
inf = float('inf')
|
19 |
+
|
20 |
+
|
21 |
+
def load_model(model_path, device, verbose=True):
|
22 |
+
if verbose:
|
23 |
+
print('... loading model from', model_path)
|
24 |
+
ckpt = torch.load(model_path, map_location='cpu')
|
25 |
+
args = ckpt['args'].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R")
|
26 |
+
if 'landscape_only' not in args:
|
27 |
+
args = args[:-1] + ', landscape_only=False)'
|
28 |
+
else:
|
29 |
+
args = args.replace(" ", "").replace('landscape_only=True', 'landscape_only=False')
|
30 |
+
assert "landscape_only=False" in args
|
31 |
+
if verbose:
|
32 |
+
print(f"instantiating : {args}")
|
33 |
+
net = eval(args)
|
34 |
+
s = net.load_state_dict(ckpt['model'], strict=False)
|
35 |
+
if verbose:
|
36 |
+
print(s)
|
37 |
+
return net.to(device)
|
38 |
+
|
39 |
+
|
40 |
+
class AsymmetricMASt3R(AsymmetricCroCo3DStereo):
|
41 |
+
def __init__(self, desc_mode=('norm'), two_confs=False, desc_conf_mode=None, **kwargs):
|
42 |
+
self.desc_mode = desc_mode
|
43 |
+
self.two_confs = two_confs
|
44 |
+
self.desc_conf_mode = desc_conf_mode
|
45 |
+
super().__init__(**kwargs)
|
46 |
+
|
47 |
+
@classmethod
|
48 |
+
def from_pretrained(cls, pretrained_model_name_or_path, **kw):
|
49 |
+
if os.path.isfile(pretrained_model_name_or_path):
|
50 |
+
return load_model(pretrained_model_name_or_path, device='cpu')
|
51 |
+
else:
|
52 |
+
return super(AsymmetricMASt3R, cls).from_pretrained(pretrained_model_name_or_path, **kw)
|
53 |
+
|
54 |
+
def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, **kw):
|
55 |
+
assert img_size[0] % patch_size == 0 and img_size[
|
56 |
+
1] % patch_size == 0, f'{img_size=} must be multiple of {patch_size=}'
|
57 |
+
self.output_mode = output_mode
|
58 |
+
self.head_type = head_type
|
59 |
+
self.depth_mode = depth_mode
|
60 |
+
self.conf_mode = conf_mode
|
61 |
+
if self.desc_conf_mode is None:
|
62 |
+
self.desc_conf_mode = conf_mode
|
63 |
+
# allocate heads
|
64 |
+
self.downstream_head1 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
|
65 |
+
self.downstream_head2 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
|
66 |
+
# magic wrapper
|
67 |
+
self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only)
|
68 |
+
self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only)
|
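A loading sketch: from_pretrained accepts either a local checkpoint file (handled by load_model above) or a Hugging Face model id; the id below is the published MASt3R checkpoint name, quoted here as an assumption:

from mast3r.model import AsymmetricMASt3R

model = AsymmetricMASt3R.from_pretrained(
    "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").to('cuda')
# or, from a local file:
# model = AsymmetricMASt3R.from_pretrained('path/to/checkpoint.pth')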
mast3r/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
mast3r/utils/coarse_to_fine.py
ADDED
@@ -0,0 +1,214 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# coarse to fine utilities
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
def crop_tag(cell):
|
11 |
+
return f'[{cell[1]}:{cell[3]},{cell[0]}:{cell[2]}]'
|
12 |
+
|
13 |
+
|
14 |
+
def crop_slice(cell):
|
15 |
+
return slice(cell[1], cell[3]), slice(cell[0], cell[2])
|
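A quick check of the cell convention used throughout this file: a cell is (x0, y0, x1, y1), and crop_slice turns it into (rows, cols) slices:

cell = (10, 20, 110, 220)
print(crop_tag(cell))    # [20:220,10:110]
print(crop_slice(cell))  # (slice(20, 220, None), slice(10, 110, None))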
16 |
+
|
17 |
+
|
18 |
+
def _start_pos(total_size, win_size, overlap):
|
19 |
+
# we must have AT LEAST overlap between segments
|
20 |
+
# first segment starts at 0, last segment starts at total_size-win_size
|
21 |
+
assert 0 <= overlap < 1
|
22 |
+
assert total_size >= win_size
|
23 |
+
spacing = win_size * (1 - overlap)
|
24 |
+
last_pt = total_size - win_size
|
25 |
+
n_windows = 2 + int((last_pt - 1) // spacing)
|
26 |
+
return np.linspace(0, last_pt, n_windows).round().astype(int)
|
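A worked example of the window placement: with total_size=100, win_size=40 and overlap=0.5 the spacing is 20 pixels, and the last window is placed so it ends exactly at the border:

print(_start_pos(100, 40, 0.5))  # [ 0 20 40 60]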
27 |
+
|
28 |
+
|
29 |
+
def multiple_of_16(x):
|
30 |
+
return (x // 16) * 16
|
31 |
+
|
32 |
+
|
33 |
+
def _make_overlapping_grid(H, W, size, overlap):
|
34 |
+
H_win = multiple_of_16(H * size // max(H, W))
|
35 |
+
W_win = multiple_of_16(W * size // max(H, W))
|
36 |
+
x = _start_pos(W, W_win, overlap)
|
37 |
+
y = _start_pos(H, H_win, overlap)
|
38 |
+
grid = np.stack(np.meshgrid(x, y, indexing='xy'), axis=-1)
|
39 |
+
grid = np.concatenate((grid, grid + (W_win, H_win)), axis=-1)
|
40 |
+
return grid.reshape(-1, 4)
|
41 |
+
|
42 |
+
|
43 |
+
def _cell_size(cell2):
|
44 |
+
width, height = cell2[:, 2] - cell2[:, 0], cell2[:, 3] - cell2[:, 1]
|
45 |
+
assert width.min() >= 0
|
46 |
+
assert height.min() >= 0
|
47 |
+
return width, height
|
48 |
+
|
49 |
+
|
50 |
+
def _norm_windows(cell2, H2, W2, forced_resolution=None):
|
51 |
+
# make sure the window aspect ratio is 3/4, or the output resolution is forced_resolution if defined
|
52 |
+
outcell = cell2.copy()
|
53 |
+
width, height = _cell_size(cell2)
|
54 |
+
width2, height2 = width.clip(max=W2), height.clip(max=H2)
|
55 |
+
if forced_resolution is None:
|
56 |
+
width2[width < height] = (height2[width < height] * 3.01 / 4).clip(max=W2)
|
57 |
+
height2[width >= height] = (width2[width >= height] * 3.01 / 4).clip(max=H2)
|
58 |
+
else:
|
59 |
+
forced_H, forced_W = forced_resolution
|
60 |
+
width2[:] = forced_W
|
61 |
+
height2[:] = forced_H
|
62 |
+
|
63 |
+
half = (width2 - width) / 2
|
64 |
+
outcell[:, 0] -= half
|
65 |
+
outcell[:, 2] += half
|
66 |
+
half = (height2 - height) / 2
|
67 |
+
outcell[:, 1] -= half
|
68 |
+
outcell[:, 3] += half
|
69 |
+
|
70 |
+
# project to integers
|
71 |
+
outcell = np.floor(outcell).astype(int)
|
72 |
+
# Take care of flooring errors
|
73 |
+
tmpw, tmph = _cell_size(outcell)
|
74 |
+
outcell[:, 0] += tmpw.astype(tmpw.dtype) - width2.astype(tmpw.dtype)
|
75 |
+
outcell[:, 1] += tmph.astype(tmpw.dtype) - height2.astype(tmpw.dtype)
|
76 |
+
|
77 |
+
# make sure 0 <= x < W2 and 0 <= y < H2
|
78 |
+
outcell[:, 0::2] -= outcell[:, [0]].clip(max=0)
|
79 |
+
outcell[:, 1::2] -= outcell[:, [1]].clip(max=0)
|
80 |
+
outcell[:, 0::2] -= outcell[:, [2]].clip(min=W2) - W2
|
81 |
+
outcell[:, 1::2] -= outcell[:, [3]].clip(min=H2) - H2
|
82 |
+
|
83 |
+
width, height = _cell_size(outcell)
|
84 |
+
assert np.all(width == width2.astype(width.dtype)) and np.all(
|
85 |
+
height == height2.astype(height.dtype)), "Error, output is not of the expected shape."
|
86 |
+
assert np.all(width <= W2)
|
87 |
+
assert np.all(height <= H2)
|
88 |
+
return outcell
|
89 |
+
|
90 |
+
|
91 |
+
def _weight_pixels(cell, pix, assigned, gauss_var=2):
|
92 |
+
center = cell.reshape(-1, 2, 2).mean(axis=1)
|
93 |
+
width, height = _cell_size(cell)
|
94 |
+
|
95 |
+
# square distance between each cell center and each point
|
96 |
+
dist = (center[:, None] - pix[None]) / np.c_[width, height][:, None]
|
97 |
+
dist2 = np.square(dist).sum(axis=-1)
|
98 |
+
|
99 |
+
assert assigned.shape == dist2.shape
|
100 |
+
res = np.where(assigned, np.exp(-gauss_var * dist2), 0)
|
101 |
+
return res
|
102 |
+
|
103 |
+
|
104 |
+
def pos2d_in_rect(p1, cell1):
|
105 |
+
x, y = p1.T
|
106 |
+
l, t, r, b = cell1
|
107 |
+
assigned = (l <= x) & (x < r) & (t <= y) & (y < b)
|
108 |
+
return assigned
|
109 |
+
|
110 |
+
|
111 |
+
def _score_cell(cell1, H2, W2, p1, p2, min_corres=10, forced_resolution=None):
|
112 |
+
assert p1.shape == p2.shape
|
113 |
+
|
114 |
+
# compute keypoint assignment
|
115 |
+
assigned = pos2d_in_rect(p1, cell1[None].T)
|
116 |
+
assert assigned.shape == (len(cell1), len(p1))
|
117 |
+
|
118 |
+
# remove cells without correspondences
|
119 |
+
valid_cells = assigned.sum(axis=1) >= min_corres
|
120 |
+
cell1 = cell1[valid_cells]
|
121 |
+
assigned = assigned[valid_cells]
|
122 |
+
if not valid_cells.any():
|
123 |
+
return cell1, cell1, assigned
|
124 |
+
|
125 |
+
# fill-in the assigned points in both image
|
126 |
+
assigned_p1 = np.empty((len(cell1), len(p1), 2), dtype=np.float32)
|
127 |
+
assigned_p2 = np.empty((len(cell1), len(p2), 2), dtype=np.float32)
|
128 |
+
assigned_p1[:] = p1[None]
|
129 |
+
assigned_p2[:] = p2[None]
|
130 |
+
assigned_p1[~assigned] = np.nan
|
131 |
+
assigned_p2[~assigned] = np.nan
|
132 |
+
|
133 |
+
# find the median center and scale of assigned points in each cell
|
134 |
+
# cell_center1 = np.nanmean(assigned_p1, axis=1)
|
135 |
+
cell_center2 = np.nanmean(assigned_p2, axis=1)
|
136 |
+
im1_q25, im1_q75 = np.nanquantile(assigned_p1, (0.1, 0.9), axis=1)
|
137 |
+
im2_q25, im2_q75 = np.nanquantile(assigned_p2, (0.1, 0.9), axis=1)
|
138 |
+
|
139 |
+
robust_std1 = (im1_q75 - im1_q25).clip(20.)
|
140 |
+
robust_std2 = (im2_q75 - im2_q25).clip(20.)
|
141 |
+
|
142 |
+
cell_size1 = (cell1[:, 2:4] - cell1[:, 0:2])
|
143 |
+
cell_size2 = cell_size1 * robust_std2 / robust_std1
|
144 |
+
cell2 = np.c_[cell_center2 - cell_size2 / 2, cell_center2 + cell_size2 / 2]
|
145 |
+
|
146 |
+
# make sure cell bounds are valid
|
147 |
+
cell2 = _norm_windows(cell2, H2, W2, forced_resolution=forced_resolution)
|
148 |
+
|
149 |
+
# compute correspondence weights
|
150 |
+
corres_weights = _weight_pixels(cell1, p1, assigned) * _weight_pixels(cell2, p2, assigned)
|
151 |
+
|
152 |
+
# return a list of window pairs and assigned correspondences
|
153 |
+
return cell1, cell2, corres_weights
|
154 |
+
|
155 |
+
|
156 |
+
def greedy_selection(corres_weights, target=0.9):
|
157 |
+
# corres_weights = (n_cell_pair, n_corres) matrix.
|
158 |
+
# corres_weights[c, p] > 0 means that correspondence p is visible in cell pair c
|
159 |
+
assert 0 < target <= 1
|
160 |
+
corres_weights = corres_weights.copy()
|
161 |
+
|
162 |
+
total = corres_weights.max(axis=0).sum()
|
163 |
+
target *= total
|
164 |
+
|
165 |
+
# init = empty
|
166 |
+
res = []
|
167 |
+
cur = np.zeros(corres_weights.shape[1]) # current selection
|
168 |
+
|
169 |
+
while cur.sum() < target:
|
170 |
+
# pick the next best cell pair
|
171 |
+
best = corres_weights.sum(axis=1).argmax()
|
172 |
+
res.append(best)
|
173 |
+
|
174 |
+
# update current
|
175 |
+
cur += corres_weights[best]
|
176 |
+
# print('appending', best, 'with score', corres_weights[best].sum(), '-->', cur.sum())
|
177 |
+
|
178 |
+
# remove from all other views
|
179 |
+
corres_weights = (corres_weights - corres_weights[best]).clip(min=0)
|
180 |
+
|
181 |
+
return res
|
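A toy run of the greedy cover with hand-made weights: two cell pairs covering disjoint correspondences are enough to reach the 90% target, and the redundant third pair is never picked:

import numpy as np

w = np.array([[1., 1., 0., 0.],
              [0., 0., 1., 1.],
              [1., 0., 0., 0.]])
print(greedy_selection(w, target=0.9))  # indices of the selected cell pairs, here [0, 1]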
182 |
+
|
183 |
+
|
184 |
+
def select_pairs_of_crops(img_q, img_b, pos2d_in_query, pos2d_in_ref, maxdim=512, overlap=.5, forced_resolution=None):
|
185 |
+
# prepare the overlapping cells
|
186 |
+
grid_q = _make_overlapping_grid(*img_q.shape[:2], maxdim, overlap)
|
187 |
+
grid_b = _make_overlapping_grid(*img_b.shape[:2], maxdim, overlap)
|
188 |
+
|
189 |
+
assert forced_resolution is None or len(forced_resolution) == 2
|
190 |
+
if forced_resolution is None or isinstance(forced_resolution[0], int) or not len(forced_resolution[0]) == 2:
|
191 |
+
forced_resolution1 = forced_resolution2 = forced_resolution
|
192 |
+
else:
|
193 |
+
assert len(forced_resolution[1]) == 2
|
194 |
+
forced_resolution1 = forced_resolution[0]
|
195 |
+
forced_resolution2 = forced_resolution[1]
|
196 |
+
|
197 |
+
# Make sure crops respect constraints
|
198 |
+
grid_q = _norm_windows(grid_q.astype(float), *img_q.shape[:2], forced_resolution=forced_resolution1)
|
199 |
+
grid_b = _norm_windows(grid_b.astype(float), *img_b.shape[:2], forced_resolution=forced_resolution2)
|
200 |
+
|
201 |
+
# score cells
|
202 |
+
pairs_q = _score_cell(grid_q, *img_b.shape[:2], pos2d_in_query, pos2d_in_ref, forced_resolution=forced_resolution2)
|
203 |
+
pairs_b = _score_cell(grid_b, *img_q.shape[:2], pos2d_in_ref, pos2d_in_query, forced_resolution=forced_resolution1)
|
204 |
+
pairs_b = pairs_b[1], pairs_b[0], pairs_b[2] # cellq, cellb, corres_weights
|
205 |
+
|
206 |
+
# greedy selection until all correspondences are generated
|
207 |
+
cell1, cell2, corres_weights = map(np.concatenate, zip(pairs_q, pairs_b))
|
208 |
+
if len(corres_weights) == 0:
|
209 |
+
return # tolerated for empty generators
|
210 |
+
order = greedy_selection(corres_weights, target=0.9)
|
211 |
+
|
212 |
+
for i in order:
|
213 |
+
def pair_tag(qi, bi): return (str(qi) + crop_tag(cell1[i]), str(bi) + crop_tag(cell2[i]))
|
214 |
+
yield cell1[i], cell2[i], pair_tag
|
mast3r/utils/collate.py
ADDED
@@ -0,0 +1,62 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# Collate extensions
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import collections
|
10 |
+
from torch.utils.data._utils.collate import default_collate_fn_map, default_collate_err_msg_format
|
11 |
+
from typing import Callable, Dict, Optional, Tuple, Type, Union, List
|
12 |
+
|
13 |
+
|
14 |
+
def cat_collate_tensor_fn(batch, *, collate_fn_map):
|
15 |
+
return torch.cat(batch, dim=0)
|
16 |
+
|
17 |
+
|
18 |
+
def cat_collate_list_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None):
|
19 |
+
return [item for bb in batch for item in bb] # concatenate all lists
|
20 |
+
|
21 |
+
|
22 |
+
cat_collate_fn_map = default_collate_fn_map.copy()
|
23 |
+
cat_collate_fn_map[torch.Tensor] = cat_collate_tensor_fn
|
24 |
+
cat_collate_fn_map[List] = cat_collate_list_fn
|
25 |
+
cat_collate_fn_map[type(None)] = lambda _, **kw: None  # when the batch contains Nones, simply return a single None
|
26 |
+
|
27 |
+
|
28 |
+
def cat_collate(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None):
|
29 |
+
r"""Custom collate function that concatenates stuff instead of stacking them, and handles NoneTypes """
|
30 |
+
elem = batch[0]
|
31 |
+
elem_type = type(elem)
|
32 |
+
|
33 |
+
if collate_fn_map is not None:
|
34 |
+
if elem_type in collate_fn_map:
|
35 |
+
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
|
36 |
+
|
37 |
+
for collate_type in collate_fn_map:
|
38 |
+
if isinstance(elem, collate_type):
|
39 |
+
return collate_fn_map[collate_type](batch, collate_fn_map=collate_fn_map)
|
40 |
+
|
41 |
+
if isinstance(elem, collections.abc.Mapping):
|
42 |
+
try:
|
43 |
+
return elem_type({key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
|
44 |
+
except TypeError:
|
45 |
+
# The mapping type may not support `__init__(iterable)`.
|
46 |
+
return {key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
|
47 |
+
elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
|
48 |
+
return elem_type(*(cat_collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
|
49 |
+
elif isinstance(elem, collections.abc.Sequence):
|
50 |
+
transposed = list(zip(*batch)) # It may be accessed twice, so we use a list.
|
51 |
+
|
52 |
+
if isinstance(elem, tuple):
|
53 |
+
# Backwards compatibility.
|
54 |
+
return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
|
55 |
+
else:
|
56 |
+
try:
|
57 |
+
return elem_type([cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed])
|
58 |
+
except TypeError:
|
59 |
+
# The sequence type may not support `__init__(iterable)` (e.g., `range`).
|
60 |
+
return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
|
61 |
+
|
62 |
+
raise TypeError(default_collate_err_msg_format.format(elem_type))
|
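A behavior sketch with toy data: unlike the default collate, tensors are concatenated along dim 0 (no new batch dimension) and lists are flattened:

import torch

batch = [{'pts': torch.zeros(3, 2), 'names': ['a']},
         {'pts': torch.ones(5, 2), 'names': ['b', 'c']}]
out = cat_collate(batch, collate_fn_map=cat_collate_fn_map)
print(out['pts'].shape)  # torch.Size([8, 2])
print(out['names'])      # ['a', 'b', 'c']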
mast3r/utils/misc.py
ADDED
@@ -0,0 +1,17 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# utility functions for MASt3R
|
6 |
+
# --------------------------------------------------------
|
7 |
+
import os
|
8 |
+
import hashlib
|
9 |
+
|
10 |
+
|
11 |
+
def mkdir_for(f):
|
12 |
+
os.makedirs(os.path.dirname(f), exist_ok=True)
|
13 |
+
return f
|
14 |
+
|
15 |
+
|
16 |
+
def hash_md5(s):
|
17 |
+
return hashlib.md5(s.encode('utf-8')).hexdigest()
|
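Both helpers are small conveniences for building cache paths and keys; a quick example (paths are placeholders):

cache_file = mkdir_for('/tmp/mast3r_cache/corres/pair_000.npy')  # ensures the parent dir exists
print(hash_md5('img_0001.jpg-img_0002.jpg'))  # deterministic 32-character hex key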
mast3r/utils/path_to_dust3r.py
ADDED
@@ -0,0 +1,19 @@
1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
+
#
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# dust3r submodule import
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
import sys
|
9 |
+
import os.path as path
|
10 |
+
HERE_PATH = path.normpath(path.dirname(__file__))
|
11 |
+
DUSt3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../'))
|
12 |
+
DUSt3R_LIB_PATH = path.join(DUSt3R_REPO_PATH, 'dust3r')
|
13 |
+
# check the presence of the dust3r directory in the repo, to be sure the submodule is cloned
|
14 |
+
if path.isdir(DUSt3R_LIB_PATH):
|
15 |
+
# workaround for sibling import
|
16 |
+
sys.path.insert(0, DUSt3R_REPO_PATH)
|
17 |
+
else:
|
18 |
+
raise ImportError(f"dust3r is not initialized, could not find: {DUSt3R_LIB_PATH}.\n "
|
19 |
+
"Did you forget to run 'git submodule update --init --recursive' ?")
|