yocabon committed
Commit 35e2575 · 1 Parent(s): b1b5578

add initial version of mast3r sfm and glomap/colmap wrapper

NOTICE CHANGED
@@ -101,3 +101,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
+====
+gtolias/how
+https://github.com/gtolias/how
+
+MIT License https://github.com/gtolias/how/blob/master/LICENSE
README.md CHANGED
@@ -78,7 +78,19 @@ pip install -r dust3r/requirements.txt
 pip install -r dust3r/requirements_optional.txt
 ```
 
-3. Optional, compile the cuda kernels for RoPE (as in CroCo v2).
+3. Compile and install ASMK.
+```bash
+pip install cython
+
+git clone https://github.com/jenicek/asmk
+cd asmk/cython/
+cythonize *.pyx
+cd ..
+pip install .
+cd ..
+```
+
+4. Optional, compile the cuda kernels for RoPE (as in CroCo v2).
 ```bash
 # DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime.
 cd dust3r/croco/models/curope/
@@ -86,9 +98,10 @@ python setup.py build_ext --inplace
 cd ../../../../
 ```
 
-
 ### Checkpoints
 
+TODO upload retrieval_model somewhere
+
 You can obtain the checkpoints by two ways:
 
 1) You can use our huggingface_hub integration: the models will be downloaded automatically.
@@ -123,6 +136,7 @@ demo.py is the updated demo for MASt3R. It uses our new sparse global alignment
 python3 demo.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric
 
 # Use --weights to load a checkpoint from a local file, eg --weights checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth
+# Use --retrieval_model and point to the retrieval checkpoint to enable retrieval as a pairing strategy; asmk must be installed
 # Use --local_network to make it accessible on the local network, or --server_name to specify the url manually
 # Use --server_port to change the port, by default it will search for an available port starting at 7860
 # Use --device to use a different device, by default it's "cuda"
@@ -133,6 +147,8 @@ see https://github.com/naver/dust3r?tab=readme-ov-file#interactive-demo for details
 
 ### Interactive demo with docker
 
+TODO update with asmk/retrieval model
+
 To run MASt3R using Docker, including with NVIDIA CUDA support, follow these instructions:
 
 1. **Install Docker**: If not already installed, download and install `docker` and `docker compose` from the [Docker website](https://www.docker.com/get-started).
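Once asmk and a retrieval checkpoint are available, the demo can be launched with retrieval-based pairing. A minimal launch sketch; the checkpoint path `checkpoints/retrieval_model.pth` is a placeholder, since the retrieval model upload is still marked TODO above:

```python
# Hedged example: launch the demo with retrieval pairing enabled.
# Equivalent shell command:
#   python3 demo.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
#       --retrieval_model checkpoints/retrieval_model.pth
import subprocess

subprocess.run(['python3', 'demo.py',
                '--model_name', 'MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric',
                '--retrieval_model', 'checkpoints/retrieval_model.pth'],  # placeholder path; requires asmk
               check=True)
```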
demo.py CHANGED
@@ -47,5 +47,5 @@ if __name__ == '__main__':
     with get_context(args.tmp_dir) as tmpdirname:
         cache_path = os.path.join(tmpdirname, chkpt_tag)
         os.makedirs(cache_path, exist_ok=True)
-        main_demo(cache_path, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent,
-                  share=args.share, gradio_delete_cache=args.gradio_delete_cache)
+        main_demo(cache_path, model, args.retrieval_model, args.device, args.image_size, server_name, args.server_port,
+                  silent=args.silent, share=args.share, gradio_delete_cache=args.gradio_delete_cache)
demo_glomap.py ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# gradio demo executable
+# --------------------------------------------------------
+import pycolmap
+import os
+import torch
+import tempfile
+from contextlib import nullcontext
+
+from mast3r.demo_glomap import get_args_parser, main_demo
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.utils.misc import hash_md5
+
+import mast3r.utils.path_to_dust3r  # noqa
+from dust3r.demo import set_print_with_timestamp
+
+import matplotlib.pyplot as pl
+pl.ion()
+
+torch.backends.cuda.matmul.allow_tf32 = True  # for gpu >= Ampere and pytorch >= 1.12
+
+if __name__ == '__main__':
+    parser = get_args_parser()
+    args = parser.parse_args()
+    set_print_with_timestamp()
+
+    if args.server_name is not None:
+        server_name = args.server_name
+    else:
+        server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
+
+    if args.weights is not None:
+        weights_path = args.weights
+    else:
+        weights_path = "naver/" + args.model_name
+
+    model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+    chkpt_tag = hash_md5(weights_path)
+
+    def get_context(tmp_dir):
+        return tempfile.TemporaryDirectory(suffix='_mast3r_gradio_demo') if tmp_dir is None \
+            else nullcontext(tmp_dir)
+    with get_context(args.tmp_dir) as tmpdirname:
+        cache_path = os.path.join(tmpdirname, chkpt_tag)
+        os.makedirs(cache_path, exist_ok=True)
+        main_demo(args.glomap_bin, cache_path, model, args.retrieval_model, args.device, args.image_size, server_name,
+                  args.server_port, silent=args.silent, share=args.share, gradio_delete_cache=args.gradio_delete_cache)
dust3r CHANGED
@@ -1 +1 @@
-Subproject commit 9869e71f9165aa53c53ec0979cea1122a569ade4
+Subproject commit c9e9336a6ba7c1f1873f9295852cea6dffaf770d
kapture_mast3r_mapping.py ADDED
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# mast3r exec for running standard sfm
+# --------------------------------------------------------
+import pycolmap
+import os
+import os.path as path
+import argparse
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.colmap.mapping import (kapture_import_image_folder_or_list, run_mast3r_matching, pycolmap_run_triangulator,
+                                   pycolmap_run_mapper, glomap_run_mapper)
+from kapture.io.csv import kapture_from_dir
+
+from kapture.converter.colmap.database_extra import kapture_to_colmap, generate_priors_for_reconstruction
+from kapture_localization.utils.pairsfile import get_pairs_from_file
+from kapture.io.records import get_image_fullpath
+from kapture.converter.colmap.database import COLMAPDatabase
+
+
+def get_argparser():
+    parser = argparse.ArgumentParser(description='point triangulator with mast3r from kapture data')
+    parser_weights = parser.add_mutually_exclusive_group(required=True)
+    parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None)
+    parser_weights.add_argument("--model_name", type=str, help="name of the model weights",
+                                choices=["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"])
+
+    parser_input = parser.add_mutually_exclusive_group(required=True)
+    parser_input.add_argument('-i', '--input', default=None, help='kdata')
+    parser_input.add_argument('--dir', default=None, help='image dir (individual intrinsics)')
+    parser_input.add_argument('--dir_same_camera', default=None, help='image dir (shared intrinsics)')
+
+    parser.add_argument('-o', '--output', required=True, help='output path to reconstruction')
+    parser.add_argument('--pairsfile_path', required=True, help='pairsfile')
+
+    parser.add_argument('--glomap_bin', default='glomap', type=str, help='glomap bin')
+
+    parser_mapper = parser.add_mutually_exclusive_group()
+    parser_mapper.add_argument('--ignore_pose', action='store_true', default=False)
+    parser_mapper.add_argument('--use_glomap_mapper', action='store_true', default=False)
+
+    parser_matching = parser.add_mutually_exclusive_group()
+    parser_matching.add_argument('--dense_matching', action='store_true', default=False)
+    parser_matching.add_argument('--pixel_tol', default=0, type=int)
+    parser.add_argument('--device', default='cuda')
+
+    parser.add_argument('--conf_thr', default=1.001, type=float)
+    parser.add_argument('--skip_geometric_verification', action='store_true', default=False)
+    parser.add_argument('--min_len_track', default=5, type=int)
+
+    return parser
+
+
+if __name__ == '__main__':
+    parser = get_argparser()
+    args = parser.parse_args()
+    if args.weights is not None:
+        weights_path = args.weights
+    else:
+        weights_path = "naver/" + args.model_name
+    model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+    maxdim = max(model.patch_embed.img_size)
+    patch_size = model.patch_embed.patch_size
+
+    if args.input is not None:
+        kdata = kapture_from_dir(args.input)
+        records_data_path = get_image_fullpath(args.input)
+    else:
+        if args.dir_same_camera is not None:
+            use_single_camera = True
+            records_data_path = args.dir_same_camera
+        elif args.dir is not None:
+            use_single_camera = False
+            records_data_path = args.dir
+        else:
+            raise ValueError('all inputs choices are None')
+        kdata = kapture_import_image_folder_or_list(records_data_path, use_single_camera)
+    has_pose = kdata.trajectories is not None
+    image_pairs = get_pairs_from_file(args.pairsfile_path, kdata.records_camera, kdata.records_camera)
+
+    colmap_db_path = path.join(args.output, 'colmap.db')
+    reconstruction_path = path.join(args.output, "reconstruction")
+    priors_txt_path = path.join(args.output, "priors_for_reconstruction")
+    for path_i in [reconstruction_path, priors_txt_path]:
+        os.makedirs(path_i, exist_ok=True)
+    assert not os.path.isfile(colmap_db_path)
+
+    colmap_db = COLMAPDatabase.connect(colmap_db_path)
+    try:
+        kapture_to_colmap(kdata, args.input, tar_handler=None, database=colmap_db,
+                          keypoints_type=None, descriptors_type=None, export_two_view_geometry=False)
+        if has_pose:
+            generate_priors_for_reconstruction(kdata, colmap_db, priors_txt_path)
+
+        colmap_image_pairs = run_mast3r_matching(model, maxdim, patch_size, args.device,
+                                                 kdata, records_data_path, image_pairs, colmap_db,
+                                                 args.dense_matching, args.pixel_tol, args.conf_thr,
+                                                 args.skip_geometric_verification, args.min_len_track)
+        colmap_db.close()
+    except Exception as e:
+        print(f'Error {e}')
+        colmap_db.close()
+        exit(1)
+
+    if len(colmap_image_pairs) == 0:
+        raise Exception("no matches were kept")
+
+    # colmap db is now full, run colmap
+    colmap_world_to_cam = {}
+    if not args.skip_geometric_verification:
+        print("verify_matches")
+        f = open(args.output + '/pairs.txt', "w")
+        for image_path1, image_path2 in colmap_image_pairs:
+            f.write("{} {}\n".format(image_path1, image_path2))
+        f.close()
+        pycolmap.verify_matches(colmap_db_path, args.output + '/pairs.txt')
+
+    print("running mapping")
+    if has_pose and not args.ignore_pose and not args.use_glomap_mapper:
+        pycolmap_run_triangulator(colmap_db_path, priors_txt_path, reconstruction_path, records_data_path)
+    elif not args.use_glomap_mapper:
+        pycolmap_run_mapper(colmap_db_path, reconstruction_path, records_data_path)
+    else:
+        glomap_run_mapper(args.glomap_bin, colmap_db_path, reconstruction_path, records_data_path)
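A plausible end-to-end invocation of this script, assuming a pairs file produced by make_pairs.py below; all paths are placeholders:

```python
# Hedged sketch: map an image folder with shared intrinsics, using glomap
# instead of the pycolmap mappers. Equivalent shell command:
#   python3 kapture_mast3r_mapping.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
#       --dir_same_camera images/ --pairsfile_path pairs.txt --output recon/ --use_glomap_mapper
import subprocess

subprocess.run(['python3', 'kapture_mast3r_mapping.py',
                '--model_name', 'MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric',
                '--dir_same_camera', 'images/',   # placeholder image folder (shared intrinsics)
                '--pairsfile_path', 'pairs.txt',  # placeholder pairs file, e.g. from make_pairs.py
                '--output', 'recon/',
                '--use_glomap_mapper'],
               check=True)
```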
make_pairs.py ADDED
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# make pairs using mast3r scene_graph, including retrieval
+# --------------------------------------------------------
+import argparse
+import torch
+import os
+import os.path as path
+import PIL
+from PIL import Image
+import pathlib
+from kapture.io.csv import table_to_file
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.retrieval.processor import Retriever
+from mast3r.image_pairs import make_pairs
+
+
+def get_argparser():
+    parser = argparse.ArgumentParser(description='make pairs using mast3r scene_graph, including retrieval')
+    parser.add_argument('--dir', required=True, help='image dir')
+    parser.add_argument('--scene_graph', default='retrieval-20-1-10-1')
+    parser.add_argument('--output', required=True, help='txt file')
+
+    parser_weights = parser.add_mutually_exclusive_group(required=False)
+    parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None)
+    parser_weights.add_argument("--model_name", type=str, help="name of the model weights",
+                                choices=["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"])
+    parser.add_argument('--retrieval_model', default=None, type=str, help="retrieval_model to be loaded")
+    parser.add_argument('--device', default='cuda')  # referenced below when loading the backbone
+
+    return parser
+
+
+def get_image_list(images_path):
+    file_list = [path.relpath(path.join(dirpath, filename), images_path)
+                 for dirpath, dirs, filenames in os.walk(images_path)
+                 for filename in filenames]
+    file_list = sorted(file_list)
+    image_list = []
+    for filename in file_list:
+        # test if file is a valid image
+        try:
+            # lazy load
+            with Image.open(path.join(images_path, filename)) as im:
+                width, height = im.size
+                image_list.append(filename)
+        except (OSError, PIL.UnidentifiedImageError):
+            # It is not a valid image: skip it
+            print(f'Skipping invalid image file {filename}')
+            continue
+    return image_list
+
+
+def main(dir, scene_graph, output, backbone=None, retrieval_model=None):
+    imgs = get_image_list(dir)
+
+    sim_matrix = None
+    if 'retrieval' in scene_graph:
+        retriever = Retriever(retrieval_model, backbone=backbone)
+        imgs_fp = [path.join(dir, filename) for filename in imgs]
+        with torch.no_grad():
+            sim_matrix = retriever(imgs_fp)
+
+        # Cleanup
+        del retriever
+        torch.cuda.empty_cache()
+
+    pairs = make_pairs(imgs, scene_graph, prefilter=None, symmetrize=True, sim_mat=sim_matrix)
+
+    os.umask(0o002)
+    p = pathlib.Path(output)
+    os.makedirs(str(p.parent.resolve()), exist_ok=True)
+
+    with open(output, 'w') as fid:
+        table_to_file(fid, pairs, header='# query_image, map_image, score')
+
+
+if __name__ == '__main__':
+    parser = get_argparser()
+    args = parser.parse_args()
+
+    if "retrieval" in args.scene_graph:
+        assert args.retrieval_model is not None
+        if args.weights is not None:
+            weights_path = args.weights
+        else:
+            weights_path = "naver/" + args.model_name
+        backbone = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+        retrieval_model = args.retrieval_model
+    else:
+        backbone = None
+        retrieval_model = None
+    main(args.dir, args.scene_graph, args.output, backbone, retrieval_model)
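The same logic can be driven programmatically through `main()`. A minimal sketch, assuming a local image folder and a retrieval checkpoint (both placeholders):

```python
# Hedged sketch of programmatic use; equivalent to:
#   python3 make_pairs.py --dir images/ --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
#       --retrieval_model checkpoints/retrieval_model.pth --output pairs.txt
from mast3r.model import AsymmetricMASt3R
from make_pairs import main

backbone = AsymmetricMASt3R.from_pretrained(
    'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric').to('cuda')
main(dir='images/',                                      # placeholder image folder
     scene_graph='retrieval-20-1-10-1',                  # the script's default retrieval graph
     output='pairs.txt',
     backbone=backbone,
     retrieval_model='checkpoints/retrieval_model.pth')  # placeholder checkpoint path
```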
mast3r/catmlp_dpt_head.py CHANGED
@@ -5,6 +5,7 @@
 # MASt3R heads
 # --------------------------------------------------------
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
 
 import mast3r.utils.path_to_dust3r  # noqa
@@ -12,6 +13,7 @@ from dust3r.heads.postprocess import reg_dense_depth, reg_dense_conf  # noqa
 from dust3r.heads.dpt_head import PixelwiseTaskWithDPT  # noqa
 import dust3r.utils.path_to_croco  # noqa
 from models.blocks import Mlp  # noqa
+from models.dpt_block import Interpolate  # noqa
 
 
 def reg_desc(desc, mode):
@@ -96,6 +98,113 @@ class Cat_MLP_LocalFeatures_DPT_Pts3d(PixelwiseTaskWithDPT):
         return out
 
 
+class MLP_MiniConv_Head(nn.Module):
+    """
+    A special convolutional head inspired by the DPT architecture.
+    An MLP predicts pixelwise feats at lower resolution; the prediction is upsampled
+    to the target resolution and goes through a mini convolutional head.
+
+    Input: [B, S, D]  # S = (H//p) * (W//p)
+
+    MLP:
+        D -> (mlp_hidden_dim) -> mlp_odim * (p/subpatch)**2
+        reshape to [mlp_odim, H/2, W/2] (the MLP predicts at half resolution for the default subpatch=2)
+
+    MiniConv head from DPT:
+        Upsample x2 -> [mlp_odim, H, W]
+        Conv 3x3 -> [conv_inner_dim, H, W]
+        ReLU
+        Conv 1x1 -> [odim, H, W]
+    """
+
+    def __init__(self, idim, mlp_hidden_dim, mlp_odim, conv_inner_dim, odim, patch_size, subpatch=2, **kw):
+        super().__init__()
+        self.patch_size = patch_size
+        self.subpatch = subpatch
+        self.sub_patch_size = patch_size // subpatch
+        self.mlp = Mlp(idim, mlp_hidden_dim, mlp_odim * self.sub_patch_size**2, **kw)  # D -> mlp_odim*sub_patch_size**2
+
+        # DPT conv head
+        self.head = nn.Sequential(Interpolate(scale_factor=self.subpatch, mode="bilinear", align_corners=True) if self.subpatch != 1 else nn.Identity(),
+                                  nn.Conv2d(mlp_odim, conv_inner_dim, kernel_size=3, stride=1, padding=1),
+                                  nn.ReLU(True),
+                                  nn.Conv2d(conv_inner_dim, odim, kernel_size=1, stride=1, padding=0)
+                                  )
+
+    def forward(self, decout, img_shape):
+        H, W = img_shape
+        tokens = decout[-1]
+        B, S, D = tokens.shape
+        # extract features
+        feat = self.mlp(tokens)  # [B, S, mlp_odim*sub_patch_size**2]
+        feat = feat.transpose(-1, -2).reshape(B, -1, H // self.patch_size, W // self.patch_size)
+        feat = F.pixel_shuffle(feat, self.sub_patch_size)  # B, mlp_odim, H/subpatch, W/subpatch
+
+        return self.head(feat)  # B, odim, H, W
+
+
+class Cat_MLP_LocalFeatures_MiniConv_Pts3d(nn.Module):
+    """ Mixture between MLP and MLP-Convolutional head that outputs 3d points (with miniconv) and local features (with MLP).
+    Contains one MLP_MiniConv_Head for the 3D points and one Mlp for the local features.
+    The input for both heads is a concatenation of Encoder and Decoder outputs.
+    """
+
+    def __init__(self, net, has_conf=False, local_feat_dim=16, hidden_dim_factor=4., mlp_odim=24, conv_inner_dim=100, subpatch=2, **kw):
+        super().__init__()
+
+        self.local_feat_dim = local_feat_dim
+        patch_size = net.patch_embed.patch_size
+        if isinstance(patch_size, tuple):
+            assert len(patch_size) == 2 and isinstance(patch_size[0], int) and isinstance(
+                patch_size[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints."
+            assert patch_size[0] == patch_size[1], "Error, non square patches not managed"
+            patch_size = patch_size[0]
+        self.patch_size = patch_size
+
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.desc_mode = net.desc_mode
+        self.desc_conf_mode = net.desc_conf_mode
+        self.has_conf = has_conf
+        self.two_confs = net.two_confs  # independent confs for 3D regr and descs
+        idim = net.enc_embed_dim + net.dec_embed_dim
+        self.head_pts3d = MLP_MiniConv_Head(idim=idim,
+                                            mlp_hidden_dim=int(hidden_dim_factor * idim),
+                                            mlp_odim=mlp_odim + self.has_conf,
+                                            conv_inner_dim=conv_inner_dim,
+                                            odim=3 + self.has_conf,
+                                            subpatch=subpatch,
+                                            patch_size=self.patch_size,
+                                            **kw)
+
+        self.head_local_features = Mlp(in_features=idim,
+                                       hidden_features=int(hidden_dim_factor * idim),
+                                       out_features=(self.local_feat_dim + self.two_confs) * self.patch_size**2)
+
+    def forward(self, decout, img_shape):
+        enc_output, dec_output = decout[0], decout[-1]  # recover encoder and decoder outputs
+        cat_output = torch.cat([enc_output, dec_output], dim=-1)  # concatenate
+        # pass through the heads
+        pts3d = self.head_pts3d([cat_output], img_shape)
+
+        H, W = img_shape
+        B, S, D = cat_output.shape
+
+        # extract local features
+        local_features = self.head_local_features(cat_output)  # B,S,D
+        local_features = local_features.transpose(-1, -2).view(B, -1, H // self.patch_size, W // self.patch_size)
+        local_features = F.pixel_shuffle(local_features, self.patch_size)  # B,d,H,W
+
+        # post process 3D pts, descriptors and confidences
+        out = postprocess(torch.cat([pts3d, local_features], dim=1),
+                          depth_mode=self.depth_mode,
+                          conf_mode=self.conf_mode,
+                          desc_dim=self.local_feat_dim,
+                          desc_mode=self.desc_mode,
+                          two_confs=self.two_confs, desc_conf_mode=self.desc_conf_mode)
+        return out
+
+
 def mast3r_head_factory(head_type, output_mode, net, has_conf=False):
     """" build a prediction head for the decoder
     """
@@ -118,6 +227,13 @@ def mast3r_head_factory(head_type, output_mode, net, has_conf=False):
                                             depth_mode=net.depth_mode,
                                             conf_mode=net.conf_mode,
                                             head_type='regression')
+    elif head_type == 'catconv' and output_mode.startswith('pts3d+desc'):
+        local_feat_dim = int(output_mode[10:])
+        # more params (announced by a ':' and comma separated)
+        kw = {}
+        if ':' in head_type:
+            kw = eval("dict(" + head_type[8:] + ")")
+        return Cat_MLP_LocalFeatures_MiniConv_Pts3d(net, local_feat_dim=local_feat_dim, has_conf=has_conf, **kw)
     else:
        raise NotImplementedError(
            f"unexpected {head_type=} and {output_mode=}")
mast3r/cloud_opt/sparse_ga.py CHANGED
@@ -15,6 +15,7 @@ from collections import namedtuple
 from functools import lru_cache
 from scipy import sparse as sp
 import copy
+import scipy.cluster.hierarchy as sch
 
 from mast3r.utils.misc import mkdir_for, hash_md5
 from mast3r.cloud_opt.utils.losses import gamma_loss
@@ -116,7 +117,7 @@ def convert_dust3r_pairs_naming(imgs, pairs_in):
 
 
 def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf',
-                            device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw):
+                            kinematic_mode='hclust-ward', device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw):
     """ Sparse alignment with MASt3R
         imgs: list of image paths
         cache_path: path where to dump temporary files (str)
@@ -137,17 +138,54 @@ def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc
     tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \
         prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device)
 
-    # compute minimal spanning tree
-    mst = compute_min_spanning_tree(pairwise_scores)
+    # smartly combine all useful data
+    imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \
+        condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype)
+
+    # Build kinematic chain
+    if kinematic_mode == 'mst':
+        # compute minimal spanning tree
+        mst = compute_min_spanning_tree(pairwise_scores)
+
+    elif kinematic_mode.startswith('hclust'):
+        mode, linkage = kinematic_mode.split('-')
+
+        # Convert the affinity matrix to a distance matrix (if needed)
+        n_patches = (imsizes // subsample).prod(dim=1)
+        max_n_corres = 3 * torch.minimum(n_patches[:, None], n_patches[None, :])
+        pws = (pairwise_scores.clone() / max_n_corres).clip(max=1)
+        pws.fill_diagonal_(1)
+        pws = to_numpy(pws)
+        distance_matrix = np.where(pws, 1 - pws, 2)
+
+        # Compute the condensed distance matrix
+        condensed_distance_matrix = sch.distance.squareform(distance_matrix)
+
+        # Perform hierarchical clustering using the linkage method
+        Z = sch.linkage(condensed_distance_matrix, method=linkage)
+        # dendrogram = sch.dendrogram(Z)
+
+        tree = np.eye(len(imgs))
+        new_to_old_nodes = {i: i for i in range(len(imgs))}
+        for i, (a, b) in enumerate(Z[:, :2].astype(int)):
+            # given two nodes to be merged, we choose which one is the best representative
+            a = new_to_old_nodes[a]
+            b = new_to_old_nodes[b]
+            tree[a, b] = tree[b, a] = 1
+            best = a if pws[a].sum() > pws[b].sum() else b
+            new_to_old_nodes[len(imgs) + i] = best
+            pws[best] = np.maximum(pws[a], pws[b])  # update the node
+
+        pairwise_scores = torch.from_numpy(tree)  # this output just gives 1s for connected edges and zeros for the others, i.e. no scores or priority
+        mst = compute_min_spanning_tree(pairwise_scores)
+
+    else:
+        raise ValueError(f'bad {kinematic_mode=}')
 
     # remove all edges not in the spanning tree?
     # min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]}
     # tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree}
 
-    # smartly combine all useful data
-    imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \
-        condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype)
-
     imgs, res_coarse, res_fine = sparse_scene_optimizer(
         imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths, mst,
         shared_intrinsics=shared_intrinsics, cache_path=cache_path, device=device, dtype=dtype, **kw)
@@ -157,8 +195,8 @@ def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc
 
 def sparse_scene_optimizer(imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d,
                            preds_21, canonical_paths, mst, cache_path,
-                           lr1=0.2, niter1=500, loss1=gamma_loss(1.1),
-                           lr2=0.02, niter2=500, loss2=gamma_loss(0.4),
+                           lr1=0.07, niter1=300, loss1=gamma_loss(1.5),
+                           lr2=0.01, niter2=300, loss2=gamma_loss(0.5),
                            lossd=gamma_loss(1.1),
                            opt_pp=True, opt_depth=True,
                            schedule=cosine_schedule, depth_mode='add', exp_depth=False,
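The new `hclust-*` kinematic mode can be illustrated on a toy 4-image affinity matrix; this standalone sketch mirrors the merge loop above (the scores are invented):

```python
# Toy illustration of the hclust kinematic chain: affinities -> distances ->
# agglomerative clustering -> binary tree whose edges seed the spanning tree.
import numpy as np
import scipy.cluster.hierarchy as sch

pws = np.array([[1.0, 0.9, 0.1, 0.2],       # invented normalized affinities in [0, 1]
                [0.9, 1.0, 0.3, 0.1],
                [0.1, 0.3, 1.0, 0.8],
                [0.2, 0.1, 0.8, 1.0]])
distance_matrix = np.where(pws, 1 - pws, 2)  # zero affinity -> large distance (2)
Z = sch.linkage(sch.distance.squareform(distance_matrix), method='ward')

n = len(pws)
tree = np.eye(n)
new_to_old = {i: i for i in range(n)}
for i, (a, b) in enumerate(Z[:, :2].astype(int)):
    a, b = new_to_old[a], new_to_old[b]      # representatives of the two merged clusters
    tree[a, b] = tree[b, a] = 1              # connect them in the kinematic chain
    best = a if pws[a].sum() > pws[b].sum() else b
    new_to_old[n + i] = best                 # the stronger node represents the merge
    pws[best] = np.maximum(pws[a], pws[b])
print(tree)  # adjacency matrix fed to compute_min_spanning_tree in the real code
```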
mast3r/colmap/mapping.py ADDED
@@ -0,0 +1,195 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# colmap mapper/colmap point_triangulator/glomap mapper from mast3r matches
+# --------------------------------------------------------
+import pycolmap
+import os
+import os.path as path
+import kapture.io
+import kapture.io.csv
+import subprocess
+import PIL
+from tqdm import tqdm
+import PIL.Image
+import numpy as np
+from typing import List, Tuple, Union
+
+from mast3r.model import AsymmetricMASt3R
+from mast3r.colmap.database import export_matches, get_im_matches
+
+import mast3r.utils.path_to_dust3r  # noqa
+from dust3r_visloc.datasets.utils import get_resize_function
+
+import kapture
+from kapture.converter.colmap.database_extra import get_colmap_camera_ids_from_db, get_colmap_image_ids_from_db
+from kapture.utils.paths import path_secure
+
+from dust3r.datasets.utils.transforms import ImgNorm
+from dust3r.inference import inference
+
+
+def scene_prepare_images(root: str, maxdim: int, patch_size: int, image_paths: List[str]):
+    images = []
+    # image loading
+    for idx in tqdm(range(len(image_paths))):
+        rgb_image = PIL.Image.open(os.path.join(root, image_paths[idx])).convert('RGB')
+
+        # resize images
+        W, H = rgb_image.size
+        resize_func, _, to_orig = get_resize_function(maxdim, patch_size, H, W)
+        rgb_tensor = resize_func(ImgNorm(rgb_image))
+
+        # image dictionary
+        images.append({'img': rgb_tensor.unsqueeze(0),
+                       'true_shape': np.int32([rgb_tensor.shape[1:]]),
+                       'to_orig': to_orig,
+                       'idx': idx,
+                       'instance': image_paths[idx],
+                       'orig_shape': np.int32([H, W])})
+    return images
+
+
+def remove_duplicates(images, image_pairs):
+    pairs_added = set()
+    pairs = []
+    for (i, _), (j, _) in image_pairs:
+        smallidx, bigidx = min(i, j), max(i, j)
+        if (smallidx, bigidx) in pairs_added:
+            continue
+        pairs_added.add((smallidx, bigidx))
+        pairs.append((images[i], images[j]))
+    return pairs
+
+
+def run_mast3r_matching(model: AsymmetricMASt3R, maxdim: int, patch_size: int, device,
+                        kdata: kapture.Kapture, root_path: str, image_pairs_kapture: List[Tuple[str, str]],
+                        colmap_db,
+                        dense_matching: bool, pixel_tol: int, conf_thr: float, skip_geometric_verification: bool,
+                        min_len_track: int):
+    assert kdata.records_camera is not None
+    image_paths = kdata.records_camera.data_list()
+    image_path_to_idx = {image_path: idx for idx, image_path in enumerate(image_paths)}
+    image_path_to_ts = {kdata.records_camera[ts, camid]: (ts, camid) for ts, camid in kdata.records_camera.key_pairs()}
+
+    images = scene_prepare_images(root_path, maxdim, patch_size, image_paths)
+    image_pairs = [((image_path_to_idx[image_path1], image_path1), (image_path_to_idx[image_path2], image_path2))
+                   for image_path1, image_path2 in image_pairs_kapture]
+    matching_pairs = remove_duplicates(images, image_pairs)
+
+    colmap_camera_ids = get_colmap_camera_ids_from_db(colmap_db, kdata.records_camera)
+    colmap_image_ids = get_colmap_image_ids_from_db(colmap_db)
+    im_keypoints = {idx: {} for idx in range(len(image_paths))}
+
+    im_matches = {}
+    image_to_colmap = {}
+    for image_path, idx in image_path_to_idx.items():
+        _, camid = image_path_to_ts[image_path]
+        colmap_camid = colmap_camera_ids[camid]
+        colmap_imid = colmap_image_ids[image_path]
+        image_to_colmap[idx] = {
+            'colmap_imid': colmap_imid,
+            'colmap_camid': colmap_camid
+        }
+
+    # compute 2D-2D matching from dust3r inference
+    for chunk in tqdm(range(0, len(matching_pairs), 4)):
+        pairs_chunk = matching_pairs[chunk:chunk + 4]
+        output = inference(pairs_chunk, model, device, batch_size=1, verbose=False)
+        pred1, pred2 = output['pred1'], output['pred2']
+        # TODO handle caching
+        im_images_chunk = get_im_matches(pred1, pred2, pairs_chunk, image_to_colmap,
+                                         im_keypoints, conf_thr, not dense_matching, pixel_tol)
+        im_matches.update(im_images_chunk.items())
+
+    # filter matches, convert them and export keypoints and matches to colmap db
+    colmap_image_pairs = export_matches(
+        colmap_db, images, image_to_colmap, im_keypoints, im_matches, min_len_track, skip_geometric_verification)
+    colmap_db.commit()
+
+    return colmap_image_pairs
+
+
+def pycolmap_run_triangulator(colmap_db_path, prior_recon_path, recon_path, image_root_path):
+    print("running triangulation")
+    reconstruction = pycolmap.Reconstruction(prior_recon_path)
+    pycolmap.triangulate_points(
+        reconstruction=reconstruction,
+        database_path=colmap_db_path,
+        image_path=image_root_path,
+        output_path=recon_path,
+        refine_intrinsics=False,
+    )
+
+
+def pycolmap_run_mapper(colmap_db_path, recon_path, image_root_path):
+    print("running mapping")
+    reconstructions = pycolmap.incremental_mapping(
+        database_path=colmap_db_path,
+        image_path=image_root_path,
+        output_path=recon_path,
+        options=pycolmap.IncrementalPipelineOptions({'multiple_models': False,
+                                                     'extract_colors': True,
+                                                     })
+    )
+
+
+def glomap_run_mapper(glomap_bin, colmap_db_path, recon_path, image_root_path):
+    print("running mapping")
+    args = [
+        'mapper',
+        '--database_path',
+        colmap_db_path,
+        '--image_path',
+        image_root_path,
+        '--output_path',
+        recon_path
+    ]
+    args.insert(0, glomap_bin)
+    glomap_process = subprocess.Popen(args)
+    glomap_process.wait()
+
+    if glomap_process.returncode != 0:
+        raise ValueError(
+            '\nSubprocess Error (Return code:'
+            f' {glomap_process.returncode} )')
+
+
+def kapture_import_image_folder_or_list(images_path: Union[str, Tuple[str, List[str]]], use_single_camera=False) -> kapture.Kapture:
+    images = kapture.RecordsCamera()
+
+    if isinstance(images_path, str):
+        images_root = images_path
+        file_list = [path.relpath(path.join(dirpath, filename), images_root)
+                     for dirpath, dirs, filenames in os.walk(images_root)
+                     for filename in filenames]
+        file_list = sorted(file_list)
+    else:
+        images_root, file_list = images_path
+
+    sensors = kapture.Sensors()
+    for n, filename in enumerate(file_list):
+        # test if file is a valid image
+        try:
+            # lazy load
+            with PIL.Image.open(path.join(images_root, filename)) as im:
+                width, height = im.size
+                model_params = [width, height]
+        except (OSError, PIL.UnidentifiedImageError):
+            # It is not a valid image: skip it
+            print(f'Skipping invalid image file {filename}')
+            continue
+
+        camera_id = 'sensor'
+        if use_single_camera and camera_id not in sensors:
+            sensors[camera_id] = kapture.Camera(kapture.CameraType.UNKNOWN_CAMERA, model_params)
+        elif use_single_camera:
+            assert sensors[camera_id].camera_params[0] == width and sensors[camera_id].camera_params[1] == height
+        else:
+            camera_id = camera_id + f'{n}'
+            sensors[camera_id] = kapture.Camera(kapture.CameraType.UNKNOWN_CAMERA, model_params)
+
+        images[(n, camera_id)] = path_secure(filename)  # don't forget windows
+
+    return kapture.Kapture(sensors=sensors, records_camera=images)
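For reference, `glomap_run_mapper` is a thin subprocess wrapper; calling it directly is equivalent to invoking the glomap CLI. A hedged usage sketch (all paths below are placeholders):

```python
# Equivalent shell command:
#   glomap mapper --database_path scene/colmap.db --image_path scene/images --output_path scene/reconstruction
from mast3r.colmap.mapping import glomap_run_mapper

glomap_run_mapper(glomap_bin='glomap',               # glomap binary must be on PATH
                  colmap_db_path='scene/colmap.db',  # db filled by run_mast3r_matching
                  recon_path='scene/reconstruction',
                  image_root_path='scene/images')
```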
mast3r/demo.py CHANGED
@@ -15,12 +15,14 @@ import copy
 from scipy.spatial.transform import Rotation
 import tempfile
 import shutil
+import torch
 
 from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
 from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess
+from mast3r.image_pairs import make_pairs
+from mast3r.retrieval.processor import Retriever
 
 import mast3r.utils.path_to_dust3r  # noqa
-from dust3r.image_pairs import make_pairs
 from dust3r.utils.image import load_images
 from dust3r.utils.device import to_numpy
 from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
@@ -29,7 +31,7 @@ from dust3r.demo import get_args_parser as dust3r_get_args_parser
 import matplotlib.pyplot as pl
 
 
-class SparseGAState():
+class SparseGAState:
     def __init__(self, sparse_ga, should_delete=False, cache_dir=None, outfile_name=None):
         self.sparse_ga = sparse_ga
         self.cache_dir = cache_dir
@@ -52,6 +54,7 @@ def get_args_parser():
     parser.add_argument('--share', action='store_true')
     parser.add_argument('--gradio_delete_cache', default=None, type=int,
                         help='age/frequency at which gradio removes the file. If >0, matching cache is purged')
+    parser.add_argument('--retrieval_model', default=None, type=str, help="retrieval_model to be loaded")
 
     actions = parser._actions
     for action in actions:
@@ -136,10 +139,10 @@ def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=F
                                  transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)
 
 
-def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent, image_size, current_scene_state,
-                            filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
-                            as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, scenegraph_type, winsize,
-                            win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw):
+def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, device, silent, image_size,
+                            current_scene_state, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr,
+                            matching_conf_thr, as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
+                            scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw):
     """
    from a list of images, run mast3r inference, sparse global aligner.
    then run get_3D_model_from_scene
@@ -155,10 +158,26 @@ def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent,
         scene_graph_params.append(str(winsize))
     elif scenegraph_type == "oneref":
         scene_graph_params.append(str(refid))
+    elif scenegraph_type == "retrieval":
+        scene_graph_params.append(str(winsize))  # Na
+        scene_graph_params.append(str(refid))  # k
+
     if scenegraph_type in ["swin", "logwin"] and not win_cyclic:
         scene_graph_params.append('noncyclic')
     scene_graph = '-'.join(scene_graph_params)
-    pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True)
+
+    sim_matrix = None
+    if 'retrieval' in scenegraph_type:
+        assert retrieval_model is not None
+        retriever = Retriever(retrieval_model, backbone=model, device=device)
+        with torch.no_grad():
+            sim_matrix = retriever(filelist)
+
+        # Cleanup
+        del retriever
+        torch.cuda.empty_cache()
+
+    pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True, sim_mat=sim_matrix)
     if optim_level == 'coarse':
         niter2 = 0
     # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation)
@@ -190,39 +209,66 @@ def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent,
 
 def set_scenegraph_options(inputfiles, win_cyclic, refid, scenegraph_type):
     num_files = len(inputfiles) if inputfiles is not None else 1
-    show_win_controls = scenegraph_type in ["swin", "logwin"]
-    show_winsize = scenegraph_type in ["swin", "logwin"]
-    show_cyclic = scenegraph_type in ["swin", "logwin"]
     max_winsize, min_winsize = 1, 1
-    if scenegraph_type == "swin":
-        if win_cyclic:
-            max_winsize = max(1, math.ceil((num_files - 1) / 2))
-        else:
-            max_winsize = num_files - 1
-    elif scenegraph_type == "logwin":
-        if win_cyclic:
-            half_size = math.ceil((num_files - 1) / 2)
-            max_winsize = max(1, math.ceil(math.log(half_size, 2)))
+
+    winsize = gradio.Slider(visible=False)
+    win_cyclic = gradio.Checkbox(visible=False)
+    graph_opt = gradio.Column(visible=False)
+    refid = gradio.Slider(visible=False)
+
+    if scenegraph_type in ["swin", "logwin"]:
+        if scenegraph_type == "swin":
+            if win_cyclic:
+                max_winsize = max(1, math.ceil((num_files - 1) / 2))
+            else:
+                max_winsize = num_files - 1
         else:
-            max_winsize = max(1, math.ceil(math.log(num_files, 2)))
-    winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
-                            minimum=min_winsize, maximum=max_winsize, step=1, visible=show_winsize)
-    win_cyclic = gradio.Checkbox(value=win_cyclic, label="Cyclic sequence", visible=show_cyclic)
-    win_col = gradio.Column(visible=show_win_controls)
-    refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
-                          maximum=num_files - 1, step=1, visible=scenegraph_type == 'oneref')
-    return win_col, winsize, win_cyclic, refid
+            if win_cyclic:
+                half_size = math.ceil((num_files - 1) / 2)
+                max_winsize = max(1, math.ceil(math.log(half_size, 2)))
+            else:
+                max_winsize = max(1, math.ceil(math.log(num_files, 2)))
+
+        winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+                                minimum=min_winsize, maximum=max_winsize, step=1, visible=True)
+        win_cyclic = gradio.Checkbox(value=win_cyclic, label="Cyclic sequence", visible=True)
+        graph_opt = gradio.Column(visible=True)
+        refid = gradio.Slider(visible=False)
+
+    elif scenegraph_type == "retrieval":
+        graph_opt = gradio.Column(visible=True)
+        winsize = gradio.Slider(label="Retrieval: Num. key images", value=min(20, num_files),
+                                minimum=0, maximum=num_files, step=1, visible=True)
+        win_cyclic = gradio.Checkbox(visible=False)
+        refid = gradio.Slider(label="Retrieval: Num neighbors", value=min(num_files - 1, 10), minimum=1,
+                              maximum=num_files - 1, step=1, visible=True)
+
+    elif scenegraph_type == "oneref":
+        graph_opt = gradio.Column(visible=True)
+        winsize = gradio.Slider(visible=False)
+        win_cyclic = gradio.Checkbox(visible=False)
+        refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+                              maximum=num_files - 1, step=1, visible=True)
 
+    return graph_opt, winsize, win_cyclic, refid
 
-def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False,
+
+def main_demo(tmpdirname, model, retrieval_model, device, image_size, server_name, server_port, silent=False,
               share=False, gradio_delete_cache=False):
     if not silent:
         print('Outputting stuff in', tmpdirname)
 
-    recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, gradio_delete_cache, model, device,
-                                  silent, image_size)
+    recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, gradio_delete_cache, model,
+                                  retrieval_model, device, silent, image_size)
     model_from_scene_fun = functools.partial(get_3D_model_from_scene, silent)
 
+    available_scenegraph_type = [("complete: all possible image pairs", "complete"),
+                                 ("swin: sliding window", "swin"),
+                                 ("logwin: sliding window with long range", "logwin"),
+                                 ("oneref: match one image with all", "oneref")]
+    if retrieval_model is not None:
+        available_scenegraph_type.insert(1, ("retrieval: connect views based on similarity", "retrieval"))
+
     def get_context(delete_cache):
         css = """.gradio-container {margin: 0 !important; min-width: 100%};"""
         title = "MASt3R Demo"
@@ -241,33 +287,31 @@ def main_demo(tmpdirname, model, device, image_size, server_name, server_port, s
             with gradio.Column():
                 with gradio.Row():
                     lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01)
-                    niter1 = gradio.Number(value=500, precision=0, minimum=0, maximum=10_000,
-                                           label="num_iterations", info="For coarse alignment!")
-                    lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001)
-                    niter2 = gradio.Number(value=200, precision=0, minimum=0, maximum=100_000,
-                                           label="num_iterations", info="For refinement!")
+                    niter1 = gradio.Slider(value=300, minimum=0, maximum=1000, step=1,
+                                           label="Iterations", info="For coarse alignment")
+                    lr2 = gradio.Slider(label="Fine LR", value=0.01, minimum=0.005, maximum=0.05, step=0.001)
+                    niter2 = gradio.Slider(value=300, minimum=0, maximum=1000, step=1,
+                                           label="Iterations", info="For refinement")
                     optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"],
                                                   value='refine', label="OptLevel",
                                                   info="Optimization level")
                 with gradio.Row():
-                    matching_conf_thr = gradio.Slider(label="Matching Confidence Thr", value=5.,
+                    matching_conf_thr = gradio.Slider(label="Matching Confidence Thr", value=0.,
                                                       minimum=0., maximum=30., step=0.1,
                                                       info="Before Fallback to Regr3D!")
                     shared_intrinsics = gradio.Checkbox(value=False, label="Shared intrinsics",
                                                         info="Only optimize one set of intrinsics for all views")
-                    scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"),
-                                                       ("swin: sliding window", "swin"),
-                                                       ("logwin: sliding window with long range", "logwin"),
-                                                       ("oneref: match one image with all", "oneref")],
+                    scenegraph_type = gradio.Dropdown(available_scenegraph_type,
                                                       value='complete', label="Scenegraph",
                                                       info="Define how to make pairs",
                                                       interactive=True)
-                    with gradio.Column(visible=False) as win_col:
+                    with gradio.Column(visible=False) as graph_opt:
                         winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
                                                 minimum=1, maximum=1, step=1)
                         win_cyclic = gradio.Checkbox(value=False, label="Cyclic sequence")
-                    refid = gradio.Slider(label="Scene Graph: Id", value=0,
-                                          minimum=0, maximum=0, step=1, visible=False)
+                        refid = gradio.Slider(label="Scene Graph: Id", value=0,
+                                              minimum=0, maximum=0, step=1, visible=False)
+
             run_btn = gradio.Button("Run")
 
             with gradio.Row():
@@ -288,13 +332,13 @@ def main_demo(tmpdirname, model, device, image_size, server_name, server_port, s
         # events
         scenegraph_type.change(set_scenegraph_options,
                                inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                               outputs=[win_col, winsize, win_cyclic, refid])
+                               outputs=[graph_opt, winsize, win_cyclic, refid])
         inputfiles.change(set_scenegraph_options,
                           inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                          outputs=[win_col, winsize, win_cyclic, refid])
+                          outputs=[graph_opt, winsize, win_cyclic, refid])
         win_cyclic.change(set_scenegraph_options,
                           inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                          outputs=[win_col, winsize, win_cyclic, refid])
+                          outputs=[graph_opt, winsize, win_cyclic, refid])
         run_btn.click(fn=recon_fun,
                       inputs=[scene, inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
                               as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
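Outside the gradio UI, the retrieval pairing path added above boils down to a few calls. A minimal sketch; the file names and the retrieval checkpoint are placeholders:

```python
# Hedged sketch of retrieval-based pairing: the demo builds 'retrieval-{Na}-{k}'
# from the two sliders and passes the similarity matrix to make_pairs.
import torch
from mast3r.model import AsymmetricMASt3R
from mast3r.retrieval.processor import Retriever
from mast3r.image_pairs import make_pairs
from dust3r.utils.image import load_images

filelist = ['img_000.jpg', 'img_001.jpg', 'img_002.jpg']   # placeholder paths
imgs = load_images(filelist, size=512)
model = AsymmetricMASt3R.from_pretrained(
    'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric').to('cuda')
retriever = Retriever('checkpoints/retrieval_model.pth',   # placeholder checkpoint
                      backbone=model, device='cuda')
with torch.no_grad():
    sim_matrix = retriever(filelist)
del retriever
torch.cuda.empty_cache()

pairs = make_pairs(imgs, scene_graph='retrieval-20-10',    # Na=20 key images, k=10 neighbors
                   prefilter=None, symmetrize=True, sim_mat=sim_matrix)
```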
mast3r/demo_glomap.py ADDED
@@ -0,0 +1,338 @@
+ #!/usr/bin/env python3
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # gradio demo functions
+ # --------------------------------------------------------
+ import pycolmap
+ import gradio
+ import os
+ import numpy as np
+ import functools
+ import trimesh
+ import copy
+ from scipy.spatial.transform import Rotation
+ import tempfile
+ import shutil
+ import PIL.Image
+ import torch
+
+ from kapture.converter.colmap.database_extra import kapture_to_colmap
+ from kapture.converter.colmap.database import COLMAPDatabase
+
+ from mast3r.colmap.mapping import kapture_import_image_folder_or_list, run_mast3r_matching, glomap_run_mapper
+ from mast3r.demo import set_scenegraph_options
+ from mast3r.retrieval.processor import Retriever
+ from mast3r.image_pairs import make_pairs
+
+ import mast3r.utils.path_to_dust3r  # noqa
+ from dust3r.utils.image import load_images
+ from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL
+ from dust3r.demo import get_args_parser as dust3r_get_args_parser
+
+ import matplotlib.pyplot as pl
+
+
+ class GlomapRecon:
+     def __init__(self, world_to_cam, intrinsics, points3d, imgs):
+         self.world_to_cam = world_to_cam
+         self.intrinsics = intrinsics
+         self.points3d = points3d
+         self.imgs = imgs
+
+
+ class GlomapReconState:
+     def __init__(self, glomap_recon, should_delete=False, cache_dir=None, outfile_name=None):
+         self.glomap_recon = glomap_recon
+         self.cache_dir = cache_dir
+         self.outfile_name = outfile_name
+         self.should_delete = should_delete
+
+     def __del__(self):
+         if not self.should_delete:
+             return
+         if self.cache_dir is not None and os.path.isdir(self.cache_dir):
+             shutil.rmtree(self.cache_dir)
+         self.cache_dir = None
+         if self.outfile_name is not None and os.path.isfile(self.outfile_name):
+             os.remove(self.outfile_name)
+         self.outfile_name = None
+
+
+ def get_args_parser():
+     parser = dust3r_get_args_parser()
+     parser.add_argument('--share', action='store_true')
+     parser.add_argument('--gradio_delete_cache', default=None, type=int,
+                         help='age/frequency at which gradio removes cached files. If >0, the matching cache is purged')
+     parser.add_argument('--glomap_bin', default='glomap', type=str, help='path to the glomap binary')
+     parser.add_argument('--retrieval_model', default=None, type=str, help="retrieval_model to be loaded")
+
+     actions = parser._actions
+     for action in actions:
+         if action.dest == 'model_name':
+             action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"]
+     # change defaults
+     parser.prog = 'mast3r demo'
+     return parser
+
+
+ def get_reconstructed_scene(glomap_bin, outdir, gradio_delete_cache, model, retrieval_model, device, silent, image_size,
+                             current_scene_state, filelist, transparent_cams, cam_size, scenegraph_type, winsize,
+                             win_cyclic, refid, shared_intrinsics, **kw):
+     """
+     from a list of images, run mast3r matching, export the matches to a colmap database,
+     run the glomap mapper, then extract a 3D model with get_3D_model_from_scene
+     """
+     imgs = load_images(filelist, size=image_size, verbose=not silent)
+     if len(imgs) == 1:
+         imgs = [imgs[0], copy.deepcopy(imgs[0])]
+         imgs[1]['idx'] = 1
+         filelist = [filelist[0], filelist[0]]
+
+     scene_graph_params = [scenegraph_type]
+     if scenegraph_type in ["swin", "logwin"]:
+         scene_graph_params.append(str(winsize))
+     elif scenegraph_type == "oneref":
+         scene_graph_params.append(str(refid))
+     elif scenegraph_type == "retrieval":
+         scene_graph_params.append(str(winsize))  # Na
+         scene_graph_params.append(str(refid))  # k
+
+     if scenegraph_type in ["swin", "logwin"] and not win_cyclic:
+         scene_graph_params.append('noncyclic')
+     scene_graph = '-'.join(scene_graph_params)
+
+     sim_matrix = None
+     if 'retrieval' in scenegraph_type:
+         assert retrieval_model is not None
+         retriever = Retriever(retrieval_model, backbone=model, device=device)
+         with torch.no_grad():
+             sim_matrix = retriever(filelist)
+
+         # Cleanup
+         del retriever
+         torch.cuda.empty_cache()
+
+     pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True, sim_mat=sim_matrix)
+
+     if current_scene_state is not None and \
+             not current_scene_state.should_delete and \
+             current_scene_state.cache_dir is not None:
+         cache_dir = current_scene_state.cache_dir
+     elif gradio_delete_cache:
+         cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir)
+     else:
+         cache_dir = os.path.join(outdir, 'cache')
+
+     root_path = os.path.commonpath(filelist)
+     filelist_relpath = [
+         os.path.relpath(filename, root_path).replace('\\', '/')
+         for filename in filelist
+     ]
+     kdata = kapture_import_image_folder_or_list((root_path, filelist_relpath), shared_intrinsics)
+     image_pairs = [
+         (filelist_relpath[img1['idx']], filelist_relpath[img2['idx']])
+         for img1, img2 in pairs
+     ]
+
+     colmap_db_path = os.path.join(cache_dir, 'colmap.db')
+     if os.path.isfile(colmap_db_path):
+         os.remove(colmap_db_path)
+
+     os.makedirs(os.path.dirname(colmap_db_path), exist_ok=True)
+     colmap_db = COLMAPDatabase.connect(colmap_db_path)
+     try:
+         kapture_to_colmap(kdata, root_path, tar_handler=None, database=colmap_db,
+                           keypoints_type=None, descriptors_type=None, export_two_view_geometry=False)
+         colmap_image_pairs = run_mast3r_matching(model, image_size, 16, device,
+                                                  kdata, root_path, image_pairs, colmap_db,
+                                                  False, 5, 1.001,
+                                                  False, 3)
+         colmap_db.close()
+     except Exception as e:
+         print(f'Error {e}')
+         colmap_db.close()
+         raise
+
+     if len(colmap_image_pairs) == 0:
+         raise Exception("no matches were kept")
+
+     # colmap db is now full, run colmap
+     print("verify_matches")
+     pairs_path = os.path.join(cache_dir, 'pairs.txt')
+     with open(pairs_path, "w") as f:
+         for image_path1, image_path2 in colmap_image_pairs:
+             f.write("{} {}\n".format(image_path1, image_path2))
+     pycolmap.verify_matches(colmap_db_path, pairs_path)
+
+     reconstruction_path = os.path.join(cache_dir, "reconstruction")
+     if os.path.isdir(reconstruction_path):
+         shutil.rmtree(reconstruction_path)
+     os.makedirs(reconstruction_path, exist_ok=True)
+     glomap_run_mapper(glomap_bin, colmap_db_path, reconstruction_path, root_path)
+
+     if current_scene_state is not None and \
+             not current_scene_state.should_delete and \
+             current_scene_state.outfile_name is not None:
+         outfile_name = current_scene_state.outfile_name
+     else:
+         outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir)
+
+     output_recon = pycolmap.Reconstruction(os.path.join(reconstruction_path, '0'))
+     print(output_recon.summary())
+
+     colmap_world_to_cam = {}
+     colmap_intrinsics = {}
+     colmap_image_id_to_name = {}
+     images = {}
+     num_reg_images = output_recon.num_reg_images()
+     for idx, (colmap_imgid, colmap_image) in enumerate(output_recon.images.items()):
+         colmap_image_id_to_name[colmap_imgid] = colmap_image.name
+         if callable(colmap_image.cam_from_world.matrix):
+             colmap_world_to_cam[colmap_imgid] = colmap_image.cam_from_world.matrix()
+         else:
+             colmap_world_to_cam[colmap_imgid] = colmap_image.cam_from_world.matrix
+         camera = output_recon.cameras[colmap_image.camera_id]
+         K = np.eye(3)
+         K[0, 0] = camera.focal_length_x
+         K[1, 1] = camera.focal_length_y
+         K[0, 2] = camera.principal_point_x
+         K[1, 2] = camera.principal_point_y
+         colmap_intrinsics[colmap_imgid] = K
+
+         with PIL.Image.open(os.path.join(root_path, colmap_image.name)) as im:
+             images[colmap_imgid] = np.asarray(im)
+
+         if idx + 1 == num_reg_images:
+             break  # bug with the iterable ?
+     points3D = []
+     num_points3D = output_recon.num_points3D()
+     for idx, (pt3d_id, pts3d) in enumerate(output_recon.points3D.items()):
+         points3D.append((pts3d.xyz, pts3d.color))
+         if idx + 1 == num_points3D:
+             break  # bug with the iterable ?
+     scene = GlomapRecon(colmap_world_to_cam, colmap_intrinsics, points3D, images)
+     scene_state = GlomapReconState(scene, gradio_delete_cache, cache_dir, outfile_name)
+     outfile = get_3D_model_from_scene(silent, scene_state, transparent_cams, cam_size)
+     return scene_state, outfile
+
+
+ def get_3D_model_from_scene(silent, scene_state, transparent_cams=False, cam_size=0.05):
+     """
+     extract 3D_model (glb file) from a reconstructed scene
+     """
+     if scene_state is None:
+         return None
+     outfile = scene_state.outfile_name
+     if outfile is None:
+         return None
+
+     recon = scene_state.glomap_recon
+
+     scene = trimesh.Scene()
+     pts = np.stack([p[0] for p in recon.points3d], axis=0)
+     col = np.stack([p[1] for p in recon.points3d], axis=0)
+     pct = trimesh.PointCloud(pts, colors=col)
+     scene.add_geometry(pct)
+
+     # add each camera
+     cams2world = []
+     for i, (img_id, pose_w2c_3x4) in enumerate(recon.world_to_cam.items()):
+         intrinsics = recon.intrinsics[img_id]
+         focal = (intrinsics[0, 0] + intrinsics[1, 1]) / 2.0
+         camera_edge_color = CAM_COLORS[i % len(CAM_COLORS)]
+         pose_w2c = np.eye(4)
+         pose_w2c[:3, :] = pose_w2c_3x4
+         pose_c2w = np.linalg.inv(pose_w2c)
+         cams2world.append(pose_c2w)
+         add_scene_cam(scene, pose_c2w, camera_edge_color,
+                       None if transparent_cams else recon.imgs[img_id], focal,
+                       imsize=recon.imgs[img_id].shape[1::-1], screen_width=cam_size)
+
+     rot = np.eye(4)
+     rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+     scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+     if not silent:
+         print('(exporting 3D scene to', outfile, ')')
+     scene.export(file_obj=outfile)
+
+     return outfile
+
+
+ def main_demo(glomap_bin, tmpdirname, model, retrieval_model, device, image_size, server_name, server_port,
+               silent=False, share=False, gradio_delete_cache=False):
+     if not silent:
+         print('Outputting stuff in', tmpdirname)
+
+     recon_fun = functools.partial(get_reconstructed_scene, glomap_bin, tmpdirname, gradio_delete_cache, model,
+                                   retrieval_model, device, silent, image_size)
+     model_from_scene_fun = functools.partial(get_3D_model_from_scene, silent)
+
+     available_scenegraph_type = [("complete: all possible image pairs", "complete"),
+                                  ("swin: sliding window", "swin"),
+                                  ("logwin: sliding window with long range", "logwin"),
+                                  ("oneref: match one image with all", "oneref")]
+     if retrieval_model is not None:
+         available_scenegraph_type.insert(1, ("retrieval: connect views based on similarity", "retrieval"))
+
+     def get_context(delete_cache):
+         css = """.gradio-container {margin: 0 !important; min-width: 100%};"""
+         title = "MASt3R Demo"
+         if delete_cache:
+             return gradio.Blocks(css=css, title=title, delete_cache=(delete_cache, delete_cache))
+         else:
+             return gradio.Blocks(css=css, title=title)  # for compatibility with older versions
+
+     with get_context(gradio_delete_cache) as demo:
+         # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
+         scene = gradio.State(None)
+         gradio.HTML('<h2 style="text-align: center;">MASt3R Demo</h2>')
+         with gradio.Column():
+             inputfiles = gradio.File(file_count="multiple")
+             with gradio.Row():
+                 shared_intrinsics = gradio.Checkbox(value=False, label="Shared intrinsics",
+                                                     info="Only optimize one set of intrinsics for all views")
+                 scenegraph_type = gradio.Dropdown(available_scenegraph_type,
+                                                   value='complete', label="Scenegraph",
+                                                   info="Define how to make pairs",
+                                                   interactive=True)
+                 with gradio.Column(visible=False) as win_col:
+                     winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
+                                             minimum=1, maximum=1, step=1)
+                     win_cyclic = gradio.Checkbox(value=False, label="Cyclic sequence")
+                 refid = gradio.Slider(label="Scene Graph: Id", value=0,
+                                       minimum=0, maximum=0, step=1, visible=False)
+             run_btn = gradio.Button("Run")
+
+             with gradio.Row():
+                 # adjust the camera size in the output pointcloud
+                 cam_size = gradio.Slider(label="cam_size", value=0.01, minimum=0.001, maximum=1.0, step=0.001)
+             with gradio.Row():
+                 transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
+
+             outmodel = gradio.Model3D()
+
+             # events
+             scenegraph_type.change(set_scenegraph_options,
+                                    inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
+                                    outputs=[win_col, winsize, win_cyclic, refid])
+             inputfiles.change(set_scenegraph_options,
+                               inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
+                               outputs=[win_col, winsize, win_cyclic, refid])
+             win_cyclic.change(set_scenegraph_options,
+                               inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
+                               outputs=[win_col, winsize, win_cyclic, refid])
+             run_btn.click(fn=recon_fun,
+                           inputs=[scene, inputfiles, transparent_cams, cam_size,
+                                   scenegraph_type, winsize, win_cyclic, refid, shared_intrinsics],
+                           outputs=[scene, outmodel])
+             cam_size.change(fn=model_from_scene_fun,
+                             inputs=[scene, transparent_cams, cam_size],
+                             outputs=outmodel)
+             transparent_cams.change(model_from_scene_fun,
+                                     inputs=[scene, transparent_cams, cam_size],
+                                     outputs=outmodel)
+     demo.launch(share=share, server_name=server_name, server_port=server_port)
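
The gradio UI above is a thin wrapper around a handful of library calls. Below is a minimal headless sketch of the same matching → COLMAP database → glomap pipeline; the image folder, cache paths and checkpoint name are illustrative assumptions, and the positional matching parameters simply mirror the values hard-coded in the demo.

```python
# Headless sketch of the demo_glomap.py pipeline (paths are hypothetical).
import os

import mast3r.utils.path_to_dust3r  # noqa
from dust3r.utils.image import load_images

from kapture.converter.colmap.database import COLMAPDatabase
from kapture.converter.colmap.database_extra import kapture_to_colmap

from mast3r.model import AsymmetricMASt3R
from mast3r.image_pairs import make_pairs
from mast3r.colmap.mapping import (kapture_import_image_folder_or_list,
                                   run_mast3r_matching, glomap_run_mapper)

device, image_size = 'cuda', 512
model = AsymmetricMASt3R.from_pretrained(
    'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric').to(device)

root_path = 'data/scene'  # hypothetical image folder
filelist = [os.path.join(root_path, f) for f in sorted(os.listdir(root_path))]
imgs = load_images(filelist, size=image_size)
pairs = make_pairs(imgs, scene_graph='complete', prefilter=None, symmetrize=True)

relpaths = [os.path.relpath(f, root_path).replace('\\', '/') for f in filelist]
kdata = kapture_import_image_folder_or_list((root_path, relpaths), False)  # shared_intrinsics=False
image_pairs = [(relpaths[a['idx']], relpaths[b['idx']]) for a, b in pairs]

os.makedirs('cache/reconstruction', exist_ok=True)
colmap_db = COLMAPDatabase.connect('cache/colmap.db')
kapture_to_colmap(kdata, root_path, tar_handler=None, database=colmap_db,
                  keypoints_type=None, descriptors_type=None, export_two_view_geometry=False)
# same positional matching parameters as hard-coded in the demo above
kept_pairs = run_mast3r_matching(model, image_size, 16, device,
                                 kdata, root_path, image_pairs, colmap_db,
                                 False, 5, 1.001, False, 3)
colmap_db.close()

# (the demo additionally runs pycolmap.verify_matches on the kept pairs before mapping)
glomap_run_mapper('glomap', 'cache/colmap.db', 'cache/reconstruction', root_path)
```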
mast3r/image_pairs.py ADDED
@@ -0,0 +1,115 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # utilities needed to load image pairs
+ # --------------------------------------------------------
+ import numpy as np
+ import torch
+
+ from mast3r.retrieval.graph import make_pairs_fps
+
+
+ def make_pairs(imgs, scene_graph='complete', prefilter=None, symmetrize=True, sim_mat=None):
+     pairs = []
+     if scene_graph == 'complete':  # complete graph
+         for i in range(len(imgs)):
+             for j in range(i):
+                 pairs.append((imgs[i], imgs[j]))
+     elif scene_graph.startswith('swin'):
+         iscyclic = not scene_graph.endswith('noncyclic')
+         try:
+             winsize = int(scene_graph.split('-')[1])
+         except Exception:
+             winsize = 3
+         pairsid = set()
+         for i in range(len(imgs)):
+             for j in range(1, winsize + 1):
+                 idx = (i + j)
+                 if iscyclic:
+                     idx = idx % len(imgs)  # explicit loop closure
+                 if idx >= len(imgs):
+                     continue
+                 pairsid.add((i, idx) if i < idx else (idx, i))
+         for i, j in pairsid:
+             pairs.append((imgs[i], imgs[j]))
+     elif scene_graph.startswith('logwin'):
+         iscyclic = not scene_graph.endswith('noncyclic')
+         try:
+             winsize = int(scene_graph.split('-')[1])
+         except Exception:
+             winsize = 3
+         offsets = [2**i for i in range(winsize)]
+         pairsid = set()
+         for i in range(len(imgs)):
+             ixs_l = [i - off for off in offsets]
+             ixs_r = [i + off for off in offsets]
+             for j in ixs_l + ixs_r:
+                 if iscyclic:
+                     j = j % len(imgs)  # explicit loop closure
+                 if j < 0 or j >= len(imgs) or j == i:
+                     continue
+                 pairsid.add((i, j) if i < j else (j, i))
+         for i, j in pairsid:
+             pairs.append((imgs[i], imgs[j]))
+     elif scene_graph.startswith('oneref'):
+         refid = int(scene_graph.split('-')[1]) if '-' in scene_graph else 0
+         for j in range(len(imgs)):
+             if j != refid:
+                 pairs.append((imgs[refid], imgs[j]))
+     elif scene_graph.startswith('retrieval'):
+         mode, Na, k = scene_graph.split('-')
+         assert sim_mat is not None, "sim_mat is required for retrieval mode"
+
+         fps_pairs, anchor_idxs = make_pairs_fps(sim_mat, Na=int(Na), tokK=int(k), dist_thresh=None)
+
+         for i, j in fps_pairs:
+             pairs.append((imgs[i], imgs[j]))
+     else:
+         raise ValueError(f'unrecognized value for {scene_graph=}')
+
+     if symmetrize:
+         pairs += [(img2, img1) for img1, img2 in pairs]
+
+     # now, remove edges
+     if isinstance(prefilter, str) and prefilter.startswith('seq'):
+         pairs = filter_pairs_seq(pairs, int(prefilter[3:]))
+
+     if isinstance(prefilter, str) and prefilter.startswith('cyc'):
+         pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True)
+
+     return pairs
+
+
+ def sel(x, kept):
+     if isinstance(x, dict):
+         return {k: sel(v, kept) for k, v in x.items()}
+     if isinstance(x, (torch.Tensor, np.ndarray)):
+         return x[kept]
+     if isinstance(x, (tuple, list)):
+         return type(x)([x[k] for k in kept])
+
+
+ def _filter_edges_seq(edges, seq_dis_thr, cyclic=False):
+     # number of images
+     n = max(max(e) for e in edges) + 1
+
+     kept = []
+     for e, (i, j) in enumerate(edges):
+         dis = abs(i - j)
+         if cyclic:
+             dis = min(dis, abs(i + n - j), abs(i - n - j))
+         if dis <= seq_dis_thr:
+             kept.append(e)
+     return kept
+
+
+ def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False):
+     edges = [(img1['idx'], img2['idx']) for img1, img2 in pairs]
+     kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
+     return [pairs[i] for i in kept]
+
+
+ def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False):
+     edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])]
+     kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
+     print(f'>> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges')
+     return sel(view1, kept), sel(view2, kept), sel(pred1, kept), sel(pred2, kept)
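
To make the scene-graph strings concrete, here is a toy run of `make_pairs`. The stand-in image dicts are a simplifying assumption: with no `prefilter` and `symmetrize=False`, pair construction only indexes into the list.

```python
# Toy illustration of the scene-graph strings parsed above (6 hypothetical images).
from mast3r.image_pairs import make_pairs

imgs = [dict(idx=i) for i in range(6)]  # make_pairs treats images as opaque objects

complete = make_pairs(imgs, scene_graph='complete', symmetrize=False)      # all 15 unordered pairs
swin = make_pairs(imgs, scene_graph='swin-2-noncyclic', symmetrize=False)  # (i, i+1) and (i, i+2)
logwin = make_pairs(imgs, scene_graph='logwin-2', symmetrize=False)        # cyclic offsets 1 and 2
oneref = make_pairs(imgs, scene_graph='oneref-3', symmetrize=False)        # image 3 vs all others
print(len(complete), len(swin), len(logwin), len(oneref))  # 15 9 12 5
# 'retrieval-Na-k' additionally requires sim_mat (see mast3r/retrieval/graph.py)
```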
mast3r/losses.py CHANGED
@@ -273,7 +273,7 @@ class InfoNCE(MatchingCriterion):
 
 
 class APLoss (MatchingCriterion):
-    """ AP loss.
+    """ AP loss
     """
 
     def __init__(self, nq='torch', min=0, max=1, euc=False, **kw):
mast3r/retrieval/graph.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # Building the graph based on retrieval results.
+ # --------------------------------------------------------
+ import numpy as np
+
+
+ def farthest_point_sampling(dist, N=None, dist_thresh=None):
+     """Farthest point sampling.
+
+     Args:
+         dist: NxN distance matrix.
+         N: Number of points to sample.
+         dist_thresh: Distance threshold. Point sampling terminates once the
+             maximum distance is below this threshold.
+
+     Returns:
+         indices: Indices of the sampled points.
+         distances: Distance of each sampled point to the previously sampled set.
+     """
+
+     assert N is not None or dist_thresh is not None, "Either N or dist_thresh must be provided."
+
+     if N is None:
+         N = dist.shape[0]
+
+     indices = []
+     distances = [0]
+     indices.append(np.random.choice(dist.shape[0]))
+     for i in range(1, N):
+         d = dist[indices].min(axis=0)
+         bst = d.argmax()
+         bst_dist = d[bst]
+         if dist_thresh is not None and bst_dist < dist_thresh:
+             break
+         indices.append(bst)
+         distances.append(bst_dist)
+     return np.array(indices), np.array(distances)
+
+
+ def make_pairs_fps(sim_mat, Na=20, tokK=1, dist_thresh=None):
+     dist_mat = 1 - sim_mat
+
+     pairs = set()
+     keyimgs_idx = np.array([])
+     if Na != 0:
+         keyimgs_idx, _ = farthest_point_sampling(dist_mat, N=Na, dist_thresh=dist_thresh)
+
+     # 1. Complete graph between key images
+     for i in range(len(keyimgs_idx)):
+         for j in range(i + 1, len(keyimgs_idx)):
+             idx_i, idx_j = keyimgs_idx[i], keyimgs_idx[j]
+             pairs.add((idx_i, idx_j))
+
+     # 2. Connect non-key images to the nearest key image
+     keyimg_dist_mat = dist_mat[:, keyimgs_idx]
+     for i in range(keyimg_dist_mat.shape[0]):
+         if i in keyimgs_idx:
+             continue
+         j = keyimg_dist_mat[i].argmin()
+         i1, i2 = min(i, keyimgs_idx[j]), max(i, keyimgs_idx[j])
+         if i1 != i2 and (i1, i2) not in pairs:
+             pairs.add((i1, i2))
+
+     # 3. Add some local connections (k-NN) for each view
+     if tokK > 0:
+         for i in range(dist_mat.shape[0]):
+             idx = dist_mat[i].argsort()[:tokK]
+             for j in idx:
+                 i1, i2 = min(i, j), max(i, j)
+                 if i1 != i2 and (i1, i2) not in pairs:
+                     pairs.add((i1, i2))
+
+     pairs = list(pairs)
+
+     return pairs, keyimgs_idx
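
A quick way to sanity-check the graph construction is to feed `make_pairs_fps` a synthetic similarity matrix; the sizes below are arbitrary assumptions, not values used anywhere in the code.

```python
# Sanity-check sketch for make_pairs_fps on a synthetic similarity matrix.
import numpy as np
from mast3r.retrieval.graph import make_pairs_fps

rng = np.random.default_rng(0)
feats = rng.normal(size=(30, 8))
feats /= np.linalg.norm(feats, axis=1, keepdims=True)
sim_mat = feats @ feats.T  # symmetric, self-similarity on the diagonal

# 5 anchor images picked by farthest point sampling + 1 k-NN edge per view
pairs, anchors = make_pairs_fps(sim_mat, Na=5, tokK=1, dist_thresh=None)
print(len(anchors), len(pairs))  # 5 anchors; anchor clique + nearest-anchor + k-NN edges
```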
mast3r/retrieval/model.py ADDED
@@ -0,0 +1,271 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # Whitener and RetrievalModel
+ # --------------------------------------------------------
+ import numpy as np
+ from tqdm import tqdm
+ import time
+
+ import torch
+ import torch.nn as nn
+
+ import mast3r.utils.path_to_dust3r  # noqa
+ from dust3r.utils.image import load_images
+
+ default_device = torch.device('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu')
+
+
+ # from https://github.com/gtolias/how/blob/4d73c88e0ffb55506e2ce6249e2a015ef6ccf79f/how/utils/whitening.py#L20
+ def pcawhitenlearn_shrinkage(X, s=1.0):
+     """Learn PCA whitening with shrinkage from given descriptors"""
+     N = X.shape[0]
+
+     # Learning PCA w/o annotations
+     m = X.mean(axis=0, keepdims=True)
+     Xc = X - m
+     Xcov = np.dot(Xc.T, Xc)
+     Xcov = (Xcov + Xcov.T) / (2 * N)
+     eigval, eigvec = np.linalg.eig(Xcov)
+     order = eigval.argsort()[::-1]
+     eigval = eigval[order]
+     eigvec = eigvec[:, order]
+
+     eigval = np.clip(eigval, a_min=1e-14, a_max=None)
+     P = np.dot(np.linalg.inv(np.diag(np.power(eigval, 0.5 * s))), eigvec.T)
+
+     return m, P.T
+
+
+ class Dust3rInputFromImageList(torch.utils.data.Dataset):
+     def __init__(self, image_list, imsize=512):
+         super().__init__()
+         self.image_list = image_list
+         assert imsize == 512
+         self.imsize = imsize
+
+     def __len__(self):
+         return len(self.image_list)
+
+     def __getitem__(self, index):
+         return load_images([self.image_list[index]], size=self.imsize, verbose=False)[0]
+
+
+ class Whitener(nn.Module):
+     def __init__(self, dim, l2norm=None):
+         super().__init__()
+         self.m = torch.nn.Parameter(torch.zeros((1, dim)).double())
+         self.p = torch.nn.Parameter(torch.eye(dim, dim).double())
+         self.l2norm = l2norm  # if not None, apply l2 norm along a given dimension
+
+     def forward(self, x):
+         with torch.autocast(self.m.device.type, enabled=False):
+             shape = x.size()
+             input_type = x.dtype
+             x_reshaped = x.view(-1, shape[-1]).to(dtype=self.m.dtype)
+             # Center the input data
+             x_centered = x_reshaped - self.m
+             # Apply PCA transformation
+             pca_output = torch.matmul(x_centered, self.p)
+             # reshape back
+             pca_output_shape = shape  # list(shape[:-1]) + [shape[-1]]
+             pca_output = pca_output.view(pca_output_shape)
+             if self.l2norm is not None:
+                 return torch.nn.functional.normalize(pca_output, dim=self.l2norm).to(dtype=input_type)
+             return pca_output.to(dtype=input_type)
+
+
+ def weighted_spoc(feat, attn):
+     """
+     feat: BxNxC
+     attn: BxN
+     output: BxC L2-normalization weighted-sum-pooling of features
+     """
+     return torch.nn.functional.normalize((feat * attn[:, :, None]).sum(dim=1), dim=1)
+
+
+ def how_select_local(feat, attn, nfeat):
+     """
+     feat: BxNxC
+     attn: BxN
+     nfeat: nfeat to keep
+     """
+     # get nfeat
+     if nfeat < 0:
+         assert nfeat >= -1.0
+         nfeat = int(-nfeat * feat.size(1))
+     else:
+         nfeat = int(nfeat)
+     # sort by attention and keep the top nfeat features
+     topk_attn, topk_indices = torch.topk(attn, min(nfeat, attn.size(1)), dim=1)
+     topk_indices_expanded = topk_indices.unsqueeze(-1).expand(-1, -1, feat.size(2))
+     topk_features = torch.gather(feat, 1, topk_indices_expanded)
+     return topk_features, topk_attn, topk_indices
+
+
+ class RetrievalModel(nn.Module):
+     def __init__(self, backbone, freeze_backbone=1, prewhiten=None, hdims=[1024], residual=False, postwhiten=None,
+                  featweights='l2norm', nfeat=300, pretrained_retrieval=None):
+         super().__init__()
+         self.backbone = backbone
+         self.freeze_backbone = freeze_backbone
+         if freeze_backbone:
+             for p in self.backbone.parameters():
+                 p.requires_grad = False
+         self.backbone_dim = backbone.enc_embed_dim
+         self.prewhiten = nn.Identity() if prewhiten is None else Whitener(self.backbone_dim)
+         self.prewhiten_freq = prewhiten
+         if prewhiten is not None and prewhiten != -1:
+             for p in self.prewhiten.parameters():
+                 p.requires_grad = False
+         self.residual = residual
+         self.projector = self.build_projector(hdims, residual)
+         self.dim = hdims[-1] if len(hdims) > 0 else self.backbone_dim
+         self.postwhiten_freq = postwhiten
+         self.postwhiten = nn.Identity() if postwhiten is None else Whitener(self.dim)
+         if postwhiten is not None and postwhiten != -1:
+             assert len(hdims) > 0
+             for p in self.postwhiten.parameters():
+                 p.requires_grad = False
+         self.featweights = featweights
+         if featweights == 'l2norm':
+             self.attention = lambda x: x.norm(dim=-1)
+         else:
+             raise NotImplementedError(featweights)
+         self.nfeat = nfeat
+         self.pretrained_retrieval = pretrained_retrieval
+         if self.pretrained_retrieval is not None:
+             ckpt = torch.load(pretrained_retrieval, 'cpu')
+             msg = self.load_state_dict(ckpt['model'], strict=False)
+             assert len(msg.unexpected_keys) == 0 and all(k.startswith('backbone')
+                                                          or k.startswith('postwhiten') for k in msg.missing_keys)
+
+     def build_projector(self, hdims, residual):
+         if self.residual:
+             assert hdims[-1] == self.backbone_dim
+         d = self.backbone_dim
+         if len(hdims) == 0:
+             return nn.Identity()
+         layers = []
+         for i in range(len(hdims) - 1):
+             layers.append(nn.Linear(d, hdims[i]))
+             d = hdims[i]
+             layers.append(nn.LayerNorm(d))
+             layers.append(nn.GELU())
+         layers.append(nn.Linear(d, hdims[-1]))
+         return nn.Sequential(*layers)
+
+     def state_dict(self, *args, destination=None, prefix='', keep_vars=False):
+         ss = super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
+         if self.freeze_backbone:
+             ss = {k: v for k, v in ss.items() if not k.startswith('backbone')}
+         return ss
+
+     def reinitialize_whitening(self, epoch, train_dataset, nimgs=5000, log_writer=None, max_nfeat_per_image=None, seed=0, device=default_device):
+         do_prewhiten = self.prewhiten_freq is not None and self.pretrained_retrieval is None and \
+             (epoch == 0 or (self.prewhiten_freq > 0 and epoch % self.prewhiten_freq == 0))
+         do_postwhiten = self.postwhiten_freq is not None and ((epoch == 0 and self.postwhiten_freq in [0, -1])
+                                                               or (self.postwhiten_freq > 0 and
+                                                                   epoch % self.postwhiten_freq == 0 and epoch > 0))
+         if do_prewhiten or do_postwhiten:
+             self.eval()
+             imdataset = train_dataset.imlist_dataset_n_images(nimgs, seed)
+             loader = torch.utils.data.DataLoader(imdataset, batch_size=1, shuffle=False, num_workers=8, pin_memory=True)
+             if do_prewhiten:
+                 print('Re-initialization of pre-whitening')
+                 t = time.time()
+                 with torch.no_grad():
+                     features = []
+                     for d in tqdm(loader):
+                         feat = self.backbone._encode_image(d['img'][0, ...].to(device),
+                                                            true_shape=d['true_shape'][0, ...])[0]
+                         feat = feat.flatten(0, 1)
+                         if max_nfeat_per_image is not None and max_nfeat_per_image < feat.size(0):
+                             l2norms = torch.linalg.vector_norm(feat, dim=1)
+                             feat = feat[torch.argsort(-l2norms)[:max_nfeat_per_image], :]
+                         features.append(feat.cpu())
+                     features = torch.cat(features, dim=0)
+                     features = features.numpy()
+                 m, P = pcawhitenlearn_shrinkage(features)
+                 self.prewhiten.load_state_dict({'m': torch.from_numpy(m), 'p': torch.from_numpy(P)})
+                 prewhiten_time = time.time() - t
+                 print(f'Done in {prewhiten_time:.1f} seconds')
+                 if log_writer is not None:
+                     log_writer.add_scalar('time/prewhiten', prewhiten_time, epoch)
+             if do_postwhiten:
+                 print('Re-initialization of post-whitening')
+                 t = time.time()
+                 with torch.no_grad():
+                     features = []
+                     for d in tqdm(loader):
+                         backbone_feat = self.backbone._encode_image(d['img'][0, ...].to(device),
+                                                                     true_shape=d['true_shape'][0, ...])[0]
+                         backbone_feat_prewhitened = self.prewhiten(backbone_feat)
+                         proj_feat = self.projector(backbone_feat_prewhitened) + \
+                             (0.0 if not self.residual else backbone_feat_prewhitened)
+                         proj_feat = proj_feat.flatten(0, 1)
+                         if max_nfeat_per_image is not None and max_nfeat_per_image < proj_feat.size(0):
+                             l2norms = torch.linalg.vector_norm(proj_feat, dim=1)
+                             proj_feat = proj_feat[torch.argsort(-l2norms)[:max_nfeat_per_image], :]
+                         features.append(proj_feat.cpu())
+                     features = torch.cat(features, dim=0)
+                     features = features.numpy()
+                 m, P = pcawhitenlearn_shrinkage(features)
+                 self.postwhiten.load_state_dict({'m': torch.from_numpy(m), 'p': torch.from_numpy(P)})
+                 postwhiten_time = time.time() - t
+                 print(f'Done in {postwhiten_time:.1f} seconds')
+                 if log_writer is not None:
+                     log_writer.add_scalar('time/postwhiten', postwhiten_time, epoch)
+
+     def extract_features_and_attention(self, x):
+         backbone_feat = self.backbone._encode_image(x['img'], true_shape=x['true_shape'])[0]
+         backbone_feat_prewhitened = self.prewhiten(backbone_feat)
+         proj_feat = self.projector(backbone_feat_prewhitened) + \
+             (0.0 if not self.residual else backbone_feat_prewhitened)
+         attention = self.attention(proj_feat)
+         proj_feat_whitened = self.postwhiten(proj_feat)
+         return proj_feat_whitened, attention
+
+     def forward_local(self, x):
+         feat, attn = self.extract_features_and_attention(x)
+         return how_select_local(feat, attn, self.nfeat)
+
+     def forward_global(self, x):
+         feat, attn = self.extract_features_and_attention(x)
+         return weighted_spoc(feat, attn)
+
+     def forward(self, x):
+         return self.forward_global(x)
+
+
+ def identity(x):  # to avoid Can't pickle local object 'extract_local_features.<locals>.<lambda>'
+     return x
+
+
+ @torch.no_grad()
+ def extract_local_features(model, images, imsize, seed=0, tocpu=False, max_nfeat_per_image=None,
+                            max_nfeat_per_image2=None, device=default_device):
+     model.eval()
+     imdataset = Dust3rInputFromImageList(images, imsize=imsize) if isinstance(images, list) else images
+     loader = torch.utils.data.DataLoader(imdataset, batch_size=1, shuffle=False,
+                                          num_workers=8, pin_memory=True, collate_fn=identity)
+     features = []
+     imids = []
+     for i, d in enumerate(tqdm(loader)):
+         dd = d[0]
+         dd['img'] = dd['img'].to(device, non_blocking=True)
+         feat, _, _ = model.forward_local(dd)
+         feat = feat.flatten(0, 1)
+         if max_nfeat_per_image is not None and feat.size(0) > max_nfeat_per_image:
+             feat = feat[torch.randperm(feat.size(0))[:max_nfeat_per_image], :]
+         if max_nfeat_per_image2 is not None and feat.size(0) > max_nfeat_per_image2:
+             feat = feat[:max_nfeat_per_image2, :]
+         features.append(feat)
+         if tocpu:
+             features[-1] = features[-1].cpu()
+         imids.append(i * torch.ones_like(features[-1][:, 0]).to(dtype=torch.int64))
+     features = torch.cat(features, dim=0)
+     imids = torch.cat(imids, dim=0)
+     return features, imids
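
The whitening round-trip can be checked in isolation: learn `(m, P)` on random descriptors with `pcawhitenlearn_shrinkage` and load them into a `Whitener`, exactly as `reinitialize_whitening` does above. The dimensions below are arbitrary.

```python
# Round-trip sketch: learned PCA whitening loaded into a Whitener (arbitrary dimensions).
import numpy as np
import torch
from mast3r.retrieval.model import Whitener, pcawhitenlearn_shrinkage

X = np.random.randn(1000, 64)
m, P = pcawhitenlearn_shrinkage(X, s=1.0)  # m: (1, 64) mean, P: (64, 64) projection

whitener = Whitener(64, l2norm=-1)  # l2-normalize along the last dimension
whitener.load_state_dict({'m': torch.from_numpy(m), 'p': torch.from_numpy(P)})

out = whitener(torch.from_numpy(X[:5]).float())
print(out.shape, out.norm(dim=-1))  # torch.Size([5, 64]), norms ~1
```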
mast3r/retrieval/processor.py ADDED
@@ -0,0 +1,129 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # Main Retriever class
+ # --------------------------------------------------------
+ import os
+ import argparse
+ import numpy as np
+ import torch
+
+ from mast3r.model import AsymmetricMASt3R
+ from mast3r.retrieval.model import RetrievalModel, extract_local_features
+
+ try:
+     import faiss
+     faiss.StandardGpuResources()  # when loading the checkpoint, it will try to instantiate FaissGpuL2Index
+ except AttributeError:
+     import asmk.index
+
+     class FaissCpuL2Index(asmk.index.FaissL2Index):
+         def __init__(self, gpu_id):
+             super().__init__()
+             self.gpu_id = gpu_id
+
+         def _faiss_index_flat(self, dim):
+             """Return initialized faiss.IndexFlatL2"""
+             return faiss.IndexFlatL2(dim)
+
+     asmk.index.FaissGpuL2Index = FaissCpuL2Index
+
+ from asmk import asmk_method  # noqa
+
+
+ def get_args_parser():
+     parser = argparse.ArgumentParser('Retrieval scores from a set of images', add_help=False, allow_abbrev=False)
+     parser.add_argument('--model', type=str, required=True,
+                         help="shortname of a retrieval model or path to the corresponding .pth")
+     parser.add_argument('--input', type=str, required=True,
+                         help="directory containing images or a file containing a list of image paths")
+     parser.add_argument('--outfile', type=str, required=True, help="numpy file where to store the matrix score")
+     return parser
+
+
+ def get_impaths(imlistfile):
+     with open(imlistfile, 'r') as fid:
+         impaths = [f for f in fid.read().splitlines() if not f.startswith('#')
+                    and len(f) > 0]  # ignore comments and empty lines
+     return impaths
+
+
+ def get_impaths_from_imdir(imdir, extensions=['png', 'jpg', 'PNG', 'JPG']):
+     assert os.path.isdir(imdir)
+     impaths = [os.path.join(imdir, f) for f in sorted(os.listdir(imdir)) if any(f.endswith(ext) for ext in extensions)]
+     return impaths
+
+
+ def get_impaths_from_imdir_or_imlistfile(input_imdir_or_imlistfile):
+     if os.path.isfile(input_imdir_or_imlistfile):
+         return get_impaths(input_imdir_or_imlistfile)
+     else:
+         return get_impaths_from_imdir(input_imdir_or_imlistfile)
+
+
+ class Retriever(object):
+     def __init__(self, modelname, backbone=None, device='cuda'):
+         # load the model
+         assert os.path.isfile(modelname), modelname
+         print(f'Loading retrieval model from {modelname}')
+         ckpt = torch.load(modelname, 'cpu')  # TODO from pretrained to download it automatically
+         ckpt_args = ckpt['args']
+         if backbone is None:
+             backbone = AsymmetricMASt3R.from_pretrained(ckpt_args.pretrained)
+         self.model = RetrievalModel(
+             backbone, freeze_backbone=ckpt_args.freeze_backbone, prewhiten=ckpt_args.prewhiten,
+             hdims=list(map(int, ckpt_args.hdims.split('_'))) if len(ckpt_args.hdims) > 0 else [],
+             residual=getattr(ckpt_args, 'residual', False), postwhiten=ckpt_args.postwhiten,
+             featweights=ckpt_args.featweights, nfeat=ckpt_args.nfeat
+         ).to(device)
+         self.device = device
+         msg = self.model.load_state_dict(ckpt['model'], strict=False)
+         assert all(k.startswith('backbone') for k in msg.missing_keys)
+         assert len(msg.unexpected_keys) == 0
+         self.imsize = ckpt_args.imsize
+
+         # load the asmk codebook
+         dname, bname = os.path.split(modelname)  # TODO they should both be in the same file ?
+         bname_splits = bname.split('_')
+         cache_codebook_fname = os.path.join(dname, '_'.join(bname_splits[:-1]) + '_codebook.pkl')
+         assert os.path.isfile(cache_codebook_fname), cache_codebook_fname
+         asmk_params = {'index': {'gpu_id': 0}, 'train_codebook': {'codebook': {'size': '64k'}},
+                        'build_ivf': {'kernel': {'binary': True}, 'ivf': {'use_idf': False},
+                                      'quantize': {'multiple_assignment': 1}, 'aggregate': {}},
+                        'query_ivf': {'quantize': {'multiple_assignment': 5}, 'aggregate': {},
+                                      'search': {'topk': None},
+                                      'similarity': {'similarity_threshold': 0.0, 'alpha': 3.0}}}
+         asmk_params['train_codebook']['codebook']['size'] = ckpt_args.nclusters
+         self.asmk = asmk_method.ASMKMethod.initialize_untrained(asmk_params)
+         self.asmk = self.asmk.train_codebook(None, cache_path=cache_codebook_fname)
+
+     def __call__(self, input_imdir_or_imlistfile, outfile=None):
+         # get impaths
+         if isinstance(input_imdir_or_imlistfile, str):
+             impaths = get_impaths_from_imdir_or_imlistfile(input_imdir_or_imlistfile)
+         else:
+             impaths = input_imdir_or_imlistfile  # we're assuming a list has been passed
+         print(f'Found {len(impaths)} images')
+
+         # build the database
+         feat, ids = extract_local_features(self.model, impaths, self.imsize, tocpu=True, device=self.device)
+         feat = feat.cpu().numpy()
+         ids = ids.cpu().numpy()
+         asmk_dataset = self.asmk.build_ivf(feat, ids)
+
+         # we actually retrieve the same set of images
+         metadata, query_ids, ranks, ranked_scores = asmk_dataset.query_ivf(feat, ids)
+
+         # query_ivf returns scores ordered by rank, so scatter them back to the original image order
+         scores = np.empty_like(ranked_scores)
+         scores[np.arange(ranked_scores.shape[0])[:, None], ranks] = ranked_scores
+
+         # save
+         if outfile is not None:
+             outdir = os.path.dirname(outfile)
+             if outdir:
+                 os.makedirs(outdir, exist_ok=True)
+             np.save(outfile, scores)
+             print(f'Scores matrix saved in {outfile}')
+         return scores
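
Putting the retriever and the pairing code together, a typical call looks like the sketch below. The checkpoint path is a placeholder (see the TODO above about uploading the retrieval model), the image folder is illustrative, and a matching `*_codebook.pkl` is expected next to the `.pth` per the naming convention in `__init__`.

```python
# Sketch: retrieval-based scene graph (checkpoint path is a placeholder).
import glob
import torch

from mast3r.model import AsymmetricMASt3R
from mast3r.retrieval.processor import Retriever
from mast3r.image_pairs import make_pairs

import mast3r.utils.path_to_dust3r  # noqa
from dust3r.utils.image import load_images

device = 'cuda'
backbone = AsymmetricMASt3R.from_pretrained(
    'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric').to(device)
# expects 'checkpoints/..._codebook.pkl' next to the .pth (see __init__ above)
retriever = Retriever('checkpoints/retrieval_model.pth', backbone=backbone, device=device)

filelist = sorted(glob.glob('data/scene/*.jpg'))  # illustrative folder
with torch.no_grad():
    sim_matrix = retriever(filelist)  # NxN score matrix

imgs = load_images(filelist, size=512)
# 'retrieval-Na-k'; Na (number of anchors) should not exceed the number of images
pairs = make_pairs(imgs, scene_graph='retrieval-20-1', symmetrize=True, sim_mat=sim_matrix)
```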