jerome-revaud committed
Commit 83ae704 · 0 Parent(s)

Initial commit

Files changed (48)
  1. .gitignore +129 -0
  2. .gitmodules +4 -0
  3. CHECKPOINTS_NOTICE +1376 -0
  4. LICENSE +7 -0
  5. NOTICE +103 -0
  6. README.md +316 -0
  7. assets/NLE_tower/01D90321-69C8-439F-B0B0-E87E7634741C-83120-000041DAE419D7AE.jpg +0 -0
  8. assets/NLE_tower/1AD85EF5-B651-4291-A5C0-7BDB7D966384-83120-000041DADF639E09.jpg +0 -0
  9. assets/NLE_tower/2679C386-1DC0-4443-81B5-93D7EDE4AB37-83120-000041DADB2EA917.jpg +0 -0
  10. assets/NLE_tower/28EDBB63-B9F9-42FB-AC86-4852A33ED71B-83120-000041DAF22407A1.jpg +0 -0
  11. assets/NLE_tower/91E9B685-7A7D-42D7-B933-23A800EE4129-83120-000041DAE12C8176.jpg +0 -0
  12. assets/NLE_tower/CDBBD885-54C3-4EB4-9181-226059A60EE0-83120-000041DAE0C3D612.jpg +0 -0
  13. assets/NLE_tower/FF5599FD-768B-431A-AB83-BDA5FB44CB9D-83120-000041DADDE35483.jpg +0 -0
  14. assets/demo.jpg +0 -0
  15. assets/examples.jpg +0 -0
  16. assets/mast3r.jpg +0 -0
  17. assets/mast3r_archi.jpg +0 -0
  18. assets/matching.jpg +0 -0
  19. demo.py +297 -0
  20. demo_dust3r_ga.py +64 -0
  21. dust3r +1 -0
  22. mast3r/__init__.py +2 -0
  23. mast3r/catmlp_dpt_head.py +123 -0
  24. mast3r/cloud_opt/__init__.py +2 -0
  25. mast3r/cloud_opt/sparse_ga.py +1001 -0
  26. mast3r/cloud_opt/triangulation.py +80 -0
  27. mast3r/cloud_opt/tsdf_optimizer.py +273 -0
  28. mast3r/cloud_opt/utils/__init__.py +2 -0
  29. mast3r/cloud_opt/utils/losses.py +32 -0
  30. mast3r/cloud_opt/utils/schedules.py +17 -0
  31. mast3r/colmap/__init__.py +2 -0
  32. mast3r/colmap/database.py +383 -0
  33. mast3r/datasets/__init__.py +62 -0
  34. mast3r/datasets/base/__init__.py +2 -0
  35. mast3r/datasets/base/mast3r_base_stereo_view_dataset.py +355 -0
  36. mast3r/datasets/utils/__init__.py +2 -0
  37. mast3r/datasets/utils/cropping.py +219 -0
  38. mast3r/fast_nn.py +221 -0
  39. mast3r/losses.py +514 -0
  40. mast3r/model.py +68 -0
  41. mast3r/utils/__init__.py +2 -0
  42. mast3r/utils/coarse_to_fine.py +214 -0
  43. mast3r/utils/collate.py +62 -0
  44. mast3r/utils/misc.py +17 -0
  45. mast3r/utils/path_to_dust3r.py +19 -0
  46. requirements.txt +1 -0
  47. train.py +48 -0
  48. visloc.py +538 -0
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
.gitmodules ADDED
@@ -0,0 +1,4 @@
+ [submodule "dust3r"]
+ path = dust3r
+ url = https://github.com/naver/dust3r
+ branch = cvpr
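
Note that dust3r (entry 21 in the file list above) is not vendored but pulled in through the .gitmodules entry just shown, so a plain clone leaves the dust3r/ directory empty. Below is a minimal sketch of fetching it; the top-level repository URL is an assumption, since this commit only records the submodule's URL and branch:

```python
# Minimal sketch: clone the repository and initialize the dust3r submodule
# declared in .gitmodules above. REPO_URL is an assumption -- this commit
# only records the submodule's url (https://github.com/naver/dust3r) and
# its branch (cvpr), not the address of the top-level repository itself.
import subprocess

REPO_URL = "https://github.com/naver/mast3r"  # assumed top-level repo URL

# --recursive initializes and checks out every submodule during the clone
subprocess.run(["git", "clone", "--recursive", REPO_URL], check=True)

# Equivalent fix-up for a checkout that was cloned without --recursive
# (a no-op if the submodule is already populated):
subprocess.run(["git", "submodule", "update", "--init"], cwd="mast3r", check=True)
```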
CHECKPOINTS_NOTICE ADDED
@@ -0,0 +1,1376 @@
+ MASt3R
+ Copyright 2024-present NAVER Corp.
+
+ This project's checkpoints were trained on datasets with separate license terms.
+ Your use of these checkpoints is subject to the terms and conditions of the following licenses.
+
+ ===
+ pretrained model:
+ DUSt3R: DUSt3R_ViTLarge_BaseDecoder_512_dpt
+ https://github.com/naver/dust3r
+
+ In particular, from the croco training set:
+
+ 3D_Street_View
+ https://github.com/amir32002/3D_Street_View/blob/master/LICENSE
+ This dataset is made freely available to academic and non-academic entities for non-commercial purposes such as academic research, teaching, scientific publications, or personal experimentation. Permission is granted to use the data given that you agree:
+
+ 1. That the dataset comes "AS IS", without express or implied warranty. Although every effort has been made to ensure accuracy, we do not accept any responsibility for errors or omissions.
+
+ 2. That you include a reference to the Dataset in any work that makes use of the dataset. For research papers, cite our publication as listed on our website.
+
+ 3. That you do not distribute this dataset or modified versions. It is permissible to distribute derivative works in as far as they are abstract representations of this dataset (such as models trained on it or additional annotations that do not directly include any of our data) and do not allow to recover the dataset or something similar in character.
+
+ 4. That you may not use the dataset or any derivative work for commercial purposes as, for example, licensing or selling the data, or using the data with a purpose to procure a commercial gain.
+ That all rights not expressly granted to you are reserved by us.
+
+ In addition, using the dataset is subject to the following standard terms:
+
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ Indoor Visual Localization datasets (IndoorVL)
+ https://challenge.naverlabs.com/kapture/GangnamStation_LICENSE.txt
+ https://challenge.naverlabs.com/kapture/HyundaiDepartmentStore_LICENSE.txt
+
+ LICENSE.txt
+ Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 (modified ver.)
+ International Public License
+
+ By exercising the Licensed Rights (defined below), You accept and agree
+ to be bound by the terms and conditions of this Creative Commons
+ Attribution-NonCommercial-NoDerivatives 4.0 International Public
+ License ("Public License"). To the extent this Public License may be
+ interpreted as a contract, You are granted the Licensed Rights in
+ consideration of Your acceptance of these terms and conditions, and the
+ Licensor grants You such rights in consideration of benefits the
+ Licensor receives from making the Licensed Material available under
+ these terms and conditions.
+
+
+ Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+
+ c. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ d. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ e. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ f. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ g. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ h. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ i. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ j. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ k. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+ l. Research purpose means to publish research achievements in a research paper
+
+
+ Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce and reproduce, but not Share, Adapted Material
+ for NonCommercial purposes only.
+
+ c. reproduce and Share the Adapted Material, in part,
+ for Research purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+
+ Section 3 -- License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to the
+ following conditions.
+
+ a. Attribution.
+
214
+ You must:
215
+
216
+ a. retain the following if it is supplied by the Licensor
217
+ with the Licensed Material:
218
+
219
+ i. identification of the creator(s) of the Licensed
220
+ Material and any others designated to receive
221
+ attribution, in any reasonable manner requested by
222
+ the Licensor (including by pseudonym if
223
+ designated);
224
+
225
+ ii. a copyright notice;
226
+
227
+ iii. a notice that refers to this Public License;
228
+
229
+ iv. a notice that refers to the disclaimer of
230
+ warranties;
231
+
232
+ v. a URI or hyperlink to the Licensed Material to the
233
+ extent reasonably practicable;
234
+
235
+ b. indicate if You modified the Licensed Material and
236
+ retain an indication of any previous modifications; and
237
+
238
+ c. indicate the Licensed Material is licensed under this
239
+ Public License, and include the text of, or the URI or
240
+ hyperlink to, this Public License.
241
+
242
+ For the avoidance of doubt, You do not have permission under
243
+ this Public License to Share Adapted Material.
244
+
245
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
246
+ reasonable manner based on the medium, means, and context in
247
+ which You Share the Licensed Material. For example, it may be
248
+ reasonable to satisfy the conditions by providing a URI or
249
+ hyperlink to a resource that includes the required
250
+ information.
251
+
252
+ 3. If requested by the Licensor, You must remove any of the
253
+ information required by Section 3(a)(1)(A) to the extent
254
+ reasonably practicable.
255
+
256
+
257
+ Section 4 -- Sui Generis Database Rights.
258
+
259
+ Where the Licensed Rights include Sui Generis Database Rights that
260
+ apply to Your use of the Licensed Material:
261
+
262
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
263
+ to extract, reuse, reproduce, and Share all or a substantial
264
+ portion of the contents of the database for NonCommercial purposes
265
+ only and provided You do not Share Adapted Material;
266
+
267
+ b. if You include all or a substantial portion of the database
268
+ contents in a database in which You have Sui Generis Database
269
+ Rights, then the database in which You have Sui Generis Database
270
+ Rights (but not its individual contents) is Adapted Material; and
271
+
272
+ c. You must comply with the conditions in Section 3(a) if You Share
273
+ all or a substantial portion of the contents of the database.
274
+
275
+ For the avoidance of doubt, this Section 4 supplements and does not
276
+ replace Your obligations under this Public License where the Licensed
277
+ Rights include other Copyright and Similar Rights.
278
+
279
+
280
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
281
+
282
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
283
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
284
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
285
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
286
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
287
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
288
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
289
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
290
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
291
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
292
+
293
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
294
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
295
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
296
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
297
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
298
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
299
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
300
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
301
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
302
+
303
+ c. The disclaimer of warranties and limitation of liability provided
304
+ above shall be interpreted in a manner that, to the extent
305
+ possible, most closely approximates an absolute disclaimer and
306
+ waiver of all liability.
307
+
308
+
309
+ Section 6 -- Term and Termination.
310
+
311
+ a. This Public License applies for the term of the Copyright and
312
+ Similar Rights licensed here. However, if You fail to comply with
313
+ this Public License, then Your rights under this Public License
314
+ terminate automatically.
315
+
316
+ b. Where Your right to use the Licensed Material has terminated under
317
+ Section 6(a), it reinstates:
318
+
319
+ 1. automatically as of the date the violation is cured, provided
320
+ it is cured within 30 days of Your discovery of the
321
+ violation; or
322
+
323
+ 2. upon express reinstatement by the Licensor.
324
+
325
+ For the avoidance of doubt, this Section 6(b) does not affect any
326
+ right the Licensor may have to seek remedies for Your violations
327
+ of this Public License.
328
+
329
+ c. For the avoidance of doubt, the Licensor may also offer the
330
+ Licensed Material under separate terms or conditions or stop
331
+ distributing the Licensed Material at any time; however, doing so
332
+ will not terminate this Public License.
333
+
334
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
335
+ License.
336
+
337
+
338
+ Section 7 -- Other Terms and Conditions.
339
+
340
+ a. The Licensor shall not be bound by any additional or different
341
+ terms or conditions communicated by You unless expressly agreed.
342
+
343
+ b. Any arrangements, understandings, or agreements regarding the
344
+ Licensed Material not stated herein are separate from and
345
+ independent of the terms and conditions of this Public License.
346
+
347
+
348
+ Section 8 -- Interpretation.
349
+
350
+ a. For the avoidance of doubt, this Public License does not, and
351
+ shall not be interpreted to, reduce, limit, restrict, or impose
352
+ conditions on any use of the Licensed Material that could lawfully
353
+ be made without permission under this Public License.
354
+
355
+ b. To the extent possible, if any provision of this Public License is
356
+ deemed unenforceable, it shall be automatically reformed to the
357
+ minimum extent necessary to make it enforceable. If the provision
358
+ cannot be reformed, it shall be severed from this Public License
359
+ without affecting the enforceability of the remaining terms and
360
+ conditions.
361
+
362
+ c. No term or condition of this Public License will be waived and no
363
+ failure to comply consented to unless expressly agreed to by the
364
+ Licensor.
365
+
366
+ d. Nothing in this Public License constitutes or may be interpreted
367
+ as a limitation upon, or waiver of, any privileges and immunities
368
+ that apply to the Licensor or You, including from the legal
369
+ processes of any jurisdiction or authority.
370
+
+ ===
+ CO3Dv2
+
+ Creative Commons Attribution-NonCommercial 4.0 International Public
+ License
+
+ By exercising the Licensed Rights (defined below), You accept and agree
+ to be bound by the terms and conditions of this Creative Commons
+ Attribution-NonCommercial 4.0 International Public License ("Public
+ License"). To the extent this Public License may be interpreted as a
+ contract, You are granted the Licensed Rights in consideration of Your
+ acceptance of these terms and conditions, and the Licensor grants You
+ such rights in consideration of benefits the Licensor receives from
+ making the Licensed Material available under these terms and
+ conditions.
+
+ Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Adapter's License means the license You apply to Your Copyright
+ and Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+
+ c. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+ d. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ f. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ g. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ h. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ i. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ j. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ k. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ l. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+ Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+ Section 3 -- License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to the
+ following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's
+ License You apply must not prevent recipients of the Adapted
+ Material from complying with this Public License.
+
+ Section 4 -- Sui Generis Database Rights.
+
+ Where the Licensed Rights include Sui Generis Database Rights that
+ apply to Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database for NonCommercial purposes
+ only;
+
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material; and
+
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the Licensed
+ Rights include other Copyright and Similar Rights.
+
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+
+ Section 6 -- Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+ Section 7 -- Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+ Section 8 -- Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+
698
+ ARKitScenes
699
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0: https://creativecommons.org/licenses/by-nc-sa/4.0/
700
+
701
+ ===
702
+ ScanNet++
703
+ https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf
704
+
705
+ ===
706
+ BlendedMVS
707
+ Creative Commons Attribution 4.0 International: http://creativecommons.org/licenses/by/4.0/
708
+
709
+ ===
710
+ Habitat-Sim
711
+ HM3D
712
+ https://matterport.com/fr/legal/matterport-end-user-license-agreement-academic-use-model-data
713
+
714
+ ScanNet
715
+ https://kaldir.vc.in.tum.de/scannet/ScanNet_TOS.pdf
716
+
717
+ Replica
718
+ Before Facebook Technologies, LLC (“FB”) is able to offer you (“Researcher” or
719
+ “You”) access to the Replica Dataset (the “Dataset”), please read the following
720
+ agreement (“Agreement”).
721
+
722
+ By accessing, and in exchange for receiving permission to access, the Dataset,
723
+ Researcher hereby agrees to the following terms and conditions:
724
+ 1. Researcher may use, modify, improve and/or publish the Dataset only in
725
+ connection with a research or educational purpose that is non-commercial or
726
+ not-for-profit in nature, and not for any other purpose.
727
+ 1. Researcher may provide research associates and colleagues with access to the
728
+ Dataset provided that they first agree to be bound by these terms and
729
+ conditions.
730
+ 1. Researcher may use the Dataset in the scope of their employment at a
731
+ for-profit or commercial entity provided that Researcher complies with Section 1
732
+ of this Agreement. If Researcher is employed by a for-profit or commercial
733
+ entity, Researcher's employer shall also be bound by these terms and conditions,
734
+ and Researcher hereby represents that they are fully authorized to enter into
735
+ this agreement on behalf of such employer.
736
+ 1. THE DATASET IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
737
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
738
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL FB OR ANY
739
+ CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
740
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
741
+ WITH THE DATASET OR THE USE OR OTHER DEALINGS IN THE DATASET.
742
+ 1. The law of the State of California shall apply to all disputes related to
743
+ this Dataset.
744
+
+ ReplicaCAD
+ Creative Commons Attribution 4.0 International (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/
+
+ habitat-sim
+ MIT License
+
+ Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ ===
+ MegaDepth
+ MIT License
+
+ Copyright (c) 2018 Zhengqi Li
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ ===
+ StaticThings3D
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
972
+ WildRGB-D
973
+ https://github.com/wildrgbd/wildrgbd/
974
+ MIT License
975
+
976
+ Copyright (c) 2024 rowdataset
977
+
978
+ Permission is hereby granted, free of charge, to any person obtaining a copy
979
+ of this software and associated documentation files (the "Software"), to deal
980
+ in the Software without restriction, including without limitation the rights
981
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
982
+ copies of the Software, and to permit persons to whom the Software is
983
+ furnished to do so, subject to the following conditions:
984
+
985
+ The above copyright notice and this permission notice shall be included in all
986
+ copies or substantial portions of the Software.
987
+
988
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
989
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
990
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
991
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
992
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
993
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
994
+ SOFTWARE.
995
+
996
+ ===
997
+ TartanAir
998
+ Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/
999
+
1000
+ ===
1001
+ UnrealStereo4K
1002
+ https://github.com/fabiotosi92/SMD-Nets
1003
+ MIT License
1004
+
1005
+ Copyright (c) 2021 Fabio Tosi
1006
+
1007
+ Permission is hereby granted, free of charge, to any person obtaining a copy
1008
+ of this software and associated documentation files (the "Software"), to deal
1009
+ in the Software without restriction, including without limitation the rights
1010
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1011
+ copies of the Software, and to permit persons to whom the Software is
1012
+ furnished to do so, subject to the following conditions:
1013
+
1014
+ The above copyright notice and this permission notice shall be included in all
1015
+ copies or substantial portions of the Software.
1016
+
1017
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1018
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1019
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1020
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1021
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1022
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1023
+ SOFTWARE.
1024
+
1025
+ ===
1026
+ Virtual KITTI 2
1027
+ Creative Commons Attribution-NonCommercial-ShareAlike 3.0: http://creativecommons.org/licenses/by-nc-sa/3.0/legalcode
1028
+
1029
+ ===
1030
+ DL3DV
1031
+ DL3DV-10K Terms of Use and Creative Commons Attribution-NonCommercial 4.0 International License.
1032
+
1033
+ Terms of Use
1034
+
1035
+ Researcher shall use the Dataset only for non-commercial research and educational purposes.
1036
+ DL3DV-10K organization makes no representations or warranties regarding the dataset, including but not limited to warranties of non-infringement or fitness for a particular purpose.
1037
+ Researcher accepts full responsibility for his/her/their use of the Dataset and shall defend and indemnify DL3DV-10K organization, including its members, employees, Trustees, officers and agents, against any and all claims arising from Researcher's use of the Dataset, including but not limited to Researcher's use of any copies of copyrighted 3D models that he/she/they may create from the dataset.
1038
+ Researcher may provide research associates and colleagues with access to the Dataset, provided that the receiving entity has also agreed to and signed these terms and conditions. Sharing the data otherwise is strictly prohibited.
1039
+ Following the General Data Protection Regulation, Researcher must ensure that they can delete all person-specific data upon request.
1040
+ DL3DV-10K organization reserves the right to terminate Researcher's access to the Dataset at any time.
1041
+ If Researcher is employed by a for-profit, commercial entity, Researcher's employer shall also be bound by these terms and conditions, and Researcher hereby represents that he/she/they is/are fully authorized to enter into this agreement on behalf of such employer.
1042
+ The law of the State of Indiana shall apply to all disputes under this agreement.
1043
+
1044
+ Creative Commons Attribution-NonCommercial 4.0 International Public License
1045
+
1046
+ By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
1047
+
1048
+ Section 1 -- Definitions.
1049
+
1050
+ a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
1051
+
1052
+ b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
1053
+
1054
+ c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. d. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
1055
+
1056
+ e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
1057
+
1058
+ f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
1059
+
1060
+ g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
1061
+
1062
+ h. Licensor means the individual(s) or entity(ies) granting rights under this Public License.
1063
+
1064
+ i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
1065
+
1066
+ j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
1067
+
1068
+ k. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
1069
+
1070
+ l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
1071
+
1072
+ Section 2 -- Scope.
1073
+
1074
+ a. License grant.
1075
+
1076
+ 1. Subject to the terms and conditions of this Public License,
1077
+ the Licensor hereby grants You a worldwide, royalty-free,
1078
+ non-sublicensable, non-exclusive, irrevocable license to
1079
+ exercise the Licensed Rights in the Licensed Material to:
1080
+
1081
+ a. reproduce and Share the Licensed Material, in whole or
1082
+ in part, for NonCommercial purposes only; and
1083
+
1084
+ b. produce, reproduce, and Share Adapted Material for
1085
+ NonCommercial purposes only.
1086
+
1087
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
1088
+ Exceptions and Limitations apply to Your use, this Public
1089
+ License does not apply, and You do not need to comply with
1090
+ its terms and conditions.
1091
+
1092
+ 3. Term. The term of this Public License is specified in Section
1093
+ 6(a).
1094
+
1095
+ 4. Media and formats; technical modifications allowed. The
1096
+ Licensor authorizes You to exercise the Licensed Rights in
1097
+ all media and formats whether now known or hereafter created,
1098
+ and to make technical modifications necessary to do so. The
1099
+ Licensor waives and/or agrees not to assert any right or
1100
+ authority to forbid You from making technical modifications
1101
+ necessary to exercise the Licensed Rights, including
1102
+ technical modifications necessary to circumvent Effective
1103
+ Technological Measures. For purposes of this Public License,
1104
+ simply making modifications authorized by this Section 2(a)
1105
+ (4) never produces Adapted Material.
1106
+
1107
+ 5. Downstream recipients.
1108
+
1109
+ a. Offer from the Licensor -- Licensed Material. Every
1110
+ recipient of the Licensed Material automatically
1111
+ receives an offer from the Licensor to exercise the
1112
+ Licensed Rights under the terms and conditions of this
1113
+ Public License.
1114
+
1115
+ b. No downstream restrictions. You may not offer or impose
1116
+ any additional or different terms or conditions on, or
1117
+ apply any Effective Technological Measures to, the
1118
+ Licensed Material if doing so restricts exercise of the
1119
+ Licensed Rights by any recipient of the Licensed
1120
+ Material.
1121
+
1122
+ 6. No endorsement. Nothing in this Public License constitutes or
1123
+ may be construed as permission to assert or imply that You
1124
+ are, or that Your use of the Licensed Material is, connected
1125
+ with, or sponsored, endorsed, or granted official status by,
1126
+ the Licensor or others designated to receive attribution as
1127
+ provided in Section 3(a)(1)(A)(i).
1128
+
1129
+ b. Other rights.
1130
+
1131
+ 1. Moral rights, such as the right of integrity, are not
1132
+ licensed under this Public License, nor are publicity,
1133
+ privacy, and/or other similar personality rights; however, to
1134
+ the extent possible, the Licensor waives and/or agrees not to
1135
+ assert any such rights held by the Licensor to the limited
1136
+ extent necessary to allow You to exercise the Licensed
1137
+ Rights, but not otherwise.
1138
+
1139
+ 2. Patent and trademark rights are not licensed under this
1140
+ Public License.
1141
+
1142
+ 3. To the extent possible, the Licensor waives any right to
1143
+ collect royalties from You for the exercise of the Licensed
1144
+ Rights, whether directly or through a collecting society
1145
+ under any voluntary or waivable statutory or compulsory
1146
+ licensing scheme. In all other cases the Licensor expressly
1147
+ reserves any right to collect such royalties, including when
1148
+ the Licensed Material is used other than for NonCommercial
1149
+ purposes.
1150
+
1151
+ Section 3 -- License Conditions.
1152
+
1153
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
1154
+
1155
+ a. Attribution.
1156
+
1157
+ 1. If You Share the Licensed Material (including in modified
1158
+ form), You must:
1159
+
1160
+ a. retain the following if it is supplied by the Licensor
1161
+ with the Licensed Material:
1162
+
1163
+ i. identification of the creator(s) of the Licensed
1164
+ Material and any others designated to receive
1165
+ attribution, in any reasonable manner requested by
1166
+ the Licensor (including by pseudonym if
1167
+ designated);
1168
+
1169
+ ii. a copyright notice;
1170
+
1171
+ iii. a notice that refers to this Public License;
1172
+
1173
+ iv. a notice that refers to the disclaimer of
1174
+ warranties;
1175
+
1176
+ v. a URI or hyperlink to the Licensed Material to the
1177
+ extent reasonably practicable;
1178
+
1179
+ b. indicate if You modified the Licensed Material and
1180
+ retain an indication of any previous modifications; and
1181
+
1182
+ c. indicate the Licensed Material is licensed under this
1183
+ Public License, and include the text of, or the URI or
1184
+ hyperlink to, this Public License.
1185
+
1186
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
1187
+ reasonable manner based on the medium, means, and context in
1188
+ which You Share the Licensed Material. For example, it may be
1189
+ reasonable to satisfy the conditions by providing a URI or
1190
+ hyperlink to a resource that includes the required
1191
+ information.
1192
+
1193
+ 3. If requested by the Licensor, You must remove any of the
1194
+ information required by Section 3(a)(1)(A) to the extent
1195
+ reasonably practicable.
1196
+
1197
+ 4. If You Share Adapted Material You produce, the Adapter's
1198
+ License You apply must not prevent recipients of the Adapted
1199
+ Material from complying with this Public License.
1200
+
1201
+ Section 4 -- Sui Generis Database Rights.
1202
+
1203
+ Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
1204
+
1205
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
1206
+
1207
+ b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
1208
+
1209
+ c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
1210
+
1211
+ For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
1212
+
1213
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
1214
+
1215
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
1216
+
1217
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
1218
+
1219
+ c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
1220
+
1221
+ Section 6 -- Term and Termination.
1222
+
1223
+ a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
1224
+
1225
+ b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
1226
+
1227
+ 1. automatically as of the date the violation is cured, provided
1228
+ it is cured within 30 days of Your discovery of the
1229
+ violation; or
1230
+
1231
+ 2. upon express reinstatement by the Licensor.
1232
+
1233
+ For the avoidance of doubt, this Section 6(b) does not affect any
1234
+ right the Licensor may have to seek remedies for Your violations
1235
+ of this Public License.
1236
+
1237
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
1238
+
1239
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
1240
+
1241
+ Section 7 -- Other Terms and Conditions.
1242
+
1243
+ a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
1244
+
1245
+ b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
1246
+
1247
+ Section 8 -- Interpretation.
1248
+
1249
+ a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
1250
+
1251
+ b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
1252
+
1253
+ c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
1254
+
1255
+ d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
1256
+
1257
+ ===
1258
+ Niantic Map Free Relocalization Dataset License Agreement
1259
+ This Niantic Map Free Relocalization Dataset License Agreement ("Agreement") is an agreement between you and Niantic, Inc. (“Niantic” or “we”). By downloading or otherwise using Niantic’s Map-Free Relocalization dataset or dataset-derived materials (collectively, the "Dataset") you agree to:
1260
+
1261
+ 1. Purpose and Restrictions. You may use the Dataset only for non-commercial purposes, such as academic research at educational and not-for-profit research institutions, teaching, public demonstrations, and personal experimentation. Non-commercial use expressly excludes any profit-making or commercial activities, including without limitation sale, license, manufacture or development of commercial products, use in commercially-sponsored research, use at a laboratory or other facility owned or controlled (whether in whole or in part) by a commercial entity, provision of consulting service, use for or on behalf of any commercial entity, and use in research where a commercial party obtains rights to research results or any other benefit. Notwithstanding the foregoing restrictions, you can use this Dataset for publishing comparison results for academic papers, including retraining your models on this Dataset.
1262
+
1263
+ 2. License. Subject to this Agreement, Niantic grants you a non-exclusive, non-transferable, non-sublicensable right to download and use the Dataset for the purpose stated in Section 1 of this Agreement. All rights not expressly granted to you in this Agreement are reserved.
1264
+
1265
+ 3. Condition of Use. You must not use the Dataset in a way that could diminish, tarnish, or in any way harm Niantic’s reputation or image.
1266
+
1267
+ 4. No Warranties. The Dataset comes “as is”, and you will use it at your own risk. Niantic makes no representations or warranties regarding the Dataset, including but not limited to warranties of non-infringement or fitness for a particular purpose. Neither Niantic nor any contributor to the Dataset will be liable for any damages related to the Dataset or this Agreement, including direct, indirect, special, consequential or incidental damages, to the maximum extent the law permits, no matter what legal theory they are based on. We are not obligated to (and will not) provide technical support for the Dataset.
1268
+
1269
+ 5. Indemnity. You accept full responsibility for your use of the Dataset and shall defend and indemnify Niantic, including its employees, officers and agents, against any and all claims arising from your use of the Dataset.
1270
+
1271
+ 6. Removal. Niantic reserves the right to remove access to the Dataset at any time without cause. If you have downloaded a copy of the Dataset prior to such removal, you may use such a copy subject to this Agreement, but you may not distribute your copy.
1272
+
1273
+ 7. Termination. This Agreement will terminate immediately upon your commercial use of the Dataset.
1274
+
1275
+ 8. Authorized Representative. If you are employed by a for-profit, commercial entity, your employer shall also be bound by the terms and conditions of this Agreement, and you hereby represent that you are fully authorized to enter into this Agreement on behalf of such employer.
1276
+
1277
+ 9. Survivability. Sections 2, 4, 5, 6, 7, 8, 9, and 10 of this Agreement survive the termination of this Agreement.
1278
+
1279
+ 10. Misc. This Agreement is governed and construed in all respects in accordance with the laws of the State of California, USA without regard to conflicts of law. If any provision of this Agreement is deemed unenforceable or contrary to law, the rest of this Agreement shall remain in full effect and enforceable. If you do not agree to this Agreement, do not download or use the Dataset. The Dataset is protected by copyright and other intellectual property laws and is licensed, not sold.
1280
+
1281
+ ===
1282
+ NVIDIA Source Code License for SegFormer
1283
+
1284
+ 1. Definitions
1285
+
1286
+ “Licensor” means any person or entity that distributes its Work.
1287
+
1288
+ “Software” means the original work of authorship made available under this License.
1289
+
1290
+ “Work” means the Software and any additions to or derivative works of the Software that are made available under
1291
+ this License.
1292
+
1293
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under
1294
+ U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include
1295
+ works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
1296
+
1297
+ Works, including the Software, are “made available” under this License by including in or with the Work either
1298
+ (a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License.
1299
+
1300
+ 2. License Grant
1301
+
1302
+ 2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual,
1303
+ worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly
1304
+ display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
1305
+
1306
+ 3. Limitations
1307
+
1308
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you
1309
+ include a complete copy of this License with your distribution, and (c) you retain without modification any
1310
+ copyright, patent, trademark, or attribution notices that are present in the Work.
1311
+
1312
+ 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and
1313
+ distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use
1314
+ limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works
1315
+ that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution
1316
+ requirements in Section 3.1) will continue to apply to the Work itself.
1317
+
1318
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use
1319
+ non-commercially. Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative
1320
+ works commercially. As used herein, “non-commercially” means for research or evaluation purposes only.
1321
+
1322
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim,
1323
+ cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then
1324
+ your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately.
1325
+
1326
+ 3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos,
1327
+ or trademarks, except as necessary to reproduce the notices described in this License.
1328
+
1329
+ 3.6 Termination. If you violate any term of this License, then your rights under this License (including the
1330
+ grant in Section 2.1) will terminate immediately.
1331
+
1332
+ 4. Disclaimer of Warranty.
1333
+
1334
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
1335
+ WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU
1336
+ BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
1337
+
1338
+ 5. Limitation of Liability.
1339
+
1340
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING
1341
+ NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
1342
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR
1343
+ INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR
1344
+ DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
1345
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
1346
+
1347
+ ===
1348
+ CosXL License Agreement
1349
+
1350
+
1351
+ STABILITY AI NON-COMMERCIAL RESEARCH COMMUNITY LICENSE AGREEMENT Dated: April 7th, 2024
1352
+ By clicking “I Accept” below or by using or distributing any portion or element of the Models, Software, Software Products or Derivative Works, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software Products or Derivative Works through this License, and you must immediately cease using the Software Products or Derivative Works. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Stability AI that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products or Derivative Works on behalf of your employer or other entity.
1353
+ "Agreement" means this Stable Non-Commercial Research Community License Agreement.
1354
+ “AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time.
1355
+ "Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model.
1356
+ “Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software.
1357
+ "Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
1358
+ “Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement.
1359
+ “Non-Commercial Uses” means exercising any of the rights granted herein for the purpose of research or non-commercial purposes. Non-Commercial Uses does not include any production use of the Software Products or any Derivative Works.
1360
+ "Stability AI" or "we" means Stability AI Ltd. and its affiliates.
1361
+
1362
+ "Software" means Stability AI’s proprietary software made available under this Agreement.
1363
+ “Software Products” means the Models, Software and Documentation, individually or in any combination.
1364
+
1365
+ 1. License Rights and Redistribution.
1366
+ a. Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned or controlled by Stability AI embodied in the Software Products to use, reproduce, distribute, and create Derivative Works of, the Software Products, in each case for Non-Commercial Uses only.
1367
+ b. You may not use the Software Products or Derivative Works to enable third parties to use the Software Products or Derivative Works as part of your hosted service or via your APIs, whether you are adding substantial additional functionality thereto or not. Merely distributing the Software Products or Derivative Works for download online without offering any related service (ex. by distributing the Models on HuggingFace) is not a violation of this subsection. If you wish to use the Software Products or any Derivative Works for commercial or production use or you wish to make the Software Products or any Derivative Works available to third parties via your hosted service or your APIs, contact Stability AI at https://stability.ai/contact.
1368
+ c. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Non-Commercial Research Community License, Copyright (c) Stability AI Ltd. All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified.
1369
+ 2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
+ 3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+ 4. Intellectual Property.
1370
+ a. No trademark licenses are granted under this Agreement, and in connection with the Software Products or Derivative Works, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products or Derivative Works.
1371
+ b. Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works.
1372
+ c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products or Derivative Works in violation of this Agreement.
1373
+ 5. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of any Software Products or Derivative Works. Sections 2-4 shall survive the termination of this Agreement.
1374
+ 6. Governing Law. This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law principles.
1376
+
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
2
+
3
+ A summary of the CC BY-NC-SA 4.0 license is located here:
4
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
5
+
6
+ The CC BY-NC-SA 4.0 license is located here:
7
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
NOTICE ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MASt3R
2
+ Copyright 2024-present NAVER Corp.
3
+
4
+ This project contains subcomponents with separate copyright notices and license terms.
5
+ Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
6
+
7
+ ====
8
+
9
+ naver/dust3r
10
+ https://github.com/naver/dust3r/
11
+
12
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0
13
+
14
+ ====
15
+
16
+ naver/croco
17
+ https://github.com/naver/croco/
18
+
19
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0
20
+
21
+ ====
22
+
23
+ pytorch/pytorch
24
+ https://github.com/pytorch/pytorch
25
+
26
+ From PyTorch:
27
+
28
+ Copyright (c) 2016- Facebook, Inc (Adam Paszke)
29
+ Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
30
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
31
+ Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
32
+ Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
33
+ Copyright (c) 2011-2013 NYU (Clement Farabet)
34
+ Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
35
+ Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
36
+ Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
37
+
38
+ From Caffe2:
39
+
40
+ Copyright (c) 2016-present, Facebook Inc. All rights reserved.
41
+
42
+ All contributions by Facebook:
43
+ Copyright (c) 2016 Facebook Inc.
44
+
45
+ All contributions by Google:
46
+ Copyright (c) 2015 Google Inc.
47
+ All rights reserved.
48
+
49
+ All contributions by Yangqing Jia:
50
+ Copyright (c) 2015 Yangqing Jia
51
+ All rights reserved.
52
+
53
+ All contributions by Kakao Brain:
54
+ Copyright 2019-2020 Kakao Brain
55
+
56
+ All contributions by Cruise LLC:
57
+ Copyright (c) 2022 Cruise LLC.
58
+ All rights reserved.
59
+
60
+ All contributions from Caffe:
61
+ Copyright(c) 2013, 2014, 2015, the respective contributors
62
+ All rights reserved.
63
+
64
+ All other contributions:
65
+ Copyright(c) 2015, 2016 the respective contributors
66
+ All rights reserved.
67
+
68
+ Caffe2 uses a copyright model similar to Caffe: each contributor holds
69
+ copyright over their contributions to Caffe2. The project versioning records
70
+ all such contribution and copyright details. If a contributor wants to further
71
+ mark their specific copyright on a particular contribution, they should
72
+ indicate their copyright solely in the commit message of the change when it is
73
+ committed.
74
+
75
+ All rights reserved.
76
+
77
+ Redistribution and use in source and binary forms, with or without
78
+ modification, are permitted provided that the following conditions are met:
79
+
80
+ 1. Redistributions of source code must retain the above copyright
81
+ notice, this list of conditions and the following disclaimer.
82
+
83
+ 2. Redistributions in binary form must reproduce the above copyright
84
+ notice, this list of conditions and the following disclaimer in the
85
+ documentation and/or other materials provided with the distribution.
86
+
87
+ 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
88
+ and IDIAP Research Institute nor the names of its contributors may be
89
+ used to endorse or promote products derived from this software without
90
+ specific prior written permission.
91
+
92
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
93
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
95
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
96
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
97
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
98
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
99
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
100
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
101
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
102
+ POSSIBILITY OF SUCH DAMAGE.
103
+
README.md ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ![banner](assets/mast3r.jpg)
2
+
3
+ Official implementation of `Grounding Image Matching in 3D with MASt3R`
4
+ [[Project page](https://dust3r.europe.naverlabs.com/)], [[MASt3R arxiv](https://arxiv.org/abs/2406.09756)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)]
5
+
6
+ ![Example of matching results obtained from MASt3R](assets/examples.jpg)
7
+
8
+ ![High level overview of MASt3R's architecture](assets/mast3r_archi.jpg)
9
+
10
+ ```bibtex
11
+ @misc{mast3r_arxiv24,
12
+ title={Grounding Image Matching in 3D with MASt3R},
13
+ author={Vincent Leroy and Yohann Cabon and Jerome Revaud},
14
+ year={2024},
15
+ eprint={2406.09756},
16
+ archivePrefix={arXiv},
17
+ primaryClass={cs.CV}
18
+ }
19
+
20
+ @inproceedings{dust3r_cvpr24,
21
+ title={DUSt3R: Geometric 3D Vision Made Easy},
22
+ author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
23
+ booktitle = {CVPR},
24
+ year = {2024}
25
+ }
26
+ ```
27
+
28
+ ## Table of Contents
29
+
30
+ - [Table of Contents](#table-of-contents)
31
+ - [License](#license)
32
+ - [Get Started](#get-started)
33
+ - [Installation](#installation)
34
+ - [Checkpoints](#checkpoints)
35
+ - [Interactive demo](#interactive-demo)
36
+ - [Interactive demo with docker](#interactive-demo-with-docker)
37
+ - [Usage](#usage)
38
+ - [Training](#training)
39
+ - [Datasets](#datasets)
40
+ - [Demo](#demo)
41
+ - [Our Hyperparameters](#our-hyperparameters)
42
+ - [Visual Localization](#visual-localization)
43
+ - [Dataset Preparation](#dataset-preparation)
44
+ - [Example Commands](#example-commands)
45
+
46
+ ## License
47
+
48
+ The code is distributed under the CC BY-NC-SA 4.0 License.
49
+ See [LICENSE](LICENSE) for more information.
50
+
51
+ ```python
52
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
53
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
54
+ ```
55
+
56
+ ## Get Started
57
+
58
+ ### Installation
59
+
60
+ 1. Clone MASt3R.
61
+ ```bash
62
+ git clone --recursive https://github.com/naver/mast3r
63
+ cd mast3r
64
+ # if you have already cloned mast3r:
65
+ # git submodule update --init --recursive
66
+ ```
67
+
68
+ 2. Create the environment; here we show an example using conda.
69
+ ```bash
70
+ conda create -n mast3r python=3.11 cmake=3.14.0
71
+ conda activate mast3r
72
+ conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system
73
+ pip install -r requirements.txt
74
+ pip install -r dust3r/requirements.txt
75
+ # Optional: you can also install additional packages to:
76
+ # - add support for HEIC images
77
+ # - add required packages for visloc.py
78
+ pip install -r dust3r/requirements_optional.txt
79
+ ```
80
+
81
+ 3. Optional: compile the CUDA kernels for RoPE (as in CroCo v2).
82
+ ```bash
83
+ # DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime.
84
+ cd dust3r/croco/models/curope/
85
+ python setup.py build_ext --inplace
86
+ cd ../../../../
87
+ ```
88
+
89
+
90
+ ### Checkpoints
91
+
92
+ You can obtain the checkpoints in one of two ways:
93
+
94
+ 1) You can use our huggingface_hub integration: the models will be downloaded automatically (see the sketch below the table).
95
+
96
+ 2) Otherwise, we provide several pre-trained models that you can download manually:
97
+
98
+ | Modelname | Training resolutions | Head | Encoder | Decoder |
99
+ |-------------|----------------------|------|---------|---------|
100
+ | [`MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric`](https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | CatMLP+DPT | ViT-L | ViT-B |
101
+
102
+ You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters).
103
+ Make sure to check the licenses of the datasets we used.
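+
+ For option 1), a minimal sketch of the automatic download; it reuses the same `from_pretrained` call and model id as the [Usage](#usage) section below:
+
+ ```python
+ from mast3r.model import AsymmetricMASt3R
+
+ # the first call downloads the checkpoint via huggingface_hub, then loads it;
+ # move the model to your device afterwards, e.g. .to('cuda')
+ model = AsymmetricMASt3R.from_pretrained("naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric")
+ ```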
104
+
105
+ To download a specific model, for example `MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth`:
106
+ ```bash
107
+ mkdir -p checkpoints/
108
+ wget https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth -P checkpoints/
109
+ ```
110
+
111
+ For these checkpoints, make sure to agree to the licenses of all the training datasets we used, in addition to CC-BY-NC-SA 4.0.
112
+ The mapfree dataset license in particular is very restrictive. For more information, check [CHECKPOINTS_NOTICE](CHECKPOINTS_NOTICE).
113
+
114
+
115
+ ### Interactive demo
116
+
117
+ There are two demos available:
118
+
119
+ `demo.py` is the updated demo for MASt3R. It uses our new sparse global alignment method that allows you to reconstruct larger scenes.
+
+ ```bash
+ python3 demo.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric
+
+ # Use --weights to load a checkpoint from a local file, e.g. --weights checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth
+ # Use --local_network to make it accessible on the local network, or --server_name to specify the url manually
+ # Use --server_port to change the port; by default it will search for an available port starting at 7860
+ # Use --device to use a different device; by default it's "cuda"
+ ```
+
+ `demo_dust3r_ga.py` is the same demo as in dust3r (+ compatibility for MASt3R models);
+ see https://github.com/naver/dust3r?tab=readme-ov-file#interactive-demo for details.
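+
+ A hedged example, assuming `demo_dust3r_ga.py` takes the same `--model_name`/`--weights` flags as the dust3r demo (check the link above for the authoritative options):
+
+ ```bash
+ # run the dust3r-style global-alignment demo with a MASt3R checkpoint (flags assumed, see note above)
+ python3 demo_dust3r_ga.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric
+ ```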
132
+ ### Interactive demo with docker
133
+
134
+ TODO
135
+
136
+ ![demo](assets/demo.jpg)
137
+
138
+ ## Usage
139
+
140
+ ```python
141
+ from mast3r.model import AsymmetricMASt3R
142
+ from mast3r.fast_nn import fast_reciprocal_NNs
143
+
144
+ import mast3r.utils.path_to_dust3r
145
+ from dust3r.inference import inference
146
+ from dust3r.utils.image import load_images
147
+
148
+ if __name__ == '__main__':
149
+ device = 'cuda'
150
+ schedule = 'cosine'
151
+ lr = 0.01
152
+ niter = 300
153
+
154
+ model_name = "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"
155
+ # you can put the path to a local checkpoint in model_name if needed
156
+ model = AsymmetricMASt3R.from_pretrained(model_name).to(device)
157
+ images = load_images(['dust3r/croco/assets/Chateau1.png', 'dust3r/croco/assets/Chateau2.png'], size=512)
158
+ output = inference([tuple(images)], model, device, batch_size=1, verbose=False)
159
+
160
+ # at this stage, you have the raw dust3r predictions
161
+ view1, pred1 = output['view1'], output['pred1']
162
+ view2, pred2 = output['view2'], output['pred2']
163
+
164
+ desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
165
+
166
+ # find 2D-2D matches between the two images
167
+ matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
168
+ device=device, dist='dot', block_size=2**13)
169
+
170
+ # ignore small border around the edge
171
+ H0, W0 = view1['true_shape'][0]
172
+ valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
173
+ matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
174
+
175
+ H1, W1 = view2['true_shape'][0]
176
+ valid_matches_im1 = (matches_im1[:, 0] >= 3) & (matches_im1[:, 0] < int(W1) - 3) & (
177
+ matches_im1[:, 1] >= 3) & (matches_im1[:, 1] < int(H1) - 3)
178
+
179
+ valid_matches = valid_matches_im0 & valid_matches_im1
180
+ matches_im0, matches_im1 = matches_im0[valid_matches], matches_im1[valid_matches]
181
+
182
+ # visualize a few matches
183
+ import numpy as np
184
+ import torch
185
+ import torchvision.transforms.functional
186
+ from matplotlib import pyplot as pl
187
+
188
+ n_viz = 20
189
+ num_matches = matches_im0.shape[0]
190
+ match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int)
191
+ viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]
192
+
193
+ image_mean = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
194
+ image_std = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
195
+
196
+ viz_imgs = []
197
+ for i, view in enumerate([view1, view2]):
198
+ rgb_tensor = view['img'] * image_std + image_mean
199
+ viz_imgs.append(rgb_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy())
200
+
201
+ H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2]
202
+ img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
203
+ img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
204
+ img = np.concatenate((img0, img1), axis=1)
205
+ pl.figure()
206
+ pl.imshow(img)
207
+ cmap = pl.get_cmap('jet')
208
+ for i in range(n_viz):
209
+ (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T
210
+ pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
211
+ pl.show(block=True)
212
+ ```
213
+ ![matching example on croco pair](assets/matching.jpg)
214
+
215
+ ## Training
216
+
217
+ In this section, we present a short demonstration to get started with training MASt3R.
218
+
219
+ ### Datasets
220
+
221
+ See [Datasets section in DUSt3R](https://github.com/naver/dust3r/tree/datasets?tab=readme-ov-file#datasets)
222
+
223
+ ### Demo
224
+
225
+ As in the DUSt3R training demo, we're going to download and prepare the same subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it.
226
+ It is the exact same process as for DUSt3R.
227
+ The demo model will be trained for a few epochs on a very small dataset.
228
+ It will not be very good.
229
+
230
+ ```bash
231
+ # download and prepare the co3d subset
232
+ mkdir -p data/co3d_subset
233
+ cd data/co3d_subset
234
+ git clone https://github.com/facebookresearch/co3d
235
+ cd co3d
236
+ python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset
237
+ rm ../*.zip
238
+ cd ../../..
239
+
240
+ python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset
241
+
242
+ # download the pretrained dust3r checkpoint
243
+ mkdir -p checkpoints/
244
+ wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/
245
+
246
+ # for this example we'll do fewer epochs, for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters"
247
+ torchrun --nproc_per_node=4 train.py \
248
+ --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop='auto', aug_monocular=0.005, aug_rot90='diff', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], n_corres=8192, nneg=0.5, transform=ColorJitter)" \
249
+ --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), n_corres=1024, seed=777)" \
250
+ --model "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True)" \
251
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2) + 0.075*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')" \
252
+ --test_criterion "Regr3D_ScaleShiftInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)" \
253
+ --pretrained "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" \
254
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \
255
+ --save_freq 1 --keep_freq 5 --eval_freq 1 \
256
+ --output_dir "checkpoints/mast3r_demo"
257
+
258
+ ```
259
+
260
+ ### Our Hyperparameters
261
+ We didn't release all the training datasets, but here are the commands we used for training our models:
262
+
263
+ ```bash
264
+ # MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric - train mast3r with metric regression and matching loss
265
+ # we used cosxl to generate variations of DL3DV ("foggy", "night", "rainy", "snow", "sunny"), but we were not convinced by the results.
266
+
267
+ torchrun --nproc_per_node=8 train.py \
268
+ --train_dataset "57_000 @ Habitat512(1_000_000, split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 68_400 @ BlendedMVS(split='train', mask_sky=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 68_400 @ MegaDepth(split='train', mask_sky=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ARKitScenes(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ Co3d(split='train', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ StaticThings3D(mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ScanNetpp(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ TartanAir(pairs_subset='', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 4_560 @ UnrealStereo4K(resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 1_140 @ VirtualKitti(optical_center_is_centered=True, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ WildRgbd(split='train', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 145_920 @ NianticMapFree(split='train', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 57_000 @ DL3DV(split='nlight', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 57_000 @ DL3DV(split='not-nlight', cosxl_augmentations=None, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 34_200 @ InternalUnreleasedDataset(resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5)" \
269
+ --test_dataset "Habitat512(1_000, split='val', resolution=(512,384), seed=777, n_corres=1024) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), mask_sky=True, seed=777, n_corres=1024) + 1_000 @ ARKitScenes(split='test', resolution=(512,384), seed=777, n_corres=1024) + 1_000 @ MegaDepth(split='val', mask_sky=True, resolution=(512,336), seed=777, n_corres=1024) + 1_000 @ Co3d(split='test', resolution=(512,384), mask_bg='rand', seed=777, n_corres=1024)" \
270
+ --model "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True, desc_conf_mode=('exp', 0, inf))" \
271
+ --train_criterion "ConfLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2, loss_in_log=False) + 0.075*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')" \
272
+ --test_criterion "Regr3D(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)" \
273
+ --pretrained "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" \
274
+ --lr 0.0001 --min_lr 1e-06 --warmup_epochs 8 --epochs 50 --batch_size 4 --accum_iter 2 \
275
+ --save_freq 1 --keep_freq 5 --eval_freq 1 --print_freq=10 \
276
+ --output_dir "checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"
277
+
278
+ ```
279
+
280
+ ## Visual Localization
281
+
282
+ ### Dataset preparation
283
+
284
+ See [Visloc section in DUSt3R](https://github.com/naver/dust3r/tree/dust3r_visloc#dataset-preparation)
285
+
286
+ ### Example Commands
287
+
288
+ With `visloc.py` you can run our visual localization experiments on Aachen-Day-Night, InLoc, Cambridge Landmarks and 7 Scenes.
289
+
290
+
291
+ ```bash
292
+ # Aachen-Day-Night-v1.1:
293
+ # scene in 'day' 'night'
294
+ # scene can also be 'all'
295
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocAachenDayNight('/path/to/prepared/Aachen-Day-Night-v1.1/', subscene='${scene}', pairsfile='fire_top50', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Aachen-Day-Night-v1.1/${scene}/loc
296
+
297
+ # or with coarse to fine:
298
+
299
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocAachenDayNight('/path/to/prepared/Aachen-Day-Night-v1.1/', subscene='${scene}', pairsfile='fire_top50', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Aachen-Day-Night-v1.1/${scene}/loc --coarse_to_fine --max_batch_size 48 --c2f_crop_with_homography
300
+
301
+ # InLoc
302
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocInLoc('/path/to/prepared/InLoc/', pairsfile='pairs-query-netvlad40-temporal', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/InLoc/loc
303
+
304
+ # or with coarse to fine:
305
+
306
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocInLoc('/path/to/prepared/InLoc/', pairsfile='pairs-query-netvlad40-temporal', topk=20)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/InLoc/loc --coarse_to_fine --max_image_size 1200 --max_batch_size 48 --c2f_crop_with_homography
307
+
308
+ # 7-scenes:
309
+ # scene in 'chess' 'fire' 'heads' 'office' 'pumpkin' 'redkitchen' 'stairs'
310
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocSevenScenes('/path/to/prepared/7-scenes/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/7-scenes/${scene}/loc
311
+
312
+ # Cambridge Landmarks:
313
+ # scene in 'ShopFacade' 'GreatCourt' 'KingsCollege' 'OldHospital' 'StMarysChurch'
314
+ python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric --dataset "VislocCambridgeLandmarks('/path/to/prepared/Cambridge_Landmarks/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Cambridge_Landmarks/${scene}/loc
315
+
316
+ ```
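+
+ For batch runs over a whole benchmark, the `${scene}` placeholder can be driven by a plain shell loop. As a minimal sketch for 7-scenes, reusing the exact command from above (all paths are placeholders):
+
+ ```bash
+ for scene in 'chess' 'fire' 'heads' 'office' 'pumpkin' 'redkitchen' 'stairs'; do
+     python3 visloc.py --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
+         --dataset "VislocSevenScenes('/path/to/prepared/7-scenes/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" \
+         --pixel_tol 5 --pnp_mode poselib --reprojection_error_diag_ratio 0.008 \
+         --output_dir "/path/to/output/7-scenes/${scene}/loc"
+ done
+ ```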
assets/NLE_tower/01D90321-69C8-439F-B0B0-E87E7634741C-83120-000041DAE419D7AE.jpg ADDED
assets/NLE_tower/1AD85EF5-B651-4291-A5C0-7BDB7D966384-83120-000041DADF639E09.jpg ADDED
assets/NLE_tower/2679C386-1DC0-4443-81B5-93D7EDE4AB37-83120-000041DADB2EA917.jpg ADDED
assets/NLE_tower/28EDBB63-B9F9-42FB-AC86-4852A33ED71B-83120-000041DAF22407A1.jpg ADDED
assets/NLE_tower/91E9B685-7A7D-42D7-B933-23A800EE4129-83120-000041DAE12C8176.jpg ADDED
assets/NLE_tower/CDBBD885-54C3-4EB4-9181-226059A60EE0-83120-000041DAE0C3D612.jpg ADDED
assets/NLE_tower/FF5599FD-768B-431A-AB83-BDA5FB44CB9D-83120-000041DADDE35483.jpg ADDED
assets/demo.jpg ADDED
assets/examples.jpg ADDED
assets/mast3r.jpg ADDED
assets/mast3r_archi.jpg ADDED
assets/matching.jpg ADDED
demo.py ADDED
@@ -0,0 +1,297 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # gradio demo
7
+ # --------------------------------------------------------
8
+ import math
9
+ import gradio
10
+ import os
11
+ import torch
12
+ import numpy as np
13
+ import tempfile
14
+ import functools
15
+ import trimesh
16
+ import copy
17
+ from scipy.spatial.transform import Rotation
18
+
19
+ from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
20
+ from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess
21
+
22
+ from mast3r.model import AsymmetricMASt3R
23
+ from mast3r.utils.misc import hash_md5
24
+ import mast3r.utils.path_to_dust3r # noqa
25
+ from dust3r.image_pairs import make_pairs
26
+ from dust3r.utils.image import load_images
27
+ from dust3r.utils.device import to_numpy
28
+ from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
29
+ from dust3r.demo import get_args_parser as dust3r_get_args_parser
30
+
31
+ import matplotlib.pyplot as pl
32
+ pl.ion()
33
+
34
+ torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
35
+ batch_size = 1
36
+
37
+
38
+ def get_args_parser():
39
+ parser = dust3r_get_args_parser()
40
+ parser.add_argument('--share', action='store_true')
41
+
42
+ actions = parser._actions
43
+ for action in actions:
44
+ if action.dest == 'model_name':
45
+ action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"]
46
+ # change defaults
47
+ parser.prog = 'mast3r demo'
48
+ return parser
49
+
50
+
51
+ def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
52
+ cam_color=None, as_pointcloud=False,
53
+ transparent_cams=False, silent=False):
54
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
55
+ pts3d = to_numpy(pts3d)
56
+ imgs = to_numpy(imgs)
57
+ focals = to_numpy(focals)
58
+ cams2world = to_numpy(cams2world)
59
+
60
+ scene = trimesh.Scene()
61
+
62
+ # full pointcloud
63
+ if as_pointcloud:
64
+ pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)])
65
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
66
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
67
+ scene.add_geometry(pct)
68
+ else:
69
+ meshes = []
70
+ for i in range(len(imgs)):
71
+ meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i].reshape(imgs[i].shape), mask[i]))
72
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
73
+ scene.add_geometry(mesh)
74
+
75
+ # add each camera
76
+ for i, pose_c2w in enumerate(cams2world):
77
+ if isinstance(cam_color, list):
78
+ camera_edge_color = cam_color[i]
79
+ else:
80
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
81
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
82
+ None if transparent_cams else imgs[i], focals[i],
83
+ imsize=imgs[i].shape[1::-1], screen_width=cam_size)
84
+
85
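+ # re-orient the exported scene relative to the first camera (with an OpenGL-convention flip)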
+ rot = np.eye(4)
86
+ rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
87
+ scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
88
+ outfile = os.path.join(outdir, 'scene.glb')
89
+ if not silent:
90
+ print('(exporting 3D scene to', outfile, ')')
91
+ scene.export(file_obj=outfile)
92
+ return outfile
93
+
94
+
95
+ def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=2, as_pointcloud=False, mask_sky=False,
96
+ clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0):
97
+ """
98
+ extract 3D_model (glb file) from a reconstructed scene
99
+ """
100
+ if scene is None:
101
+ return None
102
+
103
+ # get optimized values from scene
104
+ rgbimg = scene.imgs
105
+ focals = scene.get_focals().cpu()
106
+ cams2world = scene.get_im_poses().cpu()
107
+
108
+ # 3D pointcloud from depthmap, poses and intrinsics
109
+ if TSDF_thresh > 0:
110
+ tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh)
111
+ pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth))
112
+ else:
113
+ pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth))
114
+ msk = to_numpy([c > min_conf_thr for c in confs])
115
+ return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
116
+ transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)
117
+
118
+
119
+ def get_reconstructed_scene(outdir, model, device, silent, image_size, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr,
120
+ as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
121
+ scenegraph_type, winsize, refid, TSDF_thresh, **kw):
122
+ """
123
+ from a list of images, run mast3r inference and the sparse global aligner,
124
+ then run get_3D_model_from_scene
125
+ """
126
+ imgs = load_images(filelist, size=image_size, verbose=not silent)
127
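+ # a single input image is duplicated so that inference still runs on a (self-matching) pair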
+ if len(imgs) == 1:
128
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
129
+ imgs[1]['idx'] = 1
130
+ filelist = [filelist[0], filelist[0] + '_2']
131
+ if scenegraph_type == "swin":
132
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
133
+ elif scenegraph_type == "oneref":
134
+ scenegraph_type = scenegraph_type + "-" + str(refid)
135
+ elif scenegraph_type == "matrix":
136
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
137
+
138
+ pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
139
+ if optim_level == 'coarse':
140
+ niter2 = 0
141
+ # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation)
142
+ scene = sparse_global_alignment(filelist, pairs, os.path.join(outdir, 'cache'),
143
+ model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device,
144
+ opt_depth='depth' in optim_level, **kw)
145
+ outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
146
+ clean_depth, transparent_cams, cam_size, TSDF_thresh)
147
+ return scene, outfile
148
+
149
+
150
+ def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
151
+ num_files = len(inputfiles) if inputfiles is not None else 1
152
+ max_winsize = max(1, math.ceil((num_files - 1) / 2))
153
+ if scenegraph_type == "swin":
154
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
155
+ minimum=1, maximum=max_winsize, step=1, visible=True)
156
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
157
+ maximum=num_files - 1, step=1, visible=False)
158
+ elif scenegraph_type == "oneref":
159
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
160
+ minimum=1, maximum=max_winsize, step=1, visible=False)
161
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
162
+ maximum=num_files - 1, step=1, visible=True)
163
+ elif scenegraph_type == "matrix":
164
+ winsize = gradio.Slider(label="Scene Graph: long period", value=6,
165
+ minimum=2, maximum=num_files, step=1, visible=True)
166
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
167
+ maximum=num_files - 1, step=1, visible=False)
168
+ else:
169
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
170
+ minimum=1, maximum=max_winsize, step=1, visible=False)
171
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
172
+ maximum=num_files - 1, step=1, visible=False)
173
+ return winsize, refid
174
+
175
+
176
+ def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False, share=False):
177
+ if not silent:
178
+ print('Outputting stuff in', tmpdirname)
179
+
180
+ recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, silent, image_size)
181
+ model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, silent)
182
+ with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="MASt3R Demo") as demo:
183
+ # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
184
+ scene = gradio.State(None)
185
+ gradio.HTML('<h2 style="text-align: center;">MASt3R Demo</h2>')
186
+ with gradio.Column():
187
+ inputfiles = gradio.File(file_count="multiple")
188
+ with gradio.Row():
189
+ lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01)
190
+ niter1 = gradio.Number(value=200, precision=0, minimum=0, maximum=10_000,
191
+ label="num_iterations", info="For coarse alignment!")
192
+ lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001)
193
+ niter2 = gradio.Number(value=500, precision=0, minimum=0, maximum=100_000,
194
+ label="num_iterations", info="For refinement!")
195
+ optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"],
196
+ value='refine', label="OptLevel",
197
+ info="Optimization level")
198
+
199
+ scenegraph_type = gradio.Dropdown(["complete", "swin", "oneref", 'matrix'],
200
+ value='complete', label="Scenegraph",
201
+ info="Define how to make pairs",
202
+ interactive=True)
203
+ winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
204
+ minimum=1, maximum=1, step=1, visible=False)
205
+ refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
206
+
207
+ run_btn = gradio.Button("Run")
208
+
209
+ with gradio.Row():
210
+ # adjust the confidence threshold
211
+ min_conf_thr = gradio.Slider(label="min_conf_thr", value=1.5, minimum=0.0, maximum=10, step=0.1)
212
+ # adjust the camera size in the output pointcloud
213
+ cam_size = gradio.Slider(label="cam_size", value=0.2, minimum=0.001, maximum=1.0, step=0.001)
214
+ TSDF_thresh = gradio.Slider(label="TSDF Threshold", value=0., minimum=0., maximum=1., step=0.01)
215
+ with gradio.Row():
216
+ as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud")
217
+ # two post process implemented
218
+ mask_sky = gradio.Checkbox(value=False, label="Mask sky")
219
+ clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
220
+ transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
221
+
222
+ outmodel = gradio.Model3D()
223
+
224
+ # events
225
+ scenegraph_type.change(set_scenegraph_options,
226
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
227
+ outputs=[winsize, refid])
228
+ inputfiles.change(set_scenegraph_options,
229
+ inputs=[inputfiles, winsize, refid, scenegraph_type],
230
+ outputs=[winsize, refid])
231
+ run_btn.click(fn=recon_fun,
232
+ inputs=[inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, as_pointcloud,
233
+ mask_sky, clean_depth, transparent_cams, cam_size,
234
+ scenegraph_type, winsize, refid, TSDF_thresh],
235
+ outputs=[scene, outmodel])
236
+ min_conf_thr.release(fn=model_from_scene_fun,
237
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
238
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
239
+ outputs=outmodel)
240
+ cam_size.change(fn=model_from_scene_fun,
241
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
242
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
243
+ outputs=outmodel)
244
+ TSDF_thresh.change(fn=model_from_scene_fun,
245
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
246
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
247
+ outputs=outmodel)
248
+ as_pointcloud.change(fn=model_from_scene_fun,
249
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
250
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
251
+ outputs=outmodel)
252
+ mask_sky.change(fn=model_from_scene_fun,
253
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
254
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
255
+ outputs=outmodel)
256
+ clean_depth.change(fn=model_from_scene_fun,
257
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
258
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
259
+ outputs=outmodel)
260
+ transparent_cams.change(model_from_scene_fun,
261
+ inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
262
+ clean_depth, transparent_cams, cam_size, TSDF_thresh],
263
+ outputs=outmodel)
264
+ demo.launch(share=share, server_name=server_name, server_port=server_port)
265
+
266
+
267
+ if __name__ == '__main__':
268
+ parser = get_args_parser()
269
+ args = parser.parse_args()
270
+
271
+ if args.server_name is not None:
272
+ server_name = args.server_name
273
+ else:
274
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
275
+
276
+ if args.weights is not None:
277
+ weights_path = args.weights
278
+ else:
279
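+ # default to the model id on the HuggingFace hub, from which from_pretrained() fetches the checkpoint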
+ weights_path = "naver/" + args.model_name
280
+
281
+ model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
282
+ chkpt_tag = hash_md5(weights_path)
283
+
284
+ # mast3r will write the 3D model inside tmpdirname/chkpt_tag
285
+ if args.tmp_dir is not None:
286
+ tmpdirname = args.tmp_dir
287
+ cache_path = os.path.join(tmpdirname, chkpt_tag)
288
+ os.makedirs(cache_path, exist_ok=True)
289
+ main_demo(cache_path, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent,
290
+ share=args.share)
291
+ else:
292
+ with tempfile.TemporaryDirectory(suffix='_mast3r_gradio_demo') as tmpdirname:
293
+ cache_path = os.path.join(tmpdirname, chkpt_tag)
294
+ os.makedirs(cache_path, exist_ok=True)
295
+ main_demo(tmpdirname, model, args.device, args.image_size,
296
+ server_name, args.server_port, silent=args.silent,
297
+ share=args.share)
demo_dust3r_ga.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # gradio demo executable (MASt3R model with DUSt3R's global alignment)
7
+ # --------------------------------------------------------
8
+ import os
9
+ import torch
10
+ import tempfile
11
+
12
+ import mast3r.utils.path_to_dust3r # noqa
13
+ from dust3r.model import AsymmetricCroCo3DStereo
14
+ from mast3r.model import AsymmetricMASt3R
15
+ from dust3r.demo import get_args_parser as dust3r_get_args_parser
16
+ from dust3r.demo import main_demo
17
+
18
+ import matplotlib.pyplot as pl
19
+ pl.ion()
20
+
21
+ torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
22
+
23
+
24
+ def get_args_parser():
25
+ parser = dust3r_get_args_parser()
26
+
27
+ actions = parser._actions
28
+ for action in actions:
29
+ if action.dest == 'model_name':
30
+ action.choices.append('MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric')
31
+ # change defaults
32
+ parser.prog = 'mast3r demo'
33
+ return parser
34
+
35
+
36
+ if __name__ == '__main__':
37
+ parser = get_args_parser()
38
+ args = parser.parse_args()
39
+
40
+ if args.tmp_dir is not None:
41
+ tmp_path = args.tmp_dir
42
+ os.makedirs(tmp_path, exist_ok=True)
43
+ tempfile.tempdir = tmp_path
44
+
45
+ if args.server_name is not None:
46
+ server_name = args.server_name
47
+ else:
48
+ server_name = '0.0.0.0' if args.local_network else '127.0.0.1'
49
+
50
+ if args.weights is not None:
51
+ weights_path = args.weights
52
+ else:
53
+ weights_path = "naver/" + args.model_name
54
+
55
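+ # first try to load the checkpoint as a MASt3R model, otherwise fall back to a plain DUSt3R model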
+ try:
56
+ model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
57
+ except Exception:
58
+ model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(args.device)
59
+
60
+ # dust3r will write the 3D model inside tmpdirname
61
+ with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
62
+ if not args.silent:
63
+ print('Outputting stuff in', tmpdirname)
64
+ main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent)
dust3r ADDED
@@ -0,0 +1 @@
1
+ Subproject commit c267f72ba845108be1d2bd5ea98c641f2835cbc7
mast3r/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/catmlp_dpt_head.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # MASt3R heads
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ import mast3r.utils.path_to_dust3r # noqa
11
+ from dust3r.heads.postprocess import reg_dense_depth, reg_dense_conf # noqa
12
+ from dust3r.heads.dpt_head import PixelwiseTaskWithDPT # noqa
13
+ import dust3r.utils.path_to_croco # noqa
14
+ from models.blocks import Mlp # noqa
15
+
16
+
17
+ def reg_desc(desc, mode):
18
+ if 'norm' in mode:
19
+ desc = desc / desc.norm(dim=-1, keepdim=True)
20
+ else:
21
+ raise ValueError(f"Unknown desc mode {mode}")
22
+ return desc
23
+
24
+
25
+ def postprocess(out, depth_mode, conf_mode, desc_dim=None, desc_mode='norm', two_confs=False, desc_conf_mode=None):
26
+ if desc_conf_mode is None:
27
+ desc_conf_mode = conf_mode
28
+ fmap = out.permute(0, 2, 3, 1) # B,H,W,D
29
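+ # channel layout of fmap: [xyz (3) | conf (1, optional) | desc (desc_dim) | desc_conf (1, only if two_confs)]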
+ res = dict(pts3d=reg_dense_depth(fmap[..., 0:3], mode=depth_mode))
30
+ if conf_mode is not None:
31
+ res['conf'] = reg_dense_conf(fmap[..., 3], mode=conf_mode)
32
+ if desc_dim is not None:
33
+ start = 3 + int(conf_mode is not None)
34
+ res['desc'] = reg_desc(fmap[..., start:start + desc_dim], mode=desc_mode)
35
+ if two_confs:
36
+ res['desc_conf'] = reg_dense_conf(fmap[..., start + desc_dim], mode=desc_conf_mode)
37
+ else:
38
+ res['desc_conf'] = res['conf'].clone()
39
+ return res
40
+
41
+
42
+ class Cat_MLP_LocalFeatures_DPT_Pts3d(PixelwiseTaskWithDPT):
43
+ """ Mixture between MLP and DPT head that outputs 3d points and local features (with MLP).
44
+ The input for both heads is a concatenation of Encoder and Decoder outputs
45
+ """
46
+
47
+ def __init__(self, net, has_conf=False, local_feat_dim=16, hidden_dim_factor=4., hooks_idx=None, dim_tokens=None,
48
+ num_channels=1, postprocess=None, feature_dim=256, last_dim=32, depth_mode=None, conf_mode=None, head_type="regression", **kwargs):
49
+ super().__init__(num_channels=num_channels, feature_dim=feature_dim, last_dim=last_dim, hooks_idx=hooks_idx,
50
+ dim_tokens=dim_tokens, depth_mode=depth_mode, postprocess=postprocess, conf_mode=conf_mode, head_type=head_type)
51
+ self.local_feat_dim = local_feat_dim
52
+
53
+ patch_size = net.patch_embed.patch_size
54
+ if isinstance(patch_size, tuple):
55
+ assert len(patch_size) == 2 and isinstance(patch_size[0], int) and isinstance(
56
+ patch_size[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints."
57
+ assert patch_size[0] == patch_size[1], "Error, non-square patches are not supported"
58
+ patch_size = patch_size[0]
59
+ self.patch_size = patch_size
60
+
61
+ self.desc_mode = net.desc_mode
62
+ self.has_conf = has_conf
63
+ self.two_confs = net.two_confs # independent confs for 3D regr and descs
64
+ self.desc_conf_mode = net.desc_conf_mode
65
+ idim = net.enc_embed_dim + net.dec_embed_dim
66
+
67
+ self.head_local_features = Mlp(in_features=idim,
68
+ hidden_features=int(hidden_dim_factor * idim),
69
+ out_features=(self.local_feat_dim + self.two_confs) * self.patch_size**2)
70
+
71
+ def forward(self, decout, img_shape):
72
+ # pass through the heads
73
+ pts3d = self.dpt(decout, image_size=(img_shape[0], img_shape[1]))
74
+
75
+ # recover encoder and decoder outputs
76
+ enc_output, dec_output = decout[0], decout[-1]
77
+ cat_output = torch.cat([enc_output, dec_output], dim=-1) # concatenate
78
+ H, W = img_shape
79
+ B, S, D = cat_output.shape
80
+
81
+ # extract local_features
82
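+ # each token predicts (local_feat_dim + two_confs) * patch_size**2 values, turned into per-pixel descriptors by pixel_shuffle below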
+ local_features = self.head_local_features(cat_output) # B,S,D
83
+ local_features = local_features.transpose(-1, -2).view(B, -1, H // self.patch_size, W // self.patch_size)
84
+ local_features = F.pixel_shuffle(local_features, self.patch_size) # B,d,H,W
85
+
86
+ # post process 3D pts, descriptors and confidences
87
+ out = torch.cat([pts3d, local_features], dim=1)
88
+ if self.postprocess:
89
+ out = self.postprocess(out,
90
+ depth_mode=self.depth_mode,
91
+ conf_mode=self.conf_mode,
92
+ desc_dim=self.local_feat_dim,
93
+ desc_mode=self.desc_mode,
94
+ two_confs=self.two_confs,
95
+ desc_conf_mode=self.desc_conf_mode)
96
+ return out
97
+
98
+
99
+ def mast3r_head_factory(head_type, output_mode, net, has_conf=False):
100
+ """" build a prediction head for the decoder
101
+ """
102
+ if head_type == 'catmlp+dpt' and output_mode.startswith('pts3d+desc'):
103
+ local_feat_dim = int(output_mode[10:])
104
+ assert net.dec_depth > 9
105
+ l2 = net.dec_depth
106
+ feature_dim = 256
107
+ last_dim = feature_dim // 2
108
+ out_nchan = 3
109
+ ed = net.enc_embed_dim
110
+ dd = net.dec_embed_dim
111
+ return Cat_MLP_LocalFeatures_DPT_Pts3d(net, local_feat_dim=local_feat_dim, has_conf=has_conf,
112
+ num_channels=out_nchan + has_conf,
113
+ feature_dim=feature_dim,
114
+ last_dim=last_dim,
115
+ hooks_idx=[0, l2 * 2 // 4, l2 * 3 // 4, l2],
116
+ dim_tokens=[ed, dd, dd, dd],
117
+ postprocess=postprocess,
118
+ depth_mode=net.depth_mode,
119
+ conf_mode=net.conf_mode,
120
+ head_type='regression')
121
+ else:
122
+ raise NotImplementedError(
123
+ f"unexpected {head_type=} and {output_mode=}")
mast3r/cloud_opt/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/cloud_opt/sparse_ga.py ADDED
@@ -0,0 +1,1001 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # MASt3R Sparse Global Alignment
6
+ # --------------------------------------------------------
7
+ from tqdm import tqdm
8
+ import roma
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import numpy as np
13
+ import os
14
+ from collections import namedtuple
15
+ from functools import lru_cache
16
+ from scipy import sparse as sp
17
+
18
+ from mast3r.utils.misc import mkdir_for, hash_md5
19
+ from mast3r.cloud_opt.utils.losses import gamma_loss
20
+ from mast3r.cloud_opt.utils.schedules import linear_schedule, cosine_schedule
21
+ from mast3r.fast_nn import fast_reciprocal_NNs, merge_corres
22
+
23
+ import mast3r.utils.path_to_dust3r # noqa
24
+ from dust3r.utils.geometry import inv, geotrf # noqa
25
+ from dust3r.utils.device import to_cpu, to_numpy, todevice # noqa
26
+ from dust3r.post_process import estimate_focal_knowing_depth # noqa
27
+ from dust3r.optim_factory import adjust_learning_rate_by_lr # noqa
28
+ from dust3r.cloud_opt.base_opt import clean_pointcloud
29
+ from dust3r.viz import SceneViz
30
+
31
+
32
+ class SparseGA():
33
+ def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None):
34
+ def fetch_img(im):
35
+ def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.)
36
+ for im1, im2 in pairs_in:
37
+ if im1['instance'] == im:
38
+ return torgb(im1['img'])
39
+ if im2['instance'] == im:
40
+ return torgb(im2['img'])
41
+ self.canonical_paths = canonical_paths
42
+ self.img_paths = img_paths
43
+ self.imgs = [fetch_img(img) for img in img_paths]
44
+ self.intrinsics = res_fine['intrinsics']
45
+ self.cam2w = res_fine['cam2w']
46
+ self.depthmaps = res_fine['depthmaps']
47
+ self.pts3d = res_fine['pts3d']
48
+ self.pts3d_colors = []
49
+ self.working_device = self.cam2w.device
50
+ for i in range(len(self.imgs)):
51
+ im = self.imgs[i]
52
+ x, y = anchors[i][0][..., :2].detach().cpu().numpy().T
53
+ self.pts3d_colors.append(im[y, x])
54
+ assert self.pts3d_colors[-1].shape == self.pts3d[i].shape
55
+ self.n_imgs = len(self.imgs)
56
+
57
+ def get_focals(self):
58
+ return torch.tensor([ff[0, 0] for ff in self.intrinsics]).to(self.working_device)
59
+
60
+ def get_principal_points(self):
61
+ return torch.stack([ff[:2, -1] for ff in self.intrinsics]).to(self.working_device)
62
+
63
+ def get_im_poses(self):
64
+ return self.cam2w
65
+
66
+ def get_sparse_pts3d(self):
67
+ return self.pts3d
68
+
69
+ def get_dense_pts3d(self, clean_depth=True, subsample=8):
70
+ assert self.canonical_paths, 'cache_path is required for dense 3d points'
71
+ device = self.cam2w.device
72
+ confs = []
73
+ base_focals = []
74
+ anchors = {}
75
+ for i, canon_path in enumerate(self.canonical_paths):
76
+ (canon, canon2, conf), focal = torch.load(canon_path, map_location=device)
77
+ confs.append(conf)
78
+ base_focals.append(focal)
79
+
80
+ H, W = conf.shape
81
+ pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device)
82
+ idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample)
83
+ anchors[i] = (pixels, idxs[i], offsets[i])
84
+
85
+ # densify sparse depthmaps
86
+ pts3d, depthmaps = make_pts3d(anchors, self.intrinsics, self.cam2w, [
87
+ d.ravel() for d in self.depthmaps], base_focals=base_focals, ret_depth=True)
88
+
89
+ if clean_depth:
90
+ confs = clean_pointcloud(confs, self.intrinsics, inv(self.cam2w), depthmaps, pts3d)
91
+
92
+ return pts3d, depthmaps, confs
93
+
94
+ def get_pts3d_colors(self):
95
+ return self.pts3d_colors
96
+
97
+ def get_depthmaps(self):
98
+ return self.depthmaps
99
+
100
+ def get_masks(self):
101
+ return [slice(None, None) for _ in range(len(self.imgs))]
102
+
103
+ def show(self, show_cams=True):
104
+ pts3d, _, confs = self.get_dense_pts3d()
105
+ show_reconstruction(self.imgs, self.intrinsics if show_cams else None, self.cam2w,
106
+ [p.clip(min=-50, max=50) for p in pts3d],
107
+ masks=[c > 1 for c in confs])
108
+
109
+
110
+ def convert_dust3r_pairs_naming(imgs, pairs_in):
111
+ for pair_id in range(len(pairs_in)):
112
+ for i in range(2):
113
+ pairs_in[pair_id][i]['instance'] = imgs[pairs_in[pair_id][i]['idx']]
114
+ return pairs_in
115
+
116
+
117
+ def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf',
118
+ device='cuda', dtype=torch.float32, **kw):
119
+ """ Sparse alignment with MASt3R
120
+ imgs: list of image paths
121
+ cache_path: path where to dump temporary files (str)
122
+
123
+ lr1, niter1: learning rate and #iterations for coarse global alignment (3D matching)
124
+ lr2, niter2: learning rate and #iterations for refinement (2D reproj error)
125
+
126
+ lora_depth: smart dimensionality reduction with depthmaps
127
+ """
128
+ # Convert pair naming convention from dust3r to mast3r
129
+ pairs_in = convert_dust3r_pairs_naming(imgs, pairs_in)
130
+ # forward pass
131
+ pairs, cache_path = forward_mast3r(pairs_in, model,
132
+ cache_path=cache_path, subsample=subsample,
133
+ desc_conf=desc_conf, device=device)
134
+
135
+ # extract canonical pointmaps
136
+ tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \
137
+ prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device)
138
+
139
+ # compute minimal spanning tree
140
+ mst = compute_min_spanning_tree(pairwise_scores)
141
+
142
+ # remove all edges not in the spanning tree?
143
+ # min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]}
144
+ # tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree}
145
+
146
+ # smartly combine all useful data
147
+ imsizes, pps, base_focals, core_depth, anchors, corres, corres2d = \
148
+ condense_data(imgs, tmp_pairs, canonical_views, dtype)
149
+
150
+ imgs, res_coarse, res_fine = sparse_scene_optimizer(
151
+ imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths,
152
+ mst, cache_path=cache_path, device=device, dtype=dtype, **kw)
153
+ return SparseGA(imgs, pairs_in, res_fine or res_coarse, anchors, canonical_paths)
154
+
155
+
156
+ def sparse_scene_optimizer(imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d,
157
+ preds_21, canonical_paths, mst, cache_path,
158
+ lr1=0.2, niter1=500, loss1=gamma_loss(1.1),
159
+ lr2=0.02, niter2=500, loss2=gamma_loss(0.4),
160
+ lossd=gamma_loss(1.1),
161
+ opt_pp=True, opt_depth=True,
162
+ schedule=cosine_schedule, depth_mode='add', exp_depth=False,
163
+ lora_depth=False, # dict(k=96, gamma=15, min_norm=.5),
164
+ init={}, device='cuda', dtype=torch.float32,
165
+ matching_conf_thr=4., loss_dust3r_w=0.01,
166
+ verbose=True, dbg=()):
167
+
168
+ # extrinsic parameters
169
+ vec0001 = torch.tensor((0, 0, 0, 1), dtype=dtype, device=device)
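+ # (0, 0, 0, 1) is the identity quaternion in xyzw convention: all cameras start un-rotated at the origin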
170
+ quats = [nn.Parameter(vec0001.clone()) for _ in range(len(imgs))]
171
+ trans = [nn.Parameter(torch.zeros(3, device=device, dtype=dtype)) for _ in range(len(imgs))]
172
+
173
+ # initialize
174
+ ones = torch.ones((len(imgs), 1), device=device, dtype=dtype)
175
+ median_depths = torch.ones(len(imgs), device=device, dtype=dtype)
176
+ for img in imgs:
177
+ idx = imgs.index(img)
178
+ init_values = init.setdefault(img, {})
179
+ if verbose and init_values:
180
+ print(f' >> initializing img=...{img[-25:]} [{idx}] for {set(init_values)}')
181
+
182
+ K = init_values.get('intrinsics')
183
+ if K is not None:
184
+ K = K.detach()
185
+ focal = K[:2, :2].diag().mean()
186
+ pp = K[:2, 2]
187
+ base_focals[idx] = focal
188
+ pps[idx] = pp
189
+ pps[idx] /= imsizes[idx] # default principal_point would be (0.5, 0.5)
190
+
191
+ depth = init_values.get('depthmap')
192
+ if depth is not None:
193
+ core_depth[idx] = depth.detach()
194
+
195
+ median_depths[idx] = med_depth = core_depth[idx].median()
196
+ core_depth[idx] /= med_depth
197
+
198
+ cam2w = init_values.get('cam2w')
199
+ if cam2w is not None:
200
+ rot = cam2w[:3, :3].detach()
201
+ cam_center = cam2w[:3, 3].detach()
202
+ quats[idx].data[:] = roma.rotmat_to_unitquat(rot)
203
+ trans_offset = med_depth * torch.cat((imsizes[idx] / base_focals[idx] * (0.5 - pps[idx]), ones[:1, 0]))
204
+ trans[idx].data[:] = cam_center + rot @ trans_offset
205
+ del rot
206
+ assert False, 'inverse kinematic chain not yet implemented'
207
+
208
+ # intrinsics parameters
209
+ pps = [nn.Parameter(pp.to(dtype)) for pp in pps]
210
+ diags = imsizes.float().norm(dim=1)
211
+ min_focals = 0.25 * diags # diag = 1.2~1.4*max(W,H) => beta >= 1/(2*1.2*tan(fov/2)) ~= 0.26
212
+ max_focals = 10 * diags
213
+ log_focals = [nn.Parameter(f.view(1).log().to(dtype)) for f in base_focals]
214
+ assert len(mst[1]) == len(pps) - 1
215
+
216
+ def make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth):
217
+ # make intrinsics
218
+ focals = torch.cat(log_focals).exp().clip(min=min_focals, max=max_focals)
219
+ pps = torch.stack(pps)
220
+ K = torch.eye(3, dtype=dtype, device=device)[None].expand(len(imgs), 3, 3).clone()
221
+ K[:, 0, 0] = K[:, 1, 1] = focals
222
+ K[:, 0:2, 2] = pps * imsizes
223
+ if trans is None:
224
+ return K
225
+
226
+ # safeguard: the optimization always tends to crush the scale down
227
+ sizes = torch.cat(log_sizes).exp()
228
+ global_scaling = 1 / sizes.min()
229
+
230
+ # compute distance of camera to focal plane
231
+ # tan(fov/2) = (W/2) / focal
232
+ z_cameras = sizes * median_depths * focals / base_focals
233
+
234
+ # make extrinsic
235
+ rel_cam2cam = torch.eye(4, dtype=dtype, device=device)[None].expand(len(imgs), 4, 4).clone()
236
+ rel_cam2cam[:, :3, :3] = roma.unitquat_to_rotmat(F.normalize(torch.stack(quats), dim=1))
237
+ rel_cam2cam[:, :3, 3] = torch.stack(trans)
238
+
239
+ # cameras are defined as a kinematic chain
240
+ tmp_cam2w = [None] * len(K)
241
+ tmp_cam2w[mst[0]] = rel_cam2cam[mst[0]]
242
+ for i, j in mst[1]:
243
+ # i is the cam_i_to_world reference, j is the relative pose = cam_j_to_cam_i
244
+ tmp_cam2w[j] = tmp_cam2w[i] @ rel_cam2cam[j]
245
+ tmp_cam2w = torch.stack(tmp_cam2w)
246
+
247
+ # smart reparameterization of cameras
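+ # the optimized translation encodes a point at median scene depth in front of each camera; subtracting R @ trans_offset recovers the actual camera center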
248
+ trans_offset = z_cameras.unsqueeze(1) * torch.cat((imsizes / focals.unsqueeze(1) * (0.5 - pps), ones), dim=-1)
249
+ new_trans = global_scaling * (tmp_cam2w[:, :3, 3:4] - tmp_cam2w[:, :3, :3] @ trans_offset.unsqueeze(-1))
250
+ cam2w = torch.cat((torch.cat((tmp_cam2w[:, :3, :3], new_trans), dim=2),
251
+ vec0001.view(1, 1, 4).expand(len(K), 1, 4)), dim=1)
252
+
253
+ depthmaps = []
254
+ for i in range(len(imgs)):
255
+ core_depth_img = core_depth[i]
256
+ if exp_depth:
257
+ core_depth_img = core_depth_img.exp()
258
+ if lora_depth: # compute core_depth as a low-rank decomposition of 3d points
259
+ core_depth_img = lora_depth_proj[i] @ core_depth_img
260
+ if depth_mode == 'add':
261
+ core_depth_img = z_cameras[i] + (core_depth_img - 1) * (median_depths[i] * sizes[i])
262
+ elif depth_mode == 'mul':
263
+ core_depth_img = z_cameras[i] * core_depth_img
264
+ else:
265
+ raise ValueError(f'Bad {depth_mode=}')
266
+ depthmaps.append(global_scaling * core_depth_img)
267
+
268
+ return K, (inv(cam2w), cam2w), depthmaps
269
+
270
+ K = make_K_cam_depth(log_focals, pps, None, None, None, None)
271
+ print('init focals =', to_numpy(K[:, 0, 0]))
272
+
273
+ # spectral low-rank projection of depthmaps
274
+ if lora_depth:
275
+ core_depth, lora_depth_proj = spectral_projection_of_depthmaps(
276
+ imgs, K, core_depth, subsample, cache_path=cache_path, **lora_depth)
277
+ if exp_depth:
278
+ core_depth = [d.clip(min=1e-4).log() for d in core_depth]
279
+ core_depth = [nn.Parameter(d.ravel().to(dtype)) for d in core_depth]
280
+ log_sizes = [nn.Parameter(torch.zeros(1, dtype=dtype, device=device)) for _ in range(len(imgs))]
281
+
282
+ # Fetch img slices
283
+ _, confs_sum, imgs_slices = corres
284
+
285
+ # Define which pairs are fine to use with matching
286
+ def matching_check(x): return x.max() > matching_conf_thr
287
+ is_matching_ok = {}
288
+ for s in imgs_slices:
289
+ is_matching_ok[s.img1, s.img2] = matching_check(s.confs)
290
+
291
+ # Subsample preds_21
292
+ subsamp_preds_21 = {}
293
+ for imk, imv in preds_21.items():
294
+ subsamp_preds_21[imk] = {}
295
+ for im2k, (pred, conf) in preds_21[imk].items():
296
+ subpred = pred[::subsample, ::subsample].reshape(-1, 3) # original subsample
297
+ subconf = conf[::subsample, ::subsample].ravel() # for both ptmaps and confs
298
+ idxs = anchors[imgs.index(im2k)][1]
299
+ subsamp_preds_21[imk][im2k] = (subpred[idxs], subconf[idxs]) # anchors subsample
300
+
301
+ def loss_dust3r(cam2w, pts3d, pix_loss):
302
+ # In case no correspondences could be established, fall back to the (sparsified) DUSt3R GA regression loss formulation
303
+ loss = 0.
304
+ cf_sum = 0.
305
+ for s in imgs_slices:
306
+ if not is_matching_ok[s.img1, s.img2]:
307
+ # fallback to dust3r regression
308
+ tgt_pts, tgt_confs = subsamp_preds_21[imgs[s.img2]][imgs[s.img1]]
309
+ tgt_pts = geotrf(cam2w[s.img2], tgt_pts)
310
+ cf_sum += tgt_confs.sum()
311
+ loss += tgt_confs @ pix_loss(pts3d[s.img1], tgt_pts)
312
+ return loss / cf_sum if cf_sum != 0. else 0.
313
+
314
+ def loss_3d(K, w2cam, pts3d, pix_loss):
315
+ # For each correspondence, we have two 3D points (one for each image of the pair).
316
+ # The coarse 3D loss directly compares these two point clouds in world space (no reprojection involved)
317
+ if any(v.get('freeze') for v in init.values()):
318
+ pts3d_1 = []
319
+ pts3d_2 = []
320
+ confs = []
321
+ for s in imgs_slices:
322
+ if init[imgs[s.img1]].get('freeze') and init[imgs[s.img2]].get('freeze'):
323
+ continue
324
+ if is_matching_ok[s.img1, s.img2]:
325
+ pts3d_1.append(pts3d[s.img1][s.slice1])
326
+ pts3d_2.append(pts3d[s.img2][s.slice2])
327
+ confs.append(s.confs)
328
+ else:
329
+ pts3d_1 = [pts3d[s.img1][s.slice1] for s in imgs_slices if is_matching_ok[s.img1, s.img2]]
330
+ pts3d_2 = [pts3d[s.img2][s.slice2] for s in imgs_slices if is_matching_ok[s.img1, s.img2]]
331
+ confs = [s.confs for s in imgs_slices if is_matching_ok[s.img1, s.img2]]
332
+
333
+ if pts3d_1 != []:
334
+ confs = torch.cat(confs)
335
+ pts3d_1 = torch.cat(pts3d_1)
336
+ pts3d_2 = torch.cat(pts3d_2)
337
+ loss = confs @ pix_loss(pts3d_1, pts3d_2)
338
+ cf_sum = confs.sum()
339
+ else:
340
+ loss = 0.
341
+ cf_sum = 1.
342
+
343
+ return loss / cf_sum
344
+
345
+ def loss_2d(K, w2cam, pts3d, pix_loss):
346
+ # For each correspondence, we have two 3D points (one for each image of the pair).
347
+ # For each 3D point, we have 2 reproj errors
348
+ proj_matrix = K @ w2cam[:, :3]
349
+ loss = npix = 0
350
+ for img1, pix1, confs, cf_sum, imgs_slices in corres2d:
351
+ if init[imgs[img1]].get('freeze', 0) >= 1:
352
+ continue # no need
353
+ pts3d_in_img1 = [pts3d[img2][slice2] for img2, slice2 in imgs_slices if is_matching_ok[img1, img2]]
354
+ pix1_filtered = []
355
+ confs_filtered = []
356
+ curstep = 0
357
+ for img2, slice2 in imgs_slices:
358
+ if is_matching_ok[img1, img2]:
359
+ tslice = slice(curstep, curstep + slice2.stop - slice2.start, slice2.step)
360
+ pix1_filtered.append(pix1[tslice])
361
+ confs_filtered.append(confs[tslice])
362
+ curstep += slice2.stop - slice2.start
363
+ if pts3d_in_img1 != []:
364
+ pts3d_in_img1 = torch.cat(pts3d_in_img1)
365
+ pix1_filtered = torch.cat(pix1_filtered)
366
+ confs_filtered = torch.cat(confs_filtered)
367
+ loss += confs_filtered @ pix_loss(pix1_filtered, reproj2d(proj_matrix[img1], pts3d_in_img1))
368
+ npix += confs_filtered.sum()
369
+ return loss / npix if npix != 0 else 0.
370
+
371
+ def optimize_loop(loss_func, lr_base, niter, pix_loss, lr_end=0):
372
+ # create optimizer
373
+ params = pps + log_focals + quats + trans + log_sizes + core_depth
374
+ optimizer = torch.optim.Adam(params, lr=1, weight_decay=0, betas=(0.9, 0.9))
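+ # lr=1 is a placeholder: the effective learning rate is set at every iteration by adjust_learning_rate_by_lr below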
375
+ ploss = pix_loss if 'meta' in repr(pix_loss) else (lambda a: pix_loss)
376
+
377
+ with tqdm(total=niter) as bar:
378
+ for iter in range(niter or 1):
379
+ K, (w2cam, cam2w), depthmaps = make_K_cam_depth(log_focals, pps, trans, quats, log_sizes, core_depth)
380
+ pts3d = make_pts3d(anchors, K, cam2w, depthmaps, base_focals=base_focals)
381
+ if niter == 0:
382
+ break
383
+
384
+ alpha = (iter / niter)
385
+ lr = schedule(alpha, lr_base, lr_end)
386
+ adjust_learning_rate_by_lr(optimizer, lr)
387
+ pix_loss = ploss(1 - alpha)
388
+ optimizer.zero_grad()
389
+ loss = loss_func(K, w2cam, pts3d, pix_loss) + loss_dust3r_w * loss_dust3r(cam2w, pts3d, lossd)
390
+ loss.backward()
391
+ optimizer.step()
392
+
393
+ # make sure the pose remains well optimizable
394
+ for i in range(len(imgs)):
395
+ quats[i].data[:] /= quats[i].data.norm()
396
+
397
+ loss = float(loss)
398
+ if loss != loss:
399
+ break # NaN loss
400
+ bar.set_postfix_str(f'{lr=:.4f}, {loss=:.3f}')
401
+ bar.update(1)
402
+
403
+ if niter:
404
+ print(f'>> final loss = {loss}')
405
+ return dict(intrinsics=K.detach(), cam2w=cam2w.detach(),
406
+ depthmaps=[d.detach() for d in depthmaps], pts3d=[p.detach() for p in pts3d])
407
+
408
+ # at start, don't optimize 3d points
409
+ for i, img in enumerate(imgs):
410
+ trainable = not (init[img].get('freeze'))
411
+ pps[i].requires_grad_(False)
412
+ log_focals[i].requires_grad_(False)
413
+ quats[i].requires_grad_(trainable)
414
+ trans[i].requires_grad_(trainable)
415
+ log_sizes[i].requires_grad_(trainable)
416
+ core_depth[i].requires_grad_(False)
417
+
418
+ res_coarse = optimize_loop(loss_3d, lr_base=lr1, niter=niter1, pix_loss=loss1)
419
+
420
+ res_fine = None
421
+ if niter2:
422
+ # now we can optimize 3d points
423
+ for i, img in enumerate(imgs):
424
+ if init[img].get('freeze', 0) >= 1:
425
+ continue
426
+ pps[i].requires_grad_(bool(opt_pp))
427
+ log_focals[i].requires_grad_(True)
428
+ core_depth[i].requires_grad_(opt_depth)
429
+
430
+ # refinement with 2d reproj
431
+ res_fine = optimize_loop(loss_2d, lr_base=lr2, niter=niter2, pix_loss=loss2)
432
+
433
+ return imgs, res_coarse, res_fine
434
+
435
+
436
+ @lru_cache
437
+ def mask110(device, dtype):
438
+ return torch.tensor((1, 1, 0), device=device, dtype=dtype)
439
+
440
+
441
+ def proj3d(inv_K, pixels, z):
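+ # back-projects pixels to 3D camera coordinates: X = z * K^-1 @ (u, v, 1), assuming a zero-skew intrinsic matrix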
442
+ if pixels.shape[-1] == 2:
443
+ pixels = torch.cat((pixels, torch.ones_like(pixels[..., :1])), dim=-1)
444
+ return z.unsqueeze(-1) * (pixels * inv_K.diag() + inv_K[:, 2] * mask110(z.device, z.dtype))
445
+
446
+
447
+ def make_pts3d(anchors, K, cam2w, depthmaps, base_focals=None, ret_depth=False):
448
+ focals = K[:, 0, 0]
449
+ invK = inv(K)
450
+ all_pts3d = []
451
+ depth_out = []
452
+
453
+ for img, (pixels, idxs, offsets) in anchors.items():
454
+ # from depthmaps to 3d points
455
+ if base_focals is None:
456
+ pass
457
+ else:
458
+ # compensate for focal
459
+ # depth + depth * (offset - 1) * base_focal / focal
460
+ # = depth * (1 + (offset - 1) * (base_focal / focal))
461
+ offsets = 1 + (offsets - 1) * (base_focals[img] / focals[img])
462
+
463
+ pts3d = proj3d(invK[img], pixels, depthmaps[img][idxs] * offsets)
464
+ if ret_depth:
465
+ depth_out.append(pts3d[..., 2]) # before camera rotation
466
+
467
+ # rotate to world coordinate
468
+ pts3d = geotrf(cam2w[img], pts3d)
469
+ all_pts3d.append(pts3d)
470
+
471
+ if ret_depth:
472
+ return all_pts3d, depth_out
473
+ return all_pts3d
474
+
475
+
476
+ def make_dense_pts3d(intrinsics, cam2w, depthmaps, canonical_paths, subsample, device='cuda'):
477
+ base_focals = []
478
+ anchors = {}
479
+ confs = []
480
+ for i, canon_path in enumerate(canonical_paths):
481
+ (canon, canon2, conf), focal = torch.load(canon_path, map_location=device)
482
+ confs.append(conf)
483
+ base_focals.append(focal)
484
+ H, W = conf.shape
485
+ pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device)
486
+ idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample)
487
+ anchors[i] = (pixels, idxs[i], offsets[i])
488
+
489
+ # densify sparse depthmaps
490
+ pts3d, depthmaps_out = make_pts3d(anchors, intrinsics, cam2w, [
491
+ d.ravel() for d in depthmaps], base_focals=base_focals, ret_depth=True)
492
+
493
+ return pts3d, depthmaps_out, confs
494
+
495
+
496
+ @torch.no_grad()
497
+ def forward_mast3r(pairs, model, cache_path, desc_conf='desc_conf',
498
+ device='cuda', subsample=8, **matching_kw):
499
+ res_paths = {}
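+ # every pair is processed once: predictions and correspondences are cached on disk, keyed by md5 hashes of the image instances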
500
+
501
+ for img1, img2 in tqdm(pairs):
502
+ idx1 = hash_md5(img1['instance'])
503
+ idx2 = hash_md5(img2['instance'])
504
+
505
+ path1 = cache_path + f'/forward/{idx1}/{idx2}.pth'
506
+ path2 = cache_path + f'/forward/{idx2}/{idx1}.pth'
507
+ path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx1}-{idx2}.pth'
508
+ path_corres2 = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{idx2}-{idx1}.pth'
509
+
510
+ if os.path.isfile(path_corres2) and not os.path.isfile(path_corres):
511
+ score, (xy1, xy2, confs) = torch.load(path_corres2)
512
+ torch.save((score, (xy2, xy1, confs)), path_corres)
513
+
514
+ if not all(os.path.isfile(p) for p in (path1, path2, path_corres)):
515
+ if model is None:
516
+ continue
517
+ res = symmetric_inference(model, img1, img2, device=device)
518
+ X11, X21, X22, X12 = [r['pts3d'][0] for r in res]
519
+ C11, C21, C22, C12 = [r['conf'][0] for r in res]
520
+ descs = [r['desc'][0] for r in res]
521
+ qonfs = [r[desc_conf][0] for r in res]
522
+
523
+ # save
524
+ torch.save(to_cpu((X11, C11, X21, C21)), mkdir_for(path1))
525
+ torch.save(to_cpu((X22, C22, X12, C12)), mkdir_for(path2))
526
+
527
+ # perform reciprocal matching
528
+ corres = extract_correspondences(descs, qonfs, device=device, subsample=subsample)
529
+
530
+ conf_score = (C11.mean() * C12.mean() * C21.mean() * C22.mean()).sqrt().sqrt()
531
+ matching_score = (float(conf_score), float(corres[2].sum()), len(corres[2]))
532
+ if cache_path is not None:
533
+ torch.save((matching_score, corres), mkdir_for(path_corres))
534
+
535
+ res_paths[img1['instance'], img2['instance']] = (path1, path2), path_corres
536
+
537
+ del model
538
+ torch.cuda.empty_cache()
539
+
540
+ return res_paths, cache_path
541
+
542
+
543
+ def symmetric_inference(model, img1, img2, device):
544
+ shape1 = torch.from_numpy(img1['true_shape']).to(device, non_blocking=True)
545
+ shape2 = torch.from_numpy(img2['true_shape']).to(device, non_blocking=True)
546
+ img1 = img1['img'].to(device, non_blocking=True)
547
+ img2 = img2['img'].to(device, non_blocking=True)
548
+
549
+ # compute encoder only once
550
+ feat1, feat2, pos1, pos2 = model._encode_image_pairs(img1, img2, shape1, shape2)
551
+
552
+ def decoder(feat1, feat2, pos1, pos2, shape1, shape2):
553
+ dec1, dec2 = model._decoder(feat1, pos1, feat2, pos2)
554
+ with torch.cuda.amp.autocast(enabled=False):
555
+ res1 = model._downstream_head(1, [tok.float() for tok in dec1], shape1)
556
+ res2 = model._downstream_head(2, [tok.float() for tok in dec2], shape2)
557
+ return res1, res2
558
+
559
+ # decoder 1-2
560
+ res11, res21 = decoder(feat1, feat2, pos1, pos2, shape1, shape2)
561
+ # decoder 2-1
562
+ res22, res12 = decoder(feat2, feat1, pos2, pos1, shape2, shape1)
563
+
564
+ return (res11, res21, res22, res12)
565
+
566
+
567
+ def extract_correspondences(feats, qonfs, subsample=8, device=None, ptmap_key='pred_desc'):
568
+ feat11, feat21, feat22, feat12 = feats
569
+ qonf11, qonf21, qonf22, qonf12 = qonfs
570
+ assert feat11.shape[:2] == feat12.shape[:2] == qonf11.shape == qonf12.shape
571
+ assert feat21.shape[:2] == feat22.shape[:2] == qonf21.shape == qonf22.shape
572
+
573
+ if '3d' in ptmap_key:
574
+ opt = dict(device='cpu', workers=32)
575
+ else:
576
+ opt = dict(device=device, dist='dot', block_size=2**13)
577
+
578
+ # matching the two pairs
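+ # reciprocal nearest-neighbour matches are extracted in both directions and concatenated, for each of the two symmetric predictions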
579
+ idx1 = []
580
+ idx2 = []
581
+ qonf1 = []
582
+ qonf2 = []
583
+ # TODO add non symmetric / pixel_tol options
584
+ for A, B, QA, QB in [(feat11, feat21, qonf11.cpu(), qonf21.cpu()),
585
+ (feat12, feat22, qonf12.cpu(), qonf22.cpu())]:
586
+ nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt)
587
+ nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt)
588
+
589
+ idx1.append(np.r_[nn1to2[0], nn2to1[1]])
590
+ idx2.append(np.r_[nn1to2[1], nn2to1[0]])
591
+ qonf1.append(QA.ravel()[idx1[-1]])
592
+ qonf2.append(QB.ravel()[idx2[-1]])
593
+
594
+ # merge corres from opposite pairs
595
+ H1, W1 = feat11.shape[:2]
596
+ H2, W2 = feat22.shape[:2]
597
+ cat = np.concatenate
598
+
599
+ xy1, xy2, idx = merge_corres(cat(idx1), cat(idx2), (H1, W1), (H2, W2), ret_xy=True, ret_index=True)
600
+ corres = (xy1.copy(), xy2.copy(), np.sqrt(cat(qonf1)[idx] * cat(qonf2)[idx]))
601
+
602
+ return todevice(corres, device)
603
+
604
+
605
+ @torch.no_grad()
606
+ def prepare_canonical_data(imgs, tmp_pairs, subsample, order_imgs=False, min_conf_thr=0,
607
+ cache_path=None, device='cuda', **kw):
608
+ canonical_views = {}
609
+ pairwise_scores = torch.zeros((len(imgs), len(imgs)), device=device)
610
+ canonical_paths = []
611
+ preds_21 = {}
612
+
613
+ for img in tqdm(imgs):
614
+ if cache_path:
615
+ cache = os.path.join(cache_path, 'canon_views', hash_md5(img) + f'_{subsample=}_{kw=}.pth')
616
+ canonical_paths.append(cache)
617
+ try:
618
+ (canon, canon2, cconf), focal = torch.load(cache, map_location=device)
619
+ except IOError:
620
+ # cache does not exist yet, we create it!
621
+ canon = focal = None
622
+
623
+ # collect all pred1
624
+ n_pairs = sum((img in pair) for pair in tmp_pairs)
625
+
626
+ ptmaps11 = None
627
+ pixels = {}
628
+ n = 0
629
+ for (img1, img2), ((path1, path2), path_corres) in tmp_pairs.items():
630
+ score = None
631
+ if img == img1:
632
+ X, C, X2, C2 = torch.load(path1, map_location=device)
633
+ score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr)
634
+ pixels[img2] = xy1, confs
635
+ if img not in preds_21:
636
+ preds_21[img] = {}
637
+ preds_21[img][img2] = X2, C2
638
+
639
+ if img == img2:
640
+ X, C, X2, C2 = torch.load(path2, map_location=device)
641
+ score, (xy1, xy2, confs) = load_corres(path_corres, device, min_conf_thr)
642
+ pixels[img1] = xy2, confs
643
+ if img not in preds_21:
644
+ preds_21[img] = {}
645
+ preds_21[img][img1] = X2, C2
646
+
647
+ if score is not None:
648
+ i, j = imgs.index(img1), imgs.index(img2)
649
+ # score = score[0]
650
+ # score = np.log1p(score[2])
651
+ score = score[2]
652
+ pairwise_scores[i, j] = score
653
+ pairwise_scores[j, i] = score
654
+
655
+ if canon is not None:
656
+ continue
657
+ if ptmaps11 is None:
658
+ H, W = C.shape
659
+ ptmaps11 = torch.empty((n_pairs, H, W, 3), device=device)
660
+ confs11 = torch.empty((n_pairs, H, W), device=device)
661
+
662
+ ptmaps11[n] = X
663
+ confs11[n] = C
664
+ n += 1
665
+
666
+ if canon is None:
667
+ canon, canon2, cconf = canonical_view(ptmaps11, confs11, subsample, **kw)
668
+ del ptmaps11
669
+ del confs11
670
+
671
+ # compute focals
672
+ H, W = canon.shape[:2]
673
+ pp = torch.tensor([W / 2, H / 2], device=device)
674
+ if focal is None:
675
+ focal = estimate_focal_knowing_depth(canon[None], pp, focal_mode='weiszfeld', min_focal=0.5, max_focal=3.5)
676
+ if cache:
677
+ torch.save(to_cpu(((canon, canon2, cconf), focal)), mkdir_for(cache))
678
+
679
+ # extract depth offsets with correspondences
680
+ core_depth = canon[subsample // 2::subsample, subsample // 2::subsample, 2]
681
+ idxs, offsets = anchor_depth_offsets(canon2, pixels, subsample=subsample)
682
+
683
+ canonical_views[img] = (pp, (H, W), focal.view(1), core_depth, pixels, idxs, offsets)
684
+
685
+ return tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21
686
+
687
+
688
+ def load_corres(path_corres, device, min_conf_thr):
689
+ score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device)
690
+ valid = confs > min_conf_thr if min_conf_thr else slice(None)
691
+ # valid = (xy1 > 0).all(dim=1) & (xy2 > 0).all(dim=1) & (xy1 < 512).all(dim=1) & (xy2 < 512).all(dim=1)
692
+ # print(f'keeping {valid.sum()} / {len(valid)} correspondences')
693
+ return score, (xy1[valid], xy2[valid], confs[valid])
694
+
695
+
696
+ PairOfSlices = namedtuple(
697
+ 'PairOfSlices', 'img1, slice1, pix1, anchor_idxs1, img2, slice2, pix2, anchor_idxs2, confs, confs_sum')
698
+
699
+
700
+ def condense_data(imgs, tmp_paths, canonical_views, dtype=torch.float32):
701
+ # aggregate all data properly
702
+ set_imgs = set(imgs)
703
+
704
+ principal_points = []
705
+ shapes = []
706
+ focals = []
707
+ core_depth = []
708
+ img_anchors = {}
709
+ tmp_pixels = {}
710
+
711
+ for idx1, img1 in enumerate(imgs):
712
+ # load stuff
713
+ pp, shape, focal, anchors, pixels_confs, idxs, offsets = canonical_views[img1]
714
+
715
+ principal_points.append(pp)
716
+ shapes.append(shape)
717
+ focals.append(focal)
718
+ core_depth.append(anchors)
719
+
720
+ img_uv1 = []
721
+ img_idxs = []
722
+ img_offs = []
723
+ cur_n = [0]
724
+
725
+ for img2, (pixels, match_confs) in pixels_confs.items():
726
+ if img2 not in set_imgs:
727
+ continue
728
+ assert len(pixels) == len(idxs[img2]) == len(offsets[img2])
729
+ img_uv1.append(torch.cat((pixels, torch.ones_like(pixels[:, :1])), dim=-1))
730
+ img_idxs.append(idxs[img2])
731
+ img_offs.append(offsets[img2])
732
+ cur_n.append(cur_n[-1] + len(pixels))
733
+ # store the position of 3d points
734
+ tmp_pixels[img1, img2] = pixels.to(dtype), match_confs.to(dtype), slice(*cur_n[-2:])
735
+ img_anchors[idx1] = (torch.cat(img_uv1), torch.cat(img_idxs), torch.cat(img_offs))
736
+
737
+ all_confs = []
738
+ imgs_slices = []
739
+ corres2d = {img: [] for img in range(len(imgs))}
740
+
741
+ for img1, img2 in tmp_paths:
742
+ try:
743
+ pix1, confs1, slice1 = tmp_pixels[img1, img2]
744
+ pix2, confs2, slice2 = tmp_pixels[img2, img1]
745
+ except KeyError:
746
+ continue
747
+ img1 = imgs.index(img1)
748
+ img2 = imgs.index(img2)
749
+ confs = (confs1 * confs2).sqrt()
750
+
751
+ # prepare for loss_3d
752
+ all_confs.append(confs)
753
+ anchor_idxs1 = canonical_views[imgs[img1]][5][imgs[img2]]
754
+ anchor_idxs2 = canonical_views[imgs[img2]][5][imgs[img1]]
755
+ imgs_slices.append(PairOfSlices(img1, slice1, pix1, anchor_idxs1,
756
+ img2, slice2, pix2, anchor_idxs2,
757
+ confs, float(confs.sum())))
758
+
759
+ # prepare for loss_2d
760
+ corres2d[img1].append((pix1, confs, img2, slice2))
761
+ corres2d[img2].append((pix2, confs, img1, slice1))
762
+
763
+ all_confs = torch.cat(all_confs)
764
+ corres = (all_confs, float(all_confs.sum()), imgs_slices)
765
+
766
+ def aggreg_matches(img1, list_matches):
767
+ pix1, confs, img2, slice2 = zip(*list_matches)
768
+ all_pix1 = torch.cat(pix1).to(dtype)
769
+ all_confs = torch.cat(confs).to(dtype)
770
+ return img1, all_pix1, all_confs, float(all_confs.sum()), [(j, sl2) for j, sl2 in zip(img2, slice2)]
771
+ corres2d = [aggreg_matches(img, m) for img, m in corres2d.items()]
772
+
773
+ imsizes = torch.tensor([(W, H) for H, W in shapes], device=pp.device) # (W,H)
774
+ principal_points = torch.stack(principal_points)
775
+ focals = torch.cat(focals)
776
+ return imsizes, principal_points, focals, core_depth, img_anchors, corres, corres2d
777
+
778
+
779
+ def canonical_view(ptmaps11, confs11, subsample, mode='avg-angle'):
780
+ assert len(ptmaps11) == len(confs11) > 0, 'not a single view1 for this image'
781
+
782
+ # canonical pointmap is just a weighted average
783
+ confs11 = confs11.unsqueeze(-1) - 0.999
784
+ canon = (confs11 * ptmaps11).sum(0) / confs11.sum(0)
785
+
786
+ canon_depth = ptmaps11[..., 2].unsqueeze(1)
787
+ S = slice(subsample // 2, None, subsample)
788
+ center_depth = canon_depth[:, :, S, S]
789
+ assert (center_depth > 0).all()
790
+ stacked_depth = F.pixel_unshuffle(canon_depth, subsample)
791
+ stacked_confs = F.pixel_unshuffle(confs11[:, None, :, :, 0], subsample)
792
+
793
+ if mode == 'avg-reldepth':
794
+ rel_depth = stacked_depth / center_depth
795
+ stacked_canon = (stacked_confs * rel_depth).sum(dim=0) / stacked_confs.sum(dim=0)
796
+ canon2 = F.pixel_shuffle(stacked_canon.unsqueeze(0), subsample).squeeze()
797
+
798
+ elif mode == 'avg-angle':
799
+ xy = ptmaps11[..., 0:2].permute(0, 3, 1, 2)
800
+ stacked_xy = F.pixel_unshuffle(xy, subsample)
801
+ B, _, H, W = stacked_xy.shape
802
+ stacked_radius = (stacked_xy.view(B, 2, -1, H, W) - xy[:, :, None, S, S]).norm(dim=1)
803
+ stacked_radius.clip_(min=1e-8)
804
+
805
+ stacked_angle = torch.arctan((stacked_depth - center_depth) / stacked_radius)
806
+ avg_angle = (stacked_confs * stacked_angle).sum(dim=0) / stacked_confs.sum(dim=0)
807
+
808
+ # back to depth
809
+ stacked_depth = stacked_radius.mean(dim=0) * torch.tan(avg_angle)
810
+
811
+ canon2 = F.pixel_shuffle((1 + stacked_depth / canon[S, S, 2]).unsqueeze(0), subsample).squeeze()
812
+ else:
813
+ raise ValueError(f'bad {mode=}')
814
+
815
+ confs = (confs11.square().sum(dim=0) / confs11.sum(dim=0)).squeeze()
816
+ return canon, canon2, confs
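canonical_view relies on F.pixel_unshuffle to move each subsample x subsample block into the channel dimension, so per-block statistics around every anchor reduce to simple tensor reductions. A shape sketch with toy depth maps:

import torch
import torch.nn.functional as F

subsample = 4
depth = torch.rand(2, 1, 16, 16)               # [B, 1, H, W] toy depth maps
stacked = F.pixel_unshuffle(depth, subsample)  # [B, subsample**2, H/4, W/4]
print(stacked.shape)                           # torch.Size([2, 16, 4, 4])

# pixel_shuffle inverts the op, which is how canon2 is re-assembled above
assert torch.equal(F.pixel_shuffle(stacked, subsample), depth)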
817
+
818
+
819
+ def anchor_depth_offsets(canon_depth, pixels, subsample=8):
820
+ device = canon_depth.device
821
+
822
+ # create a 2D grid of anchor 3D points
823
+ H1, W1 = canon_depth.shape
824
+ yx = np.mgrid[subsample // 2:H1:subsample, subsample // 2:W1:subsample]
825
+ H2, W2 = yx.shape[1:]
826
+ cy, cx = yx.reshape(2, -1)
827
+ core_depth = canon_depth[cy, cx]
828
+ assert (core_depth > 0).all()
829
+
830
+ # slave 3d points (attached to core 3d points)
831
+ core_idxs = {} # core_idxs[img2] = {corr_idx:core_idx}
832
+ core_offs = {} # core_offs[img2] = {corr_idx:3d_offset}
833
+
834
+ for img2, (xy1, _confs) in pixels.items():
835
+ px, py = xy1.long().T
836
+
837
+ # find nearest anchor == block quantization
838
+ core_idx = (py // subsample) * W2 + (px // subsample)
839
+ core_idxs[img2] = core_idx.to(device)
840
+
841
+ # compute relative depth offsets w.r.t. anchors
842
+ ref_z = core_depth[core_idx]
843
+ pts_z = canon_depth[py, px]
844
+ offset = pts_z / ref_z
845
+ core_offs[img2] = offset.detach().to(device)
846
+
847
+ return core_idxs, core_offs
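The anchor index is a simple block quantization: each correspondence pixel is attached to the anchor of the subsample x subsample cell it falls into. A toy check with illustrative values:

import numpy as np

subsample = 8
W1 = 48                          # canonical depthmap width
W2 = W1 // subsample             # anchors per row
px = np.array([5, 20, 40])       # correspondence pixel coordinates
py = np.array([3, 17, 30])
core_idx = (py // subsample) * W2 + (px // subsample)
print(core_idx)                  # flat anchor indices: [ 0 14 23]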
848
+
849
+
850
+ def spectral_clustering(graph, k=None, normalized_cuts=False):
851
+ graph.fill_diagonal_(0)
852
+
853
+ # graph laplacian
854
+ degrees = graph.sum(dim=-1)
855
+ laplacian = torch.diag(degrees) - graph
856
+ if normalized_cuts:
857
+ i_inv = torch.diag(degrees.sqrt().reciprocal())
858
+ laplacian = i_inv @ laplacian @ i_inv
859
+
860
+ # compute eigenvectors!
861
+ eigval, eigvec = torch.linalg.eigh(laplacian)
862
+ return eigval[:k], eigvec[:, :k]
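A quick sanity check, assuming the spectral_clustering defined above is in scope: a graph made of two disconnected pairs has two zero eigenvalues, and the associated eigenvectors are constant within each connected component.

import torch

graph = torch.zeros(4, 4)
graph[0, 1] = graph[1, 0] = 1.  # component {0, 1}
graph[2, 3] = graph[3, 2] = 1.  # component {2, 3}

eigval, eigvec = spectral_clustering(graph, k=2)
print(eigval)   # ~[0., 0.]: one zero eigenvalue per connected component
print(eigvec)   # each column is constant inside each component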
863
+
864
+
865
+ def sim_func(p1, p2, gamma):
866
+ diff = (p1 - p2).norm(dim=-1)
867
+ avg_depth = (p1[:, :, 2] + p2[:, :, 2])
868
+ rel_distance = diff / avg_depth
869
+ sim = torch.exp(-gamma * rel_distance.square())
870
+ return sim
871
+
872
+
873
+ def backproj(K, depthmap, subsample):
874
+ H, W = depthmap.shape
875
+ uv = np.mgrid[subsample // 2:subsample * W:subsample, subsample // 2:subsample * H:subsample].T.reshape(H, W, 2)
876
+ xyz = depthmap.unsqueeze(-1) * geotrf(inv(K), todevice(uv, K.device), ncol=3)
877
+ return xyz
878
+
879
+
880
+ def spectral_projection_depth(K, depthmap, subsample, k=64, cache_path='',
881
+ normalized_cuts=True, gamma=7, min_norm=5):
882
+ try:
883
+ if cache_path:
884
+ cache_path = cache_path + f'_{k=}_norm={normalized_cuts}_{gamma=}.pth'
885
+ lora_proj = torch.load(cache_path, map_location=K.device)
886
+
887
+ except IOError:
888
+ # reconstruct 3d points in camera coordinates
889
+ xyz = backproj(K, depthmap, subsample)
890
+
891
+ # compute all distances
892
+ xyz = xyz.reshape(-1, 3)
893
+ graph = sim_func(xyz[:, None], xyz[None, :], gamma=gamma)
894
+ _, lora_proj = spectral_clustering(graph, k, normalized_cuts=normalized_cuts)
895
+
896
+ if cache_path:
897
+ torch.save(lora_proj.cpu(), mkdir_for(cache_path))
898
+
899
+ lora_proj, coeffs = lora_encode_normed(lora_proj, depthmap.ravel(), min_norm=min_norm)
900
+
901
+ # depthmap ~= lora_proj @ coeffs
902
+ return coeffs, lora_proj
903
+
904
+
905
+ def lora_encode_normed(lora_proj, x, min_norm, global_norm=False):
906
+ # encode the pointmap
907
+ coeffs = torch.linalg.pinv(lora_proj) @ x
908
+
909
+ # rectify the norm of basis vector to be ~ equal
910
+ if coeffs.ndim == 1:
911
+ coeffs = coeffs[:, None]
912
+ if global_norm:
913
+ lora_proj *= coeffs[1:].norm() * min_norm / coeffs.shape[1]
914
+ elif min_norm:
915
+ lora_proj *= coeffs.norm(dim=1).clip(min=min_norm)
916
+ # can have rounding errors here!
917
+ coeffs = (torch.linalg.pinv(lora_proj.double()) @ x.double()).float()
918
+
919
+ return lora_proj.detach(), coeffs.detach()
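The encoding is an ordinary least-squares projection: coefficients come from the pseudo-inverse of the basis, so any signal lying in the basis span round-trips exactly. A self-contained check (the orthonormal basis here is illustrative):

import torch

k, n = 8, 256
basis = torch.linalg.qr(torch.randn(n, k)).Q   # orthonormal columns, [n, k]
depth = basis @ torch.randn(k)                 # a signal inside the basis span

coeffs = torch.linalg.pinv(basis) @ depth      # encode, as in lora_encode_normed
recon = basis @ coeffs                         # decode: depth ~= basis @ coeffs
print(torch.allclose(recon, depth, atol=1e-5)) # True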
920
+
921
+
922
+ @torch.no_grad()
923
+ def spectral_projection_of_depthmaps(imgs, intrinsics, depthmaps, subsample, cache_path=None, **kw):
924
+ # recover 3d points
925
+ core_depth = []
926
+ lora_proj = []
927
+
928
+ for i, img in enumerate(tqdm(imgs)):
929
+ cache = os.path.join(cache_path, 'lora_depth', hash_md5(img)) if cache_path else None
930
+ depth, proj = spectral_projection_depth(intrinsics[i], depthmaps[i], subsample,
931
+ cache_path=cache, **kw)
932
+ core_depth.append(depth)
933
+ lora_proj.append(proj)
934
+
935
+ return core_depth, lora_proj
936
+
937
+
938
+ def reproj2d(Trf, pts3d):
939
+ res = (pts3d @ Trf[:3, :3].transpose(-1, -2)) + Trf[:3, 3]
940
+ clipped_z = res[:, 2:3].clip(min=1e-3) # make sure we don't have nans!
941
+ uv = res[:, 0:2] / clipped_z
942
+ return uv.clip(min=-1000, max=2000)
943
+
944
+
945
+ def bfs(tree, start_node):
946
+ order, predecessors = sp.csgraph.breadth_first_order(tree, start_node, directed=False)
947
+ ranks = np.arange(len(order))
948
+ ranks[order] = ranks.copy()
949
+ return ranks, predecessors
950
+
951
+
952
+ def compute_min_spanning_tree(pws):
953
+ sparse_graph = sp.dok_array(pws.shape)
954
+ for i, j in pws.nonzero().cpu().tolist():
955
+ sparse_graph[i, j] = -float(pws[i, j])
956
+ msp = sp.csgraph.minimum_spanning_tree(sparse_graph)
957
+
958
+ # now reorder the oriented edges, starting from the central point
959
+ ranks1, _ = bfs(msp, 0)
960
+ ranks2, _ = bfs(msp, ranks1.argmax())
961
+ ranks1, _ = bfs(msp, ranks2.argmax())
962
+ # this is the point farthest from any leaf
963
+ root = np.minimum(ranks1, ranks2).argmax()
964
+
965
+ # find the ordered list of edges that describe the tree
966
+ order, predecessors = sp.csgraph.breadth_first_order(msp, root, directed=False)
967
+ order = order[1:] # skip the root, which has no predecessor
968
+ edges = [(predecessors[i], i) for i in order]
969
+
970
+ return root, edges
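compute_min_spanning_tree negates the scores, so the minimum spanning tree over costs is the maximum-score spanning tree over pairwise similarities, rooted at the node farthest from any leaf. A toy run, assuming the functions above are in scope (exact edge order may vary):

import torch

pws = torch.tensor([[0., 5., 1.],
                    [5., 0., 4.],
                    [1., 4., 0.]])  # symmetric pairwise scores
root, edges = compute_min_spanning_tree(pws)
print(root, edges)  # e.g. 1 [(1, 0), (1, 2)] -- keeps the two strongest edges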
971
+
972
+
973
+ def show_reconstruction(shapes_or_imgs, K, cam2w, pts3d, gt_cam2w=None, gt_K=None, cam_size=None, masks=None, **kw):
974
+ viz = SceneViz()
975
+
976
+ cc = cam2w[:, :3, 3]
977
+ cs = cam_size or float(torch.cdist(cc, cc).fill_diagonal_(np.inf).min(dim=0).values.median())
978
+ colors = 64 + np.random.randint(255 - 64, size=(len(cam2w), 3))
979
+
980
+ if isinstance(shapes_or_imgs, np.ndarray) and shapes_or_imgs.ndim == 2:
981
+ cam_kws = dict(imsizes=shapes_or_imgs[:, ::-1], cam_size=cs)
982
+ else:
983
+ imgs = shapes_or_imgs
984
+ cam_kws = dict(images=imgs, cam_size=cs)
985
+ if K is not None:
986
+ viz.add_cameras(to_numpy(cam2w), to_numpy(K), colors=colors, **cam_kws)
987
+
988
+ if gt_cam2w is not None:
989
+ if gt_K is None:
990
+ gt_K = K
991
+ viz.add_cameras(to_numpy(gt_cam2w), to_numpy(gt_K), colors=colors, marker='o', **cam_kws)
992
+
993
+ if pts3d is not None:
994
+ for i, p in enumerate(pts3d):
995
+ if not len(p):
996
+ continue
997
+ if masks is None:
998
+ viz.add_pointcloud(to_numpy(p), color=tuple(colors[i].tolist()))
999
+ else:
1000
+ viz.add_pointcloud(to_numpy(p), mask=masks[i], color=imgs[i])
1001
+ viz.show(**kw)
mast3r/cloud_opt/triangulation.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Matches Triangulation Utils
6
+ # --------------------------------------------------------
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ # Batched Matches Triangulation
12
+ def batched_triangulate(pts2d, # [B, Ncams, Npts, 2]
13
+ proj_mats): # [B, Ncams, 3, 4] I@E projection matrix
14
+ B, Ncams, Npts, two = pts2d.shape
15
+ assert two==2
16
+ assert proj_mats.shape == (B, Ncams, 3, 4)
17
+ # P - xP
18
+ x = proj_mats[...,0,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,0], proj_mats[...,2,:]) # [B, Ncams, Npts, 4]
19
+ y = proj_mats[...,1,:][...,None,:] - torch.einsum('bij,bik->bijk', pts2d[...,1], proj_mats[...,2,:]) # [B, Ncams, Npts, 4]
20
+ eq = torch.cat([x, y], dim=1).transpose(1, 2) # [B, Npts, 2xNcams, 4]
21
+ return torch.linalg.lstsq(eq[...,:3], -eq[...,3]).solution
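batched_triangulate is a batched DLT: for each 2D observation it stacks the two linear constraints P[0,:] - u*P[2,:] and P[1,:] - v*P[2,:] and solves the resulting system in the least-squares sense. A synthetic two-camera check, assuming the function above is in scope (identity intrinsics, all values illustrative):

import torch

P1 = torch.eye(3, 4)                     # reference camera at the origin
P2 = torch.eye(3, 4)
P2[0, 3] = -1.                           # second camera translated to x = +1
proj_mats = torch.stack([P1, P2])[None]  # [1, 2, 3, 4]

pts2d = torch.tensor([[[[0.0, 0.0]],     # observation in camera 1
                       [[-0.5, 0.0]]]])  # observation in camera 2: [1, 2, 1, 2]

print(batched_triangulate(pts2d, proj_mats))  # ~[[[0., 0., 2.]]], the true 3D point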
22
+
23
+ def matches_to_depths(intrinsics, # input camera intrinsics [B, Ncams, 3, 3]
24
+ extrinsics, # input camera extrinsics [B, Ncams, 3, 4]
25
+ matches, # input correspondences [B, Ncams, Npts, 2]
26
+ batchsize=16, # bs for batched processing
27
+ min_num_valids_ratio=.3 # at least this ratio of image pairs need to predict a match for a given pixel of img1
28
+ ):
29
+ B, Nv, H, W, five = matches.shape
30
+ min_num_valids = np.floor(Nv*min_num_valids_ratio)
31
+ out_aggregated_points, out_depths, out_confs = [], [], []
32
+ for b in range(B//batchsize+1): # batched processing
33
+ start, stop = b * batchsize, min(B, (b + 1) * batchsize)
34
+ sub_batch = slice(start, stop)
35
+ sub_batchsize = stop - start
36
+ if sub_batchsize == 0: continue
37
+ points1, points2, confs = matches[sub_batch, ..., :2], matches[sub_batch, ..., 2:4], matches[sub_batch, ..., -1]
38
+ allpoints = torch.cat([points1.view([sub_batchsize*Nv,1,H*W,2]), points2.view([sub_batchsize*Nv,1,H*W,2])],dim=1) # [BxNv, 2, HxW, 2]
39
+
40
+ allcam_Ps = intrinsics[sub_batch] @ extrinsics[sub_batch,:,:3,:]
41
+ cam_Ps1, cam_Ps2 = allcam_Ps[:,[0]].repeat([1,Nv,1,1]), allcam_Ps[:,1:] # [B, Nv, 3, 4]
42
+ formatted_camPs = torch.cat([cam_Ps1.reshape([sub_batchsize*Nv,1,3,4]), cam_Ps2.reshape([sub_batchsize*Nv,1,3,4])],dim=1) # [BxNv, 2, 3, 4]
43
+
44
+ # Triangulate matches to 3D
45
+ points_3d_world = batched_triangulate(allpoints, formatted_camPs) # [BxNv, HxW, three]
46
+
47
+ # Aggregate pairwise predictions
48
+ points_3d_world = points_3d_world.view([sub_batchsize,Nv,H,W,3])
49
+ valids = points_3d_world.isfinite()
50
+ valids_sum = valids.sum(dim=-1)
51
+ validsuni = valids_sum.unique()
52
+ assert torch.all(torch.logical_or(validsuni == 0 , validsuni == 3)), "Error, can only be nan for none or all XYZ values, not a subset"
53
+ confs[valids_sum==0] = 0.
54
+ points_3d_world = points_3d_world*confs[...,None]
55
+
56
+ # Take care of NaNs
57
+ normalization = confs.sum(dim=1)[:,None].repeat(1,Nv,1,1)
58
+ normalization[normalization <= 1e-5] = 1.
59
+ points_3d_world[valids] /= normalization[valids_sum==3][:,None].repeat(1,3).view(-1)
60
+ points_3d_world[~valids] = 0.
61
+ aggregated_points = points_3d_world.sum(dim=1) # weighted average (by confidence value) ignoring nans
62
+
63
+ # Reset invalid values to nans, with a min visibility threshold
64
+ aggregated_points[valids_sum.sum(dim=1)/3 <= min_num_valids] = torch.nan
65
+
66
+ # From 3D to depths
67
+ refcamE = extrinsics[sub_batch, 0]
68
+ points_3d_camera = (refcamE[:,:3, :3] @ aggregated_points.view(sub_batchsize,-1,3).transpose(-2,-1) + refcamE[:,:3,[3]]).transpose(-2,-1) # [B,HxW,3]
69
+ depths = points_3d_camera.view(sub_batchsize,H,W,3)[..., 2] # [B,H,W]
70
+
71
+ # Cat results
72
+ out_aggregated_points.append(aggregated_points.cpu())
73
+ out_depths.append(depths.cpu())
74
+ out_confs.append(confs.sum(dim=1).cpu())
75
+
76
+ out_aggregated_points = torch.cat(out_aggregated_points,dim=0)
77
+ out_depths = torch.cat(out_depths,dim=0)
78
+ out_confs = torch.cat(out_confs,dim=0)
79
+
80
+ return out_aggregated_points, out_depths, out_confs
mast3r/cloud_opt/tsdf_optimizer.py ADDED
@@ -0,0 +1,273 @@
1
+ import torch
2
+ from torch import nn
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from matplotlib import pyplot as pl
6
+
7
+ import mast3r.utils.path_to_dust3r # noqa
8
+ from dust3r.utils.geometry import depthmap_to_pts3d, geotrf, inv
9
+ from dust3r.cloud_opt.base_opt import clean_pointcloud
10
+
11
+
12
+ class TSDFPostProcess:
13
+ """ Optimizes a signed distance-function to improve depthmaps.
14
+ """
15
+
16
+ def __init__(self, optimizer, subsample=8, TSDF_thresh=0., TSDF_batchsize=int(1e7)):
17
+ self.TSDF_thresh = TSDF_thresh # None -> no TSDF
18
+ self.TSDF_batchsize = TSDF_batchsize
19
+ self.optimizer = optimizer
20
+
21
+ pts3d, depthmaps, confs = optimizer.get_dense_pts3d(clean_depth=False, subsample=subsample)
22
+ pts3d, depthmaps = self._TSDF_postprocess_or_not(pts3d, depthmaps, confs)
23
+ self.pts3d = pts3d
24
+ self.depthmaps = depthmaps
25
+ self.confs = confs
26
+
27
+ def _get_depthmaps(self, TSDF_filtering_thresh=None):
28
+ if TSDF_filtering_thresh:
29
+ self._refine_depths_with_TSDF(TSDF_filtering_thresh) # compute refined depths if needed
30
+ dms = self.TSDF_im_depthmaps if TSDF_filtering_thresh else self.im_depthmaps
31
+ return [d.exp() for d in dms]
32
+
33
+ @torch.no_grad()
34
+ def _refine_depths_with_TSDF(self, TSDF_filtering_thresh, niter=1, nsamples=1000):
35
+ """
36
+ Leverage the TSDF to post-process estimated depths:
37
+ for each pixel, find the zero level-set of the TSDF along its ray (or the value closest to 0)
38
+ """
39
+ print("Post-Processing Depths with TSDF fusion.")
40
+ self.TSDF_im_depthmaps = []
41
+ alldepths, allposes, allfocals, allpps, allimshapes = self._get_depthmaps(), self.optimizer.get_im_poses(
42
+ ), self.optimizer.get_focals(), self.optimizer.get_principal_points(), self.imshapes
43
+ for vi in tqdm(range(self.optimizer.n_imgs)):
44
+ dm, pose, focal, pp, imshape = alldepths[vi], allposes[vi], allfocals[vi], allpps[vi], allimshapes[vi]
45
+ minvals = torch.full(dm.shape, 1e20)
46
+
47
+ for it in range(niter):
48
+ H, W = dm.shape
49
+ curthresh = (niter - it) * TSDF_filtering_thresh
50
+ dm_offsets = (torch.randn(H, W, nsamples).to(dm) - 1.) * \
51
+ curthresh # decreasing search std along with iterations
52
+ newdm = dm[..., None] + dm_offsets # [H,W,Nsamp]
53
+ curproj = self._backproj_pts3d(in_depths=[newdm], in_im_poses=pose[None], in_focals=focal[None], in_pps=pp[None], in_imshapes=[
54
+ imshape])[0] # [H,W,Nsamp,3]
55
+ # Batched TSDF eval
56
+ curproj = curproj.view(-1, 3)
57
+ tsdf_vals = []
58
+ valids = []
59
+ for batch in range(0, len(curproj), self.TSDF_batchsize):
60
+ values, valid = self._TSDF_query(
61
+ curproj[batch:min(batch + self.TSDF_batchsize, len(curproj))], curthresh)
62
+ tsdf_vals.append(values)
63
+ valids.append(valid)
64
+ tsdf_vals = torch.cat(tsdf_vals, dim=0)
65
+ valids = torch.cat(valids, dim=0)
66
+
67
+ tsdf_vals = tsdf_vals.view([H, W, nsamples])
68
+ valids = valids.view([H, W, nsamples])
69
+
70
+ # keep depth value that got us the closest to 0
71
+ tsdf_vals[~valids] = torch.inf # ignore invalid values
72
+ tsdf_vals = tsdf_vals.abs()
73
+ mins = torch.argmin(tsdf_vals, dim=-1, keepdim=True)
74
+ # when all samples live on a very flat zone, do nothing
75
+ allbad = (tsdf_vals == curthresh).sum(dim=-1) == nsamples
76
+ dm[~allbad] = torch.gather(newdm, -1, mins)[..., 0][~allbad]
77
+
78
+ # Save refined depth map
79
+ self.TSDF_im_depthmaps.append(dm.log())
80
+
81
+ def _TSDF_query(self, qpoints, TSDF_filtering_thresh, weighted=True):
82
+ """
83
+ TSDF query call: returns the weighted TSDF value for each query point [N, 3]
84
+ """
85
+ N, three = qpoints.shape
86
+ assert three == 3
87
+ qpoints = qpoints[None].repeat(self.optimizer.n_imgs, 1, 1) # [B,N,3]
88
+ # get projection coordinates and depths onto images
89
+ coords_and_depth = self._proj_pts3d(pts3d=qpoints, cam2worlds=self.optimizer.get_im_poses(
90
+ ), focals=self.optimizer.get_focals(), pps=self.optimizer.get_principal_points())
91
+ image_coords = coords_and_depth[..., :2].round().to(int) # for now, there's no interpolation...
92
+ proj_depths = coords_and_depth[..., -1]
93
+ # recover depth values after scene optim
94
+ pred_depths, pred_confs, valids = self._get_pixel_depths(image_coords)
95
+ # Gather TSDF scores
96
+ all_SDF_scores = pred_depths - proj_depths # SDF
97
+ unseen = all_SDF_scores < -TSDF_filtering_thresh # handle visibility
98
+ # all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh,TSDF_filtering_thresh) # SDF -> TSDF
99
+ all_TSDF_scores = all_SDF_scores.clip(-TSDF_filtering_thresh, 1e20) # SDF -> TSDF
100
+ # Gather TSDF confidences and ignore points that are unseen, either OOB during reproj or too far behind seen depth
101
+ all_TSDF_weights = (~unseen).float() * valids.float()
102
+ if weighted:
103
+ all_TSDF_weights = pred_confs.exp() * all_TSDF_weights
104
+ # Aggregate all votes, ignoring zeros
105
+ TSDF_weights = all_TSDF_weights.sum(dim=0)
106
+ valids = TSDF_weights != 0.
107
+ TSDF_wsum = (all_TSDF_weights * all_TSDF_scores).sum(dim=0)
108
+ TSDF_wsum[valids] /= TSDF_weights[valids]
109
+ return TSDF_wsum, valids
110
+
111
+ def _get_pixel_depths(self, image_coords, TSDF_filtering_thresh=None, with_normals_conf=False):
112
+ """ Recover depth value for each input pixel coordinate, along with OOB validity mask
113
+ """
114
+ B, N, two = image_coords.shape
115
+ assert B == self.optimizer.n_imgs and two == 2
116
+ depths = torch.zeros([B, N], device=image_coords.device)
117
+ valids = torch.zeros([B, N], dtype=bool, device=image_coords.device)
118
+ confs = torch.zeros([B, N], device=image_coords.device)
119
+ curconfs = self._get_confs_with_normals() if with_normals_conf else self.im_conf
120
+ for ni, (imc, depth, conf) in enumerate(zip(image_coords, self._get_depthmaps(TSDF_filtering_thresh), curconfs)):
121
+ H, W = depth.shape
122
+ valids[ni] = torch.logical_and(0 <= imc[:, 1], imc[:, 1] <
123
+ H) & torch.logical_and(0 <= imc[:, 0], imc[:, 0] < W)
124
+ imc[~valids[ni]] = 0
125
+ depths[ni] = depth[imc[:, 1], imc[:, 0]]
126
+ confs[ni] = conf.cuda()[imc[:, 1], imc[:, 0]]
127
+ return depths, confs, valids
128
+
129
+ def _get_confs_with_normals(self):
130
+ outconfs = []
131
+ # Confidence based on depth gradient
132
+
133
+ class Sobel(nn.Module):
134
+ def __init__(self):
135
+ super().__init__()
136
+ self.filter = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=1, bias=False)
137
+ Gx = torch.tensor([[2.0, 0.0, -2.0], [4.0, 0.0, -4.0], [2.0, 0.0, -2.0]])
138
+ Gy = torch.tensor([[2.0, 4.0, 2.0], [0.0, 0.0, 0.0], [-2.0, -4.0, -2.0]])
139
+ G = torch.cat([Gx.unsqueeze(0), Gy.unsqueeze(0)], 0)
140
+ G = G.unsqueeze(1)
141
+ self.filter.weight = nn.Parameter(G, requires_grad=False)
142
+
143
+ def forward(self, img):
144
+ x = self.filter(img)
145
+ x = torch.mul(x, x)
146
+ x = torch.sum(x, dim=1, keepdim=True)
147
+ x = torch.sqrt(x)
148
+ return x
149
+
150
+ grad_op = Sobel().to(self.im_depthmaps[0].device)
151
+ for conf, depth in zip(self.im_conf, self.im_depthmaps):
152
+ grad_confs = (1. - grad_op(depth[None, None])[0, 0]).clip(0)
153
+ if not 'dbg show':
154
+ pl.imshow(grad_confs.cpu())
155
+ pl.show()
156
+ outconfs.append(conf * grad_confs.to(conf))
157
+ return outconfs
158
+
159
+ def _proj_pts3d(self, pts3d, cam2worlds, focals, pps):
160
+ """
161
+ Projection operation: from 3D points to 2D coordinates + depths
162
+ """
163
+ B = pts3d.shape[0]
164
+ assert pts3d.shape[0] == cam2worlds.shape[0]
165
+ # prepare extrinsics
166
+ R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1]
167
+ Rinv = R.transpose(-2, -1)
168
+ tinv = -Rinv @ t[..., None]
169
+
170
+ # prepare intrinsics
171
+ intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(focals.shape[0], 1, 1)
172
+ if len(focals.shape) == 1:
173
+ focals = torch.stack([focals, focals], dim=-1)
174
+ intrinsics[:, 0, 0] = focals[:, 0]
175
+ intrinsics[:, 1, 1] = focals[:, 1]
176
+ intrinsics[:, :2, -1] = pps
177
+ # Project
178
+ projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N]
179
+ projpts = projpts.transpose(-2, -1) # [B,N,3]
180
+ projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z)
181
+ return projpts
182
+
183
+ def _backproj_pts3d(self, in_depths=None, in_im_poses=None,
184
+ in_focals=None, in_pps=None, in_imshapes=None):
185
+ """
186
+ Backprojection operation: from image depths to 3D points
187
+ """
188
+ # Get depths and projection params if not provided
189
+ focals = self.optimizer.get_focals() if in_focals is None else in_focals
190
+ im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses
191
+ depth = self._get_depthmaps() if in_depths is None else in_depths
192
+ pp = self.optimizer.get_principal_points() if in_pps is None else in_pps
193
+ imshapes = self.imshapes if in_imshapes is None else in_imshapes
194
+ def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i])
195
+ dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[[i]]) for i in range(im_poses.shape[0])]
196
+
197
+ def autoprocess(x):
198
+ x = x[0]
199
+ return x.transpose(-2, -1) if len(x.shape) == 4 else x
200
+ return [geotrf(pose, autoprocess(pt)) for pose, pt in zip(im_poses, dm_to_3d)]
201
+
202
+ def _pts3d_to_depth(self, pts3d, cam2worlds, focals, pps):
203
+ """
204
+ Projection operation: from 3D points to 2D coordinates + depths
205
+ """
206
+ B = pts3d.shape[0]
207
+ assert pts3d.shape[0] == cam2worlds.shape[0]
208
+ # prepare extrinsics
209
+ R, t = cam2worlds[:, :3, :3], cam2worlds[:, :3, -1]
210
+ Rinv = R.transpose(-2, -1)
211
+ tinv = -Rinv @ t[..., None]
212
+
213
+ # prepare intrinsics
214
+ intrinsics = torch.eye(3).to(cam2worlds)[None].repeat(self.optimizer.n_imgs, 1, 1)
215
+ if len(focals.shape) == 1:
216
+ focals = torch.stack([focals, focals], dim=-1)
217
+ intrinsics[:, 0, 0] = focals[:, 0]
218
+ intrinsics[:, 1, 1] = focals[:, 1]
219
+ intrinsics[:, :2, -1] = pps
220
+ # Project
221
+ projpts = intrinsics @ (Rinv @ pts3d.transpose(-2, -1) + tinv) # I(RX+t) : [B,3,N]
222
+ projpts = projpts.transpose(-2, -1) # [B,N,3]
223
+ projpts[..., :2] /= projpts[..., [-1]] # [B,N,3] (X/Z , Y/Z, Z)
224
+ return projpts
225
+
226
+ def _depth_to_pts3d(self, in_depths=None, in_im_poses=None, in_focals=None, in_pps=None, in_imshapes=None):
227
+ """
228
+ Backprojection operation: from image depths to 3D points
229
+ """
230
+ # Get depths and projection params if not provided
231
+ focals = self.optimizer.get_focals() if in_focals is None else in_focals
232
+ im_poses = self.optimizer.get_im_poses() if in_im_poses is None else in_im_poses
233
+ depth = self._get_depthmaps() if in_depths is None else in_depths
234
+ pp = self.optimizer.get_principal_points() if in_pps is None else in_pps
235
+ imshapes = self.imshapes if in_imshapes is None else in_imshapes
236
+
237
+ def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *imshapes[i])
238
+
239
+ dm_to_3d = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i + 1]) for i in range(im_poses.shape[0])]
240
+
241
+ def autoprocess(x):
242
+ x = x[0]
243
+ H, W, three = x.shape[:3]
244
+ return x.transpose(-2, -1) if len(x.shape) == 4 else x
245
+ return [geotrf(pose, autoprocess(pt)) for pose, pt in zip(im_poses, dm_to_3d)]  # 'pose' avoids shadowing the principal points 'pp' above
246
+
247
+ def _get_pts3d(self, TSDF_filtering_thresh=None, **kw):
248
+ """
249
+ return 3D points (possibly filtering depths with TSDF)
250
+ """
251
+ return self._backproj_pts3d(in_depths=self._get_depthmaps(TSDF_filtering_thresh=TSDF_filtering_thresh), **kw)
252
+
253
+ def _TSDF_postprocess_or_not(self, pts3d, depthmaps, confs, niter=1):
254
+ # Setup inner variables
255
+ self.imshapes = [im.shape[:2] for im in self.optimizer.imgs]
256
+ self.im_depthmaps = [dd.log().view(imshape) for dd, imshape in zip(depthmaps, self.imshapes)]
257
+ self.im_conf = confs
258
+
259
+ if self.TSDF_thresh > 0.:
260
+ # Create or update self.TSDF_im_depthmaps that contain logdepths filtered with TSDF
261
+ self._refine_depths_with_TSDF(self.TSDF_thresh, niter=niter)
262
+ depthmaps = [dd.exp() for dd in self.TSDF_im_depthmaps]
263
+ # Turn them into 3D points
264
+ pts3d = self._backproj_pts3d(in_depths=depthmaps)
265
+ depthmaps = [dd.flatten() for dd in depthmaps]
266
+ pts3d = [pp.view(-1, 3) for pp in pts3d]
267
+ return pts3d, depthmaps
268
+
269
+ def get_dense_pts3d(self, clean_depth=True):
270
+ if clean_depth:
271
+ confs = clean_pointcloud(self.confs, self.optimizer.intrinsics, inv(self.optimizer.cam2w),
272
+ self.depthmaps, self.pts3d)
273
+ return self.pts3d, self.depthmaps, confs
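Hedged usage sketch: `scene` below stands for a sparse global-alignment result exposing get_dense_pts3d, get_im_poses, get_focals, get_principal_points and intrinsics/cam2w attributes, which is what the optimizer argument is assumed to provide; the threshold value is illustrative.

tsdf = TSDFPostProcess(scene, subsample=8, TSDF_thresh=0.01)  # refine depths via the TSDF
pts3d, depthmaps, confs = tsdf.get_dense_pts3d(clean_depth=True)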
mast3r/cloud_opt/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/cloud_opt/utils/losses.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # losses for sparse ga
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import numpy as np
9
+
10
+
11
+ def l05_loss(x, y):
12
+ return torch.linalg.norm(x - y, dim=-1).sqrt()
13
+
14
+
15
+ def l1_loss(x, y):
16
+ return torch.linalg.norm(x - y, dim=-1)
17
+
18
+
19
+ def gamma_loss(gamma, mul=1, offset=None, clip=np.inf):
20
+ if offset is None:
21
+ if gamma == 1:
22
+ return l1_loss
23
+ # d(x**p)/dx = 1 ==> p * x**(p-1) == 1 ==> x = (1/p)**(1/(p-1))
24
+ offset = (1 / gamma)**(1 / (gamma - 1))
25
+
26
+ def loss_func(x, y):
27
+ return (mul * l1_loss(x, y).clip(max=clip) + offset) ** gamma - offset ** gamma
28
+ return loss_func
29
+
30
+
31
+ def meta_gamma_loss():
32
+ return lambda alpha: gamma_loss(alpha)
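For gamma < 1, the offset makes the loss vanish at zero error with unit slope (that is what the derivative condition in the comment enforces) while large errors grow sub-linearly, i.e. a robust variant of L1. A quick evaluation, assuming the definitions above are in scope:

import torch

loss = gamma_loss(0.5)  # here offset = 0.25, so loss(d) = (d + 0.25) ** 0.5 - 0.5
x = torch.zeros(3, 3)
y = torch.tensor([[0., 0., 0.], [1., 0., 0.], [10., 0., 0.]])
print(loss(x, y))  # ~[0.000, 0.618, 2.702]: grows sub-linearly with the error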
mast3r/cloud_opt/utils/schedules.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # lr schedules for sparse ga
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+
9
+
10
+ def linear_schedule(alpha, lr_base, lr_end=0):
11
+ lr = (1 - alpha) * lr_base + alpha * lr_end
12
+ return lr
13
+
14
+
15
+ def cosine_schedule(alpha, lr_base, lr_end=0):
16
+ lr = lr_end + (lr_base - lr_end) * (1 + np.cos(alpha * np.pi)) / 2
17
+ return lr
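Both schedules map an optimization progress alpha in [0, 1] to a learning rate; a quick comparison at a few checkpoints:

for alpha in (0.0, 0.5, 1.0):
    print(alpha, linear_schedule(alpha, lr_base=0.1), cosine_schedule(alpha, lr_base=0.1))
# 0.0 -> 0.1 / 0.1, 0.5 -> 0.05 / 0.05, 1.0 -> 0.0 / 0.0 (cosine decays more slowly early on)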
mast3r/colmap/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/colmap/database.py ADDED
@@ -0,0 +1,383 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # MASt3R to colmap export functions
6
+ # --------------------------------------------------------
7
+ import os
8
+ import torch
9
+ import copy
10
+ import numpy as np
11
+ import torchvision
13
+ from tqdm import tqdm
14
+ from scipy.cluster.hierarchy import DisjointSet
15
+ from scipy.spatial.transform import Rotation as R
16
+
17
+ from mast3r.utils.misc import hash_md5
18
+
19
+ from mast3r.fast_nn import extract_correspondences_nonsym, bruteforce_reciprocal_nns
20
+
21
+ import mast3r.utils.path_to_dust3r # noqa
22
+ from dust3r.utils.geometry import find_reciprocal_matches, xy_grid # noqa
23
+
24
+
25
+ def convert_im_matches_pairs(img0, img1, image_to_colmap, im_keypoints, matches_im0, matches_im1, viz):
26
+ if viz:
27
+ from matplotlib import pyplot as pl
28
+
29
+ image_mean = torch.as_tensor(
30
+ [0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
31
+ image_std = torch.as_tensor(
32
+ [0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1)
33
+ rgb0 = img0['img'] * image_std + image_mean
34
+ rgb0 = torchvision.transforms.functional.to_pil_image(rgb0[0])
35
+ rgb0 = np.array(rgb0)
36
+
37
+ rgb1 = img1['img'] * image_std + image_mean
38
+ rgb1 = torchvision.transforms.functional.to_pil_image(rgb1[0])
39
+ rgb1 = np.array(rgb1)
40
+
41
+ imgs = [rgb0, rgb1]
42
+ # visualize a few matches
43
+ n_viz = 100
44
+ num_matches = matches_im0.shape[0]
45
+ match_idx_to_viz = np.round(np.linspace(
46
+ 0, num_matches - 1, n_viz)).astype(int)
47
+ viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]
48
+
49
+ H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2]
50
+ rgb0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)),
51
+ (0, 0), (0, 0)), 'constant', constant_values=0)
52
+ rgb1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)),
53
+ (0, 0), (0, 0)), 'constant', constant_values=0)
54
+ img = np.concatenate((rgb0, rgb1), axis=1)
55
+ pl.figure()
56
+ pl.imshow(img)
57
+ cmap = pl.get_cmap('jet')
58
+ for ii in range(n_viz):
59
+ (x0, y0), (x1,
60
+ y1) = viz_matches_im0[ii].T, viz_matches_im1[ii].T
61
+ pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(ii /
62
+ (n_viz - 1)), scalex=False, scaley=False)
63
+ pl.show(block=True)
64
+
65
+ matches = [matches_im0.astype(np.float64), matches_im1.astype(np.float64)]
66
+ imgs = [img0, img1]
67
+ imidx0 = img0['idx']
68
+ imidx1 = img1['idx']
69
+ ravel_matches = []
70
+ for j in range(2):
71
+ H, W = imgs[j]['true_shape'][0]
72
+ with np.errstate(invalid='ignore'):
73
+ qx, qy = matches[j].round().astype(np.int32).T
74
+ ravel_matches_j = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy)
75
+ ravel_matches.append(ravel_matches_j)
76
+ imidxj = imgs[j]['idx']
77
+ for m in ravel_matches_j:
78
+ if m not in im_keypoints[imidxj]:
79
+ im_keypoints[imidxj][m] = 0
80
+ im_keypoints[imidxj][m] += 1
81
+ imid0 = copy.deepcopy(image_to_colmap[imidx0]['colmap_imid'])
82
+ imid1 = copy.deepcopy(image_to_colmap[imidx1]['colmap_imid'])
83
+ if imid0 > imid1:
84
+ colmap_matches = np.stack([ravel_matches[1], ravel_matches[0]], axis=-1)
85
+ imid0, imid1 = imid1, imid0
86
+ imidx0, imidx1 = imidx1, imidx0
87
+ else:
88
+ colmap_matches = np.stack([ravel_matches[0], ravel_matches[1]], axis=-1)
89
+ colmap_matches = np.unique(colmap_matches, axis=0)
90
+ return imidx0, imidx1, colmap_matches
91
+
92
+
93
+ def get_im_matches(pred1, pred2, pairs, image_to_colmap, im_keypoints, conf_thr,
94
+ is_sparse=True, subsample=8, pixel_tol=0, viz=False, device='cuda'):
95
+ im_matches = {}
96
+ for i in range(len(pred1['pts3d'])):
97
+ imidx0 = pairs[i][0]['idx']
98
+ imidx1 = pairs[i][1]['idx']
99
+ if 'desc' in pred1: # mast3r
100
+ descs = [pred1['desc'][i], pred2['desc'][i]]
101
+ confidences = [pred1['desc_conf'][i], pred2['desc_conf'][i]]
102
+ desc_dim = descs[0].shape[-1]
103
+
104
+ if is_sparse:
105
+ corres = extract_correspondences_nonsym(descs[0], descs[1], confidences[0], confidences[1],
106
+ device=device, subsample=subsample, pixel_tol=pixel_tol)
107
+ conf = corres[2]
108
+ mask = conf >= conf_thr
109
+ matches_im0 = corres[0][mask].cpu().numpy()
110
+ matches_im1 = corres[1][mask].cpu().numpy()
111
+ else:
112
+ confidence_masks = [confidences[0] >=
113
+ conf_thr, confidences[1] >= conf_thr]
114
+ pts2d_list, desc_list = [], []
115
+ for j in range(2):
116
+ conf_j = confidence_masks[j].cpu().numpy().flatten()
117
+ true_shape_j = pairs[i][j]['true_shape'][0]
118
+ pts2d_j = xy_grid(
119
+ true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j]
120
+ desc_j = descs[j].detach().cpu(
121
+ ).numpy().reshape(-1, desc_dim)[conf_j]
122
+ pts2d_list.append(pts2d_j)
123
+ desc_list.append(desc_j)
124
+ if len(desc_list[0]) == 0 or len(desc_list[1]) == 0:
125
+ continue
126
+
127
+ nn0, nn1 = bruteforce_reciprocal_nns(desc_list[0], desc_list[1],
128
+ device=device, dist='dot', block_size=2**13)
129
+ reciprocal_in_P0 = (nn1[nn0] == np.arange(len(nn0)))
130
+
131
+ matches_im1 = pts2d_list[1][nn0][reciprocal_in_P0]
132
+ matches_im0 = pts2d_list[0][reciprocal_in_P0]
133
+ else:
134
+ pts3d = [pred1['pts3d'][i], pred2['pts3d_in_other_view'][i]]
135
+ confidences = [pred1['conf'][i], pred2['conf'][i]]
136
+
137
+ if is_sparse:
138
+ corres = extract_correspondences_nonsym(pts3d[0], pts3d[1], confidences[0], confidences[1],
139
+ device=device, subsample=subsample, pixel_tol=pixel_tol,
140
+ ptmap_key='3d')
141
+ conf = corres[2]
142
+ mask = conf >= conf_thr
143
+ matches_im0 = corres[0][mask].cpu().numpy()
144
+ matches_im1 = corres[1][mask].cpu().numpy()
145
+ else:
146
+ confidence_masks = [confidences[0] >=
147
+ conf_thr, confidences[1] >= conf_thr]
148
+ # find 2D-2D matches between the two images
149
+ pts2d_list, pts3d_list = [], []
150
+ for j in range(2):
151
+ conf_j = confidence_masks[j].cpu().numpy().flatten()
152
+ true_shape_j = pairs[i][j]['true_shape'][0]
153
+ pts2d_j = xy_grid(true_shape_j[1], true_shape_j[0]).reshape(-1, 2)[conf_j]
154
+ pts3d_j = pts3d[j].detach().cpu().numpy().reshape(-1, 3)[conf_j]
155
+ pts2d_list.append(pts2d_j)
156
+ pts3d_list.append(pts3d_j)
157
+
158
+ PQ, PM = pts3d_list[0], pts3d_list[1]
159
+ if len(PQ) == 0 or len(PM) == 0:
160
+ continue
161
+ reciprocal_in_PM, nnM_in_PQ, num_matches = find_reciprocal_matches(
162
+ PQ, PM)
163
+
164
+ matches_im1 = pts2d_list[1][reciprocal_in_PM]
165
+ matches_im0 = pts2d_list[0][nnM_in_PQ][reciprocal_in_PM]
166
+
167
+ if len(matches_im0) == 0:
168
+ continue
169
+ imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1],
170
+ image_to_colmap, im_keypoints,
171
+ matches_im0, matches_im1, viz)
172
+ im_matches[(imidx0, imidx1)] = colmap_matches
173
+ return im_matches
174
+
175
+
176
+ def get_im_matches_from_cache(pairs, cache_path, desc_conf, subsample,
177
+ image_to_colmap, im_keypoints, conf_thr,
178
+ viz=False, device='cuda'):
179
+ im_matches = {}
180
+ for i in range(len(pairs)):
181
+ imidx0 = pairs[i][0]['idx']
182
+ imidx1 = pairs[i][1]['idx']
183
+
184
+ corres_idx1 = hash_md5(pairs[i][0]['instance'])
185
+ corres_idx2 = hash_md5(pairs[i][1]['instance'])
186
+
187
+ path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx1}-{corres_idx2}.pth'
188
+ if os.path.isfile(path_corres):
189
+ score, (xy1, xy2, confs) = torch.load(path_corres, map_location=device)
190
+ else:
191
+ path_corres = cache_path + f'/corres_conf={desc_conf}_{subsample=}/{corres_idx2}-{corres_idx1}.pth'
192
+ score, (xy2, xy1, confs) = torch.load(path_corres, map_location=device)
193
+ mask = confs >= conf_thr
194
+ matches_im0 = xy1[mask].cpu().numpy()
195
+ matches_im1 = xy2[mask].cpu().numpy()
196
+
197
+ if len(matches_im0) == 0:
198
+ continue
199
+ imidx0, imidx1, colmap_matches = convert_im_matches_pairs(pairs[i][0], pairs[i][1],
200
+ image_to_colmap, im_keypoints,
201
+ matches_im0, matches_im1, viz)
202
+ im_matches[(imidx0, imidx1)] = colmap_matches
203
+ return im_matches
204
+
205
+
206
+ def export_images(db, images, image_paths, focals, ga_world_to_cam, camera_model):
207
+ # add cameras/images to the db
208
+ # with the output of ga as prior
209
+ image_to_colmap = {}
210
+ im_keypoints = {}
211
+ for idx in range(len(image_paths)):
212
+ im_keypoints[idx] = {}
213
+ H, W = images[idx]["orig_shape"]
214
+ if focals is None:
215
+ focal_x = focal_y = 1.2 * max(W, H)
216
+ prior_focal_length = False
217
+ cx = W / 2.0
218
+ cy = H / 2.0
219
+ elif isinstance(focals[idx], np.ndarray) and len(focals[idx].shape) == 2:
220
+ # intrinsics
221
+ focal_x = focals[idx][0, 0]
222
+ focal_y = focals[idx][1, 1]
223
+ cx = focals[idx][0, 2] * images[idx]["to_orig"][0, 0]
224
+ cy = focals[idx][1, 2] * images[idx]["to_orig"][1, 1]
225
+ prior_focal_length = True
226
+ else:
227
+ focal_x = focal_y = float(focals[idx])
228
+ prior_focal_length = True
229
+ cx = W / 2.0
230
+ cy = H / 2.0
231
+ focal_x = focal_x * images[idx]["to_orig"][0, 0]
232
+ focal_y = focal_y * images[idx]["to_orig"][1, 1]
233
+
234
+ if camera_model == "SIMPLE_PINHOLE":
235
+ model_id = 0
236
+ focal = (focal_x + focal_y) / 2.0
237
+ params = np.asarray([focal, cx, cy], np.float64)
238
+ elif camera_model == "PINHOLE":
239
+ model_id = 1
240
+ params = np.asarray([focal_x, focal_y, cx, cy], np.float64)
241
+ elif camera_model == "SIMPLE_RADIAL":
242
+ model_id = 2
243
+ focal = (focal_x + focal_y) / 2.0
244
+ params = np.asarray([focal, cx, cy, 0.0], np.float64)
245
+ elif camera_model == "OPENCV":
246
+ model_id = 4
247
+ params = np.asarray([focal_x, focal_y, cx, cy, 0.0, 0.0, 0.0, 0.0], np.float64)
248
+ else:
249
+ raise ValueError(f"invalid camera model {camera_model}")
250
+
251
+ H, W = int(H), int(W)
252
+ # OPENCV camera model
253
+ camid = db.add_camera(
254
+ model_id, W, H, params, prior_focal_length=prior_focal_length)
255
+ if ga_world_to_cam is None:
256
+ prior_t = np.zeros(3)
257
+ prior_q = np.zeros(4)
258
+ else:
259
+ q = R.from_matrix(ga_world_to_cam[idx][:3, :3]).as_quat()
260
+ prior_t = ga_world_to_cam[idx][:3, 3]
261
+ prior_q = np.array([q[-1], q[0], q[1], q[2]])
262
+ imid = db.add_image(
263
+ image_paths[idx], camid, prior_q=prior_q, prior_t=prior_t)
264
+ image_to_colmap[idx] = {
265
+ 'colmap_imid': imid,
266
+ 'colmap_camid': camid
267
+ }
268
+ return image_to_colmap, im_keypoints
269
+
270
+
271
+ def export_matches(db, images, image_to_colmap, im_keypoints, im_matches, min_len_track, skip_geometric_verification):
272
+ colmap_image_pairs = []
273
+ # 2D-2D are quite dense
274
+ # we want to remove the very small tracks
275
+ # and export only keypoints for which we have values
276
+ # build tracks
277
+ print("building tracks")
278
+ keypoints_to_track_id = {}
279
+ track_id_to_kpt_list = []
280
+ to_merge = []
281
+ for (imidx0, imidx1), colmap_matches in tqdm(im_matches.items()):
282
+ if imidx0 not in keypoints_to_track_id:
283
+ keypoints_to_track_id[imidx0] = {}
284
+ if imidx1 not in keypoints_to_track_id:
285
+ keypoints_to_track_id[imidx1] = {}
286
+
287
+ for m in colmap_matches:
288
+ if m[0] not in keypoints_to_track_id[imidx0] and m[1] not in keypoints_to_track_id[imidx1]:
289
+ # new pair of kpts never seen before
290
+ track_idx = len(track_id_to_kpt_list)
291
+ keypoints_to_track_id[imidx0][m[0]] = track_idx
292
+ keypoints_to_track_id[imidx1][m[1]] = track_idx
293
+ track_id_to_kpt_list.append(
294
+ [(imidx0, m[0]), (imidx1, m[1])])
295
+ elif m[1] not in keypoints_to_track_id[imidx1]:
296
+ # 0 has a track, not 1
297
+ track_idx = keypoints_to_track_id[imidx0][m[0]]
298
+ keypoints_to_track_id[imidx1][m[1]] = track_idx
299
+ track_id_to_kpt_list[track_idx].append((imidx1, m[1]))
300
+ elif m[0] not in keypoints_to_track_id[imidx0]:
301
+ # 1 has a track, not 0
302
+ track_idx = keypoints_to_track_id[imidx1][m[1]]
303
+ keypoints_to_track_id[imidx0][m[0]] = track_idx
304
+ track_id_to_kpt_list[track_idx].append((imidx0, m[0]))
305
+ else:
306
+ # both have tracks, merge them
307
+ track_idx0 = keypoints_to_track_id[imidx0][m[0]]
308
+ track_idx1 = keypoints_to_track_id[imidx1][m[1]]
309
+ if track_idx0 != track_idx1:
310
+ # let's deal with them later
311
+ to_merge.append((track_idx0, track_idx1))
312
+
313
+ # regroup merge targets
314
+ print("merging tracks")
315
+ unique = np.unique(to_merge)
316
+ tree = DisjointSet(unique)
317
+ for track_idx0, track_idx1 in tqdm(to_merge):
318
+ tree.merge(track_idx0, track_idx1)
319
+
320
+ subsets = tree.subsets()
321
+ print("applying merge")
322
+ for setvals in tqdm(subsets):
323
+ new_trackid = len(track_id_to_kpt_list)
324
+ kpt_list = []
325
+ for track_idx in setvals:
326
+ kpt_list.extend(track_id_to_kpt_list[track_idx])
327
+ for imidx, kpid in track_id_to_kpt_list[track_idx]:
328
+ keypoints_to_track_id[imidx][kpid] = new_trackid
329
+ track_id_to_kpt_list.append(kpt_list)
330
+
331
+ # binc = np.bincount([len(v) for v in track_id_to_kpt_list])
332
+ # nonzero = np.nonzero(binc)
333
+ # nonzerobinc = binc[nonzero[0]]
334
+ # print(nonzero[0].tolist())
335
+ # print(nonzerobinc)
336
+ num_valid_tracks = sum(
337
+ [1 for v in track_id_to_kpt_list if len(v) >= min_len_track])
338
+
339
+ keypoints_to_idx = {}
340
+ print(f"squashing keypoints - {num_valid_tracks} valid tracks")
341
+ for imidx, keypoints_imid in tqdm(im_keypoints.items()):
342
+ imid = image_to_colmap[imidx]['colmap_imid']
343
+ keypoints_kept = []
344
+ keypoints_to_idx[imidx] = {}
345
+ for kp in keypoints_imid.keys():
346
+ if kp not in keypoints_to_track_id[imidx]:
347
+ continue
348
+ track_idx = keypoints_to_track_id[imidx][kp]
349
+ track_length = len(track_id_to_kpt_list[track_idx])
350
+ if track_length < min_len_track:
351
+ continue
352
+ keypoints_to_idx[imidx][kp] = len(keypoints_kept)
353
+ keypoints_kept.append(kp)
354
+ if len(keypoints_kept) == 0:
355
+ continue
356
+ keypoints_kept = np.array(keypoints_kept)
357
+ keypoints_kept = np.unravel_index(keypoints_kept, images[imidx]['true_shape'][0])[
358
+ 0].base[:, ::-1].copy().astype(np.float32)
359
+ # rescale coordinates
360
+ keypoints_kept[:, 0] += 0.5
361
+ keypoints_kept[:, 1] += 0.5
362
+ keypoints_kept = geotrf(images[imidx]['to_orig'], keypoints_kept, norm=True)
363
+
364
+ H, W = images[imidx]['orig_shape']
365
+ keypoints_kept[:, 0] = keypoints_kept[:, 0].clip(min=0, max=W - 0.01)
366
+ keypoints_kept[:, 1] = keypoints_kept[:, 1].clip(min=0, max=H - 0.01)
367
+
368
+ db.add_keypoints(imid, keypoints_kept)
369
+
370
+ print("exporting im_matches")
371
+ for (imidx0, imidx1), colmap_matches in im_matches.items():
372
+ imid0, imid1 = image_to_colmap[imidx0]['colmap_imid'], image_to_colmap[imidx1]['colmap_imid']
373
+ assert imid0 < imid1
374
+ final_matches = np.array([[keypoints_to_idx[imidx0][m[0]], keypoints_to_idx[imidx1][m[1]]]
375
+ for m in colmap_matches
376
+ if m[0] in keypoints_to_idx[imidx0] and m[1] in keypoints_to_idx[imidx1]])
377
+ if len(final_matches) > 0:
378
+ colmap_image_pairs.append(
379
+ (images[imidx0]['instance'], images[imidx1]['instance']))
380
+ db.add_matches(imid0, imid1, final_matches)
381
+ if skip_geometric_verification:
382
+ db.add_two_view_geometry(imid0, imid1, final_matches)
383
+ return colmap_image_pairs
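A hedged end-to-end sketch of driving these exporters. It assumes `db` is COLMAP's official Python database wrapper (scripts/python/database.py, with connect/create_tables/commit), and that `images`, `image_paths`, `focals`, `world_to_cam`, `pairs` and the network outputs `pred1`/`pred2` come from a MASt3R reconstruction; every variable name here is illustrative.

from database import COLMAPDatabase  # COLMAP's official wrapper, assumed on the path

db = COLMAPDatabase.connect('scene.db')
db.create_tables()
image_to_colmap, im_keypoints = export_images(
    db, images, image_paths, focals, ga_world_to_cam=world_to_cam, camera_model='PINHOLE')
im_matches = get_im_matches(pred1, pred2, pairs, image_to_colmap, im_keypoints,
                            conf_thr=1.001)
colmap_pairs = export_matches(db, images, image_to_colmap, im_keypoints, im_matches,
                              min_len_track=5, skip_geometric_verification=False)
db.commit()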
mast3r/datasets/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+ from .base.mast3r_base_stereo_view_dataset import MASt3RBaseStereoViewDataset
5
+
6
+ import mast3r.utils.path_to_dust3r # noqa
7
+ from dust3r.datasets.arkitscenes import ARKitScenes as DUSt3R_ARKitScenes # noqa
8
+ from dust3r.datasets.blendedmvs import BlendedMVS as DUSt3R_BlendedMVS # noqa
9
+ from dust3r.datasets.co3d import Co3d as DUSt3R_Co3d # noqa
10
+ from dust3r.datasets.megadepth import MegaDepth as DUSt3R_MegaDepth # noqa
11
+ from dust3r.datasets.scannetpp import ScanNetpp as DUSt3R_ScanNetpp # noqa
12
+ from dust3r.datasets.staticthings3d import StaticThings3D as DUSt3R_StaticThings3D # noqa
13
+ from dust3r.datasets.waymo import Waymo as DUSt3R_Waymo # noqa
14
+ from dust3r.datasets.wildrgbd import WildRGBD as DUSt3R_WildRGBD # noqa
15
+
16
+
17
+ class ARKitScenes(DUSt3R_ARKitScenes, MASt3RBaseStereoViewDataset):
18
+ def __init__(self, *args, split, ROOT, **kwargs):
19
+ super().__init__(*args, split=split, ROOT=ROOT, **kwargs)
20
+ self.is_metric_scale = True
21
+
22
+
23
+ class BlendedMVS(DUSt3R_BlendedMVS, MASt3RBaseStereoViewDataset):
24
+ def __init__(self, *args, ROOT, split=None, **kwargs):
25
+ super().__init__(*args, ROOT=ROOT, split=split, **kwargs)
26
+ self.is_metric_scale = False
27
+
28
+
29
+ class Co3d(DUSt3R_Co3d, MASt3RBaseStereoViewDataset):
30
+ def __init__(self, mask_bg=True, *args, ROOT, **kwargs):
31
+ super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
32
+ self.is_metric_scale = False
33
+
34
+
35
+ class MegaDepth(DUSt3R_MegaDepth, MASt3RBaseStereoViewDataset):
36
+ def __init__(self, *args, split, ROOT, **kwargs):
37
+ super().__init__(*args, split=split, ROOT=ROOT, **kwargs)
38
+ self.is_metric_scale = False
39
+
40
+
41
+ class ScanNetpp(DUSt3R_ScanNetpp, MASt3RBaseStereoViewDataset):
42
+ def __init__(self, *args, ROOT, **kwargs):
43
+ super().__init__(*args, ROOT=ROOT, **kwargs)
44
+ self.is_metric_scale = True
45
+
46
+
47
+ class StaticThings3D(DUSt3R_StaticThings3D, MASt3RBaseStereoViewDataset):
48
+ def __init__(self, ROOT, *args, mask_bg='rand', **kwargs):
49
+ super().__init__(ROOT, *args, mask_bg=mask_bg, **kwargs)
50
+ self.is_metric_scale = False
51
+
52
+
53
+ class Waymo(DUSt3R_Waymo, MASt3RBaseStereoViewDataset):
54
+ def __init__(self, *args, ROOT, **kwargs):
55
+ super().__init__(*args, ROOT=ROOT, **kwargs)
56
+ self.is_metric_scale = True
57
+
58
+
59
+ class WildRGBD(DUSt3R_WildRGBD, MASt3RBaseStereoViewDataset):
60
+ def __init__(self, mask_bg=True, *args, ROOT, **kwargs):
61
+ super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
62
+ self.is_metric_scale = True
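Hedged usage sketch: these thin wrappers are instantiated like their DUSt3R counterparts, only adding the is_metric_scale flag; the data root below is a placeholder path.

dataset = Co3d(split='train', ROOT='data/co3d_processed',  # placeholder path
               resolution=(512, 384), aug_crop='auto', n_corres=8192)
view1, view2 = dataset[0]   # two views with images, depthmaps and correspondences
assert dataset.is_metric_scale is False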
mast3r/datasets/base/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/datasets/base/mast3r_base_stereo_view_dataset.py ADDED
@@ -0,0 +1,355 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # base class for implementing datasets
6
+ # --------------------------------------------------------
7
+ import PIL.Image
8
+ import PIL.Image as Image
9
+ import numpy as np
10
+ import torch
11
+ import copy
12
+
13
+ from mast3r.datasets.utils.cropping import (extract_correspondences_from_pts3d,
14
+ gen_random_crops, in2d_rect, crop_to_homography)
15
+
16
+ import mast3r.utils.path_to_dust3r # noqa
17
+ from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset, view_name, is_good_type # noqa
18
+ from dust3r.datasets.utils.transforms import ImgNorm
19
+ from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates, geotrf, depthmap_to_camera_coordinates
20
+ import dust3r.datasets.utils.cropping as cropping
21
+
22
+
23
+ class MASt3RBaseStereoViewDataset(BaseStereoViewDataset):
24
+ def __init__(self, *, # only keyword arguments
25
+ split=None,
26
+ resolution=None, # square_size or (width, height) or list of [(width,height), ...]
27
+ transform=ImgNorm,
28
+ aug_crop=False,
29
+ aug_swap=False,
30
+ aug_monocular=False,
31
+ aug_portrait_or_landscape=True, # automatic choice between landscape/portrait when possible
32
+ aug_rot90=False,
33
+ n_corres=0,
34
+ nneg=0,
35
+ n_tentative_crops=4,
36
+ seed=None):
37
+ super().__init__(split=split, resolution=resolution, transform=transform, aug_crop=aug_crop, seed=seed)
38
+ self.is_metric_scale = False # by default a dataset is not metric scale, subclasses can overwrite this
39
+
40
+ self.aug_swap = aug_swap
41
+ self.aug_monocular = aug_monocular
42
+ self.aug_portrait_or_landscape = aug_portrait_or_landscape
43
+ self.aug_rot90 = aug_rot90
44
+
45
+ self.n_corres = n_corres
46
+ self.nneg = nneg
47
+ assert self.n_corres == 'all' or isinstance(self.n_corres, int) or (isinstance(self.n_corres, list) and len(
48
+ self.n_corres) == self.num_views), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}"
49
+ assert self.nneg == 0 or self.n_corres != 'all'
50
+ self.n_tentative_crops = n_tentative_crops
51
+
52
+ def _swap_view_aug(self, views):
53
+ if self._rng.random() < 0.5:
54
+ views.reverse()
55
+
56
+ def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None):
57
+ """ This function:
58
+ - first downsizes the image with high-quality LANCZOS interpolation,
59
+ which is better than bilinear interpolation for downscaling, then crops to the target resolution.
60
+ """
61
+ if not isinstance(image, PIL.Image.Image):
62
+ image = PIL.Image.fromarray(image)
63
+
64
+ # transpose the resolution if necessary
65
+ W, H = image.size # new size
66
+ assert resolution[0] >= resolution[1]
67
+ if H > 1.1 * W:
68
+ # image is portrait mode
69
+ resolution = resolution[::-1]
70
+ elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
71
+ # image is square, so we choose (portrait, landscape) randomly
72
+ if rng.integers(2) and self.aug_portrait_or_landscape:
73
+ resolution = resolution[::-1]
74
+
75
+ # high-quality Lanczos down-scaling
76
+ target_resolution = np.array(resolution)
77
+ image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution)
78
+
79
+ # actual cropping (if necessary) with bilinear interpolation
80
+ offset_factor = 0.5
81
+ intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=offset_factor)
82
+ crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution)
83
+ image, depthmap, intrinsics2 = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox)
84
+
85
+ return image, depthmap, intrinsics2
86
+
87
+ def generate_crops_from_pair(self, view1, view2, resolution, aug_crop_arg, n_crops=4, rng=np.random):
88
+ views = [view1, view2]
89
+
90
+ if aug_crop_arg is False:
91
+ # compatibility
92
+ for i in range(2):
93
+ view = views[i]
94
+ view['img'], view['depthmap'], view['camera_intrinsics'] = self._crop_resize_if_necessary(view['img'],
95
+ view['depthmap'],
96
+ view['camera_intrinsics'],
97
+ resolution,
98
+ rng=rng)
99
+ view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'],
100
+ view['camera_intrinsics'],
101
+ view['camera_pose'])
102
+ return
103
+
104
+ # extract correspondences
105
+ corres = extract_correspondences_from_pts3d(*views, target_n_corres=None, rng=rng)
106
+
107
+ # generate 4 random crops in each view
108
+ view_crops = []
109
+ crops_resolution = []
110
+ corres_msks = []
111
+ for i in range(2):
112
+
113
+ if aug_crop_arg == 'auto':
114
+ S = min(views[i]['img'].size)
115
+ R = min(resolution)
116
+ aug_crop = S * (S - R) // R
117
+ aug_crop = max(.1 * S, aug_crop) # for cropping: augment scale of at least 10%, and more if possible
118
+ else:
119
+ aug_crop = aug_crop_arg
120
+
121
+ # transpose the target resolution if necessary
122
+ assert resolution[0] >= resolution[1]
123
+ W, H = imsize = views[i]['img'].size
124
+ crop_resolution = resolution
125
+ if H > 1.1 * W:
126
+ # image is portrait mode
127
+ crop_resolution = resolution[::-1]
128
+ elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
129
+ # image is square, so we choose (portrait, landscape) randomly
130
+ if rng.integers(2):
131
+ crop_resolution = resolution[::-1]
132
+
133
+ crops = gen_random_crops(imsize, n_crops, crop_resolution, aug_crop=aug_crop, rng=rng)
134
+ view_crops.append(crops)
135
+ crops_resolution.append(crop_resolution)
136
+
137
+ # compute correspondences
138
+ corres_msks.append(in2d_rect(corres[i], crops))
139
+
140
+ # compute IoU for each
141
+ intersection = np.float32(corres_msks[0]).T @ np.float32(corres_msks[1])
142
+ # select best pair of crops
143
+ best = np.unravel_index(intersection.argmax(), (n_crops, n_crops))
144
+ crops = [view_crops[i][c] for i, c in enumerate(best)]
145
+
146
+ # crop with the homography
147
+ for i in range(2):
148
+ view = views[i]
149
+ imsize, K_new, R, H = crop_to_homography(view['camera_intrinsics'], crops[i], crops_resolution[i])
150
+ # imsize, K_new, H = upscale_homography(imsize, resolution, K_new, H)
151
+
152
+ # update camera params
153
+ K_old = view['camera_intrinsics']
154
+ view['camera_intrinsics'] = K_new
155
+ view['camera_pose'] = view['camera_pose'].copy()
156
+ view['camera_pose'][:3, :3] = view['camera_pose'][:3, :3] @ R
157
+
158
+ # apply homography to image and depthmap
159
+ homo8 = (H / H[2, 2]).ravel().tolist()[:8]
160
+ view['img'] = view['img'].transform(imsize, Image.Transform.PERSPECTIVE,
161
+ homo8,
162
+ resample=Image.Resampling.BICUBIC)
163
+
164
+ depthmap2 = depthmap_to_camera_coordinates(view['depthmap'], K_old)[0] @ R[:, 2]
165
+ view['depthmap'] = np.array(Image.fromarray(depthmap2).transform(
166
+ imsize, Image.Transform.PERSPECTIVE, homo8))
167
+
168
+ if 'track_labels' in view:
169
+ # convert from uint64 --> uint32, because PIL.Image cannot handle uint64
170
+ mapping, track_labels = np.unique(view['track_labels'], return_inverse=True)
171
+ track_labels = track_labels.astype(np.uint32).reshape(view['track_labels'].shape)
172
+
173
+ # homography transformation
174
+ res = np.array(Image.fromarray(track_labels).transform(imsize, Image.Transform.PERSPECTIVE, homo8))
175
+ view['track_labels'] = mapping[res] # mapping back to uint64
176
+
177
+ # recompute 3d points from scratch
178
+ view['pts3d'], view['valid_mask'] = depthmap_to_absolute_camera_coordinates(view['depthmap'],
179
+ view['camera_intrinsics'],
180
+ view['camera_pose'])
181
+
182
+ def __getitem__(self, idx):
183
+ if isinstance(idx, tuple):
184
+ # the idx tuple also encodes the aspect-ratio index
185
+ idx, ar_idx = idx
186
+ else:
187
+ assert len(self._resolutions) == 1
188
+ ar_idx = 0
189
+
190
+ # set-up the rng
191
+ if self.seed: # reseed for each __getitem__
192
+ self._rng = np.random.default_rng(seed=self.seed + idx)
193
+ elif not hasattr(self, '_rng'):
194
+ seed = torch.initial_seed() # this is different for each dataloader process
195
+ self._rng = np.random.default_rng(seed=seed)
196
+
197
+ # over-loaded code
198
+ resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
199
+ views = self._get_views(idx, resolution, self._rng)
200
+ assert len(views) == self.num_views
201
+
202
+ for v, view in enumerate(views):
203
+ assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
204
+ view['idx'] = (idx, ar_idx, v)
205
+ view['is_metric_scale'] = self.is_metric_scale
206
+
207
+ assert 'camera_intrinsics' in view
208
+ if 'camera_pose' not in view:
209
+ view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32)
210
+ else:
211
+ assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}'
212
+ assert 'pts3d' not in view
213
+ assert 'valid_mask' not in view
214
+ assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}'
215
+
216
+ pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
217
+
218
+ view['pts3d'] = pts3d
219
+ view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1)
220
+
221
+ self.generate_crops_from_pair(views[0], views[1], resolution=resolution,
222
+ aug_crop_arg=self.aug_crop,
223
+ n_crops=self.n_tentative_crops,
224
+ rng=self._rng)
225
+ for v, view in enumerate(views):
226
+ # encode the image
227
+ width, height = view['img'].size
228
+ view['true_shape'] = np.int32((height, width))
229
+ view['img'] = self.transform(view['img'])
230
+ # Pixels for which depth is fundamentally undefined
231
+ view['sky_mask'] = (view['depthmap'] < 0)
232
+
233
+ if self.aug_swap:
234
+ self._swap_view_aug(views)
235
+
236
+ if self.aug_monocular:
237
+ if self._rng.random() < self.aug_monocular:
238
+ views = [copy.deepcopy(views[0]) for _ in range(len(views))]
239
+
240
+ # automatic extraction of correspondences from pts3d + pose
241
+ if self.n_corres > 0 and ('corres' not in view):
242
+ corres1, corres2, valid = extract_correspondences_from_pts3d(*views, self.n_corres,
243
+ self._rng, nneg=self.nneg)
244
+ views[0]['corres'] = corres1
245
+ views[1]['corres'] = corres2
246
+ views[0]['valid_corres'] = valid
247
+ views[1]['valid_corres'] = valid
248
+
249
+ if self.aug_rot90 is False:
250
+ pass
251
+ elif self.aug_rot90 == 'same':
252
+ rotate_90(views, k=self._rng.choice(4))
253
+ elif self.aug_rot90 == 'diff':
254
+ rotate_90(views[:1], k=self._rng.choice(4))
255
+ rotate_90(views[1:], k=self._rng.choice(4))
256
+ else:
257
+ raise ValueError(f'Bad value for {self.aug_rot90=}')
258
+
259
+ # fill-in missing correspondences and check data-types
260
+ for v, view in enumerate(views):
261
+ if 'corres' not in view:
262
+ view['corres'] = np.full((self.n_corres, 2), np.nan, dtype=np.float32)
263
+
264
+ # check all datatypes
265
+ for key, val in view.items():
266
+ res, err_msg = is_good_type(key, val)
267
+ assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
268
+ K = view['camera_intrinsics']
269
+
270
+ # check shapes
271
+ assert view['depthmap'].shape == view['img'].shape[1:]
272
+ assert view['depthmap'].shape == view['pts3d'].shape[:2]
273
+ assert view['depthmap'].shape == view['valid_mask'].shape
274
+
275
+ # last thing done!
276
+ for view in views:
277
+ # transpose to make sure all views are the same size
278
+ transpose_to_landscape(view)
279
+ # this allows checking whether the RNG is in the same state each time
280
+ view['rng'] = int.from_bytes(self._rng.bytes(4), 'big')
281
+
282
+ return views
283
+
284
+
285
+ def transpose_to_landscape(view, revert=False):
286
+ height, width = view['true_shape']
287
+
288
+ if width < height:
289
+ if revert:
290
+ height, width = width, height
291
+
292
+ # rectify portrait to landscape
293
+ assert view['img'].shape == (3, height, width)
294
+ view['img'] = view['img'].swapaxes(1, 2)
295
+
296
+ assert view['valid_mask'].shape == (height, width)
297
+ view['valid_mask'] = view['valid_mask'].swapaxes(0, 1)
298
+
299
+ assert view['sky_mask'].shape == (height, width)
300
+ view['sky_mask'] = view['sky_mask'].swapaxes(0, 1)
301
+
302
+ assert view['depthmap'].shape == (height, width)
303
+ view['depthmap'] = view['depthmap'].swapaxes(0, 1)
304
+
305
+ assert view['pts3d'].shape == (height, width, 3)
306
+ view['pts3d'] = view['pts3d'].swapaxes(0, 1)
307
+
308
+ # transpose x and y pixels
309
+ view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]]
310
+
311
+ # transpose correspondences x and y
312
+ view['corres'] = view['corres'][:, [1, 0]]
313
+
314
+
315
+ def rotate_90(views, k=1):
316
+ from scipy.spatial.transform import Rotation
317
+ # print('rotation =', k)
318
+
319
+ RT = np.eye(4, dtype=np.float32)
320
+ RT[:3, :3] = Rotation.from_euler('z', 90 * k, degrees=True).as_matrix()
321
+
322
+ for view in views:
323
+ view['img'] = torch.rot90(view['img'], k=k, dims=(-2, -1)) # WARNING!! dims=(-1,-2) != dims=(-2,-1)
324
+ view['depthmap'] = np.rot90(view['depthmap'], k=k).copy()
325
+ view['camera_pose'] = view['camera_pose'] @ RT
326
+
327
+ RT2 = np.eye(3, dtype=np.float32)
328
+ RT2[:2, :2] = RT[:2, :2] * ((1, -1), (-1, 1))
329
+ H, W = view['depthmap'].shape
330
+ if k % 4 == 0:
331
+ pass
332
+ elif k % 4 == 1:
333
+ # top-left (0,0) pixel becomes (0,H-1)
334
+ RT2[:2, 2] = (0, H - 1)
335
+ elif k % 4 == 2:
336
+ # top-left (0,0) pixel becomes (W-1,H-1)
337
+ RT2[:2, 2] = (W - 1, H - 1)
338
+ elif k % 4 == 3:
339
+ # top-left (0,0) pixel becomes (W-1,0)
340
+ RT2[:2, 2] = (W - 1, 0)
341
+ else:
342
+ raise ValueError(f'Bad value for {k=}')
343
+
344
+ view['camera_intrinsics'][:2, 2] = geotrf(RT2, view['camera_intrinsics'][:2, 2])
345
+ if k % 2 == 1:
346
+ K = view['camera_intrinsics']
347
+ np.fill_diagonal(K, K.diagonal()[[1, 0, 2]])
348
+
349
+ pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
350
+ view['pts3d'] = pts3d
351
+ view['valid_mask'] = np.rot90(view['valid_mask'], k=k).copy()
352
+ view['sky_mask'] = np.rot90(view['sky_mask'], k=k).copy()
353
+
354
+ view['corres'] = geotrf(RT2, view['corres']).round().astype(view['corres'].dtype)
355
+ view['true_shape'] = np.int32((H, W))
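
As a usage sketch (hypothetical: `SomeMASt3RDataset` stands for any concrete subclass of the base class above implementing `_get_views`, and the constructor arguments are illustrative), a pair of augmented views with ground-truth correspondences can be fetched like this, assuming the usual image-normalization transform:

    # hypothetical concrete subclass of the stereo-view dataset base class
    dataset = SomeMASt3RDataset(split='train', resolution=(512, 384),
                                aug_crop='auto', aug_swap=True,
                                n_corres=8192, seed=777)
    view1, view2 = dataset[(0, 0)]   # (sample index, aspect-ratio index)
    print(view1['img'].shape)        # (3, H, W) tensor after the transform
    print(view1['corres'].shape)     # (n_corres, 2) pixel coordinates
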
mast3r/datasets/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/datasets/utils/cropping.py ADDED
@@ -0,0 +1,219 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # cropping/match extraction
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import mast3r.utils.path_to_dust3r # noqa
9
+ from dust3r.utils.device import to_numpy
10
+ from dust3r.utils.geometry import inv, geotrf
11
+
12
+
13
+ def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
14
+ is_reciprocal1 = (corres_2_to_1[corres_1_to_2] == np.arange(len(corres_1_to_2)))
15
+ pos1 = is_reciprocal1.nonzero()[0]
16
+ pos2 = corres_1_to_2[pos1]
17
+ if ret_recip:
18
+ return is_reciprocal1, pos1, pos2
19
+ return pos1, pos2
20
+
21
+
22
+ def extract_correspondences_from_pts3d(view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0):
23
+ view1, view2 = to_numpy((view1, view2))
24
+ # project pixels from image1 --> 3d points --> image2 pixels
25
+ shape1, corres1_to_2 = reproject_view(view1['pts3d'], view2)
26
+ shape2, corres2_to_1 = reproject_view(view2['pts3d'], view1)
27
+
28
+ # compute reciprocal correspondences:
29
+ # pos1 == valid pixels (correspondences) in image1
30
+ is_reciprocal1, pos1, pos2 = reciprocal_1d(corres1_to_2, corres2_to_1, ret_recip=True)
31
+ is_reciprocal2 = (corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1)))
32
+
33
+ if target_n_corres is None:
34
+ if ret_xy:
35
+ pos1 = unravel_xy(pos1, shape1)
36
+ pos2 = unravel_xy(pos2, shape2)
37
+ return pos1, pos2
38
+
39
+ available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
40
+ target_n_positives = int(target_n_corres * (1 - nneg))
41
+ n_positives = min(len(pos1), target_n_positives)
42
+ n_negatives = min(target_n_corres - n_positives, available_negatives)
43
+
44
+ if n_negatives + n_positives != target_n_corres:
45
+ # should be really rare => when there are not enough negatives
46
+ # in that case, break nneg and add a few more positives ?
47
+ n_positives = target_n_corres - n_negatives
48
+ assert n_positives <= len(pos1)
49
+
50
+ assert n_positives <= len(pos1)
51
+ assert n_positives <= len(pos2)
52
+ assert n_negatives <= (~is_reciprocal1).sum()
53
+ assert n_negatives <= (~is_reciprocal2).sum()
54
+ assert n_positives + n_negatives == target_n_corres
55
+
56
+ valid = np.ones(n_positives, dtype=bool)
57
+ if n_positives < len(pos1):
58
+ # random sub-sampling of valid correspondences
59
+ perm = rng.permutation(len(pos1))[:n_positives]
60
+ pos1 = pos1[perm]
61
+ pos2 = pos2[perm]
62
+
63
+ if n_negatives > 0:
64
+ # add false correspondences if not enough
65
+ def norm(p): return p / p.sum()
66
+ pos1 = np.r_[pos1, rng.choice(shape1[0] * shape1[1], size=n_negatives, replace=False, p=norm(~is_reciprocal1))]
67
+ pos2 = np.r_[pos2, rng.choice(shape2[0] * shape2[1], size=n_negatives, replace=False, p=norm(~is_reciprocal2))]
68
+ valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]
69
+
70
+ # convert (x+W*y) back to 2d (x,y) coordinates
71
+ if ret_xy:
72
+ pos1 = unravel_xy(pos1, shape1)
73
+ pos2 = unravel_xy(pos2, shape2)
74
+ return pos1, pos2, valid
75
+
76
+
77
+ def reproject_view(pts3d, view2):
78
+ shape = view2['pts3d'].shape[:2]
79
+ return reproject(pts3d, view2['camera_intrinsics'], inv(view2['camera_pose']), shape)
80
+
81
+
82
+ def reproject(pts3d, K, world2cam, shape):
83
+ H, W, THREE = pts3d.shape
84
+ assert THREE == 3
85
+
86
+ # reproject in camera2 space
87
+ with np.errstate(divide='ignore', invalid='ignore'):
88
+ pos = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2)
89
+
90
+ # quantize to pixel positions
91
+ return (H, W), ravel_xy(pos, shape)
92
+
93
+
94
+ def ravel_xy(pos, shape):
95
+ H, W = shape
96
+ with np.errstate(invalid='ignore'):
97
+ qx, qy = pos.reshape(-1, 2).round().astype(np.int32).T
98
+ quantized_pos = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(min=0, max=H - 1, out=qy)
99
+ return quantized_pos
100
+
101
+
102
+ def unravel_xy(pos, shape):
103
+ # convert (x+W*y) back to 2d (x,y) coordinates
104
+ return np.unravel_index(pos, shape)[0].base[:, ::-1].copy()
105
+
106
+
107
+ def _rotation_origin_to_pt(target):
108
+ """ Align the origin (0,0,1) with the target point (x,y,1) in projective space.
109
+ Method: rotate around z to bring the target onto (x',0,1), then rotate around y to reach (0,0,1), and undo the z rotation.
110
+ """
111
+ from scipy.spatial.transform import Rotation
112
+ x, y = target
113
+ rot_z = np.arctan2(y, x)
114
+ rot_y = np.arctan(np.linalg.norm(target))
115
+ R = Rotation.from_euler('ZYZ', [rot_z, rot_y, -rot_z]).as_matrix()
116
+ return R
117
+
118
+
119
+ def _dotmv(Trf, pts, ncol=None, norm=False):
120
+ assert Trf.ndim >= 2
121
+ ncol = ncol or pts.shape[-1]
122
+
123
+ # adapt shape if necessary
124
+ output_reshape = pts.shape[:-1]
125
+ if Trf.ndim >= 3:
126
+ n = Trf.ndim - 2
127
+ assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
128
+ Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
129
+
130
+ if pts.ndim > Trf.ndim:
131
+ # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
132
+ pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
133
+ elif pts.ndim == 2:
134
+ # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
135
+ pts = pts[:, None, :]
136
+
137
+ if pts.shape[-1] + 1 == Trf.shape[-1]:
138
+ Trf = Trf.swapaxes(-1, -2) # transpose Trf
139
+ pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
140
+
141
+ elif pts.shape[-1] == Trf.shape[-1]:
142
+ Trf = Trf.swapaxes(-1, -2) # transpose Trf
143
+ pts = pts @ Trf
144
+ else:
145
+ pts = Trf @ pts.T
146
+ if pts.ndim >= 2:
147
+ pts = pts.swapaxes(-1, -2)
148
+
149
+ if norm:
150
+ pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
151
+ if norm != 1:
152
+ pts *= norm
153
+
154
+ res = pts[..., :ncol].reshape(*output_reshape, ncol)
155
+ return res
156
+
157
+
158
+ def crop_to_homography(K, crop, target_size=None):
159
+ """ Given an image and its intrinsics,
160
+ we want to replicate a rectangular crop with an homography,
161
+ so that the principal point of the new 'crop' is centered.
162
+ """
163
+ # build intrinsics for the crop
164
+ crop = np.round(crop)
165
+ crop_size = crop[2:] - crop[:2]
166
+ K2 = K.copy() # same focal
167
+ K2[:2, 2] = crop_size / 2 # new principal point is perfectly centered
168
+
169
+ # find which corner is the most far-away from current principal point
170
+ # so that the final homography does not go over the image borders
171
+ corners = crop.reshape(-1, 2)
172
+ corner_idx = np.abs(corners - K[:2, 2]).argmax(0)
173
+ corner = corners[corner_idx, [0, 1]]
174
+ # align with the corresponding corner from the target view
175
+ corner2 = np.c_[[0, 0], crop_size][[0, 1], corner_idx]
176
+
177
+ old_pt = _dotmv(np.linalg.inv(K), corner, norm=1)
178
+ new_pt = _dotmv(np.linalg.inv(K2), corner2, norm=1)
179
+ R = _rotation_origin_to_pt(old_pt) @ np.linalg.inv(_rotation_origin_to_pt(new_pt))
180
+
181
+ if target_size is not None:
182
+ imsize = target_size
183
+ target_size = np.asarray(target_size)
184
+ scaling = min(target_size / crop_size)
185
+ K2[:2] *= scaling
186
+ K2[:2, 2] = target_size / 2
187
+ else:
188
+ imsize = tuple(np.int32(crop_size).tolist())
189
+
190
+ return imsize, K2, R, K @ R @ np.linalg.inv(K2)
191
+
192
+
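
For reference, a minimal sketch of how the outputs of `crop_to_homography` are consumed (mirroring `generate_crops_from_pair` in the dataset base class earlier in this commit); `img` (a PIL image) and its 3x3 intrinsics `K` are assumed inputs:

    from PIL import Image

    imsize, K_new, R, H = crop_to_homography(K, crop=np.array([10., 10., 522., 394.]),
                                             target_size=(512, 384))
    homo8 = (H / H[2, 2]).ravel().tolist()[:8]   # PIL expects 8 coefficients
    warped = img.transform(imsize, Image.Transform.PERSPECTIVE, homo8,
                           resample=Image.Resampling.BICUBIC)
    # K_new is the intrinsics of `warped`; R is the rotation to fold into the camera pose
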
193
+ def gen_random_crops(imsize, n_crops, resolution, aug_crop, rng=np.random):
194
+ """ Generate random crops of size=resolution,
195
+ for an input image upscaled to (imsize + randint(0 , aug_crop))
196
+ """
197
+ resolution_crop = np.array(resolution) * min(np.array(imsize) / resolution)
198
+
199
+ # (virtually) upscale the input image
200
+ # scaling = rng.uniform(1, 1+(aug_crop+1)/min(imsize))
201
+ scaling = np.exp(rng.uniform(0, np.log(1 + aug_crop / min(imsize))))
202
+ imsize2 = np.int32(np.array(imsize) * scaling)
203
+
204
+ # generate some random crops
205
+ topleft = rng.random((n_crops, 2)) * (imsize2 - resolution_crop)
206
+ crops = np.c_[topleft, topleft + resolution_crop]
207
+ # print(f"{scaling=}, {topleft=}")
208
+ # reduce the resolution to come back to original size
209
+ crops /= scaling
210
+ return crops
211
+
212
+
213
+ def in2d_rect(corres, crops):
214
+ # corres = (N,2)
215
+ # crops = (M,4)
216
+ # output = (N, M)
217
+ is_sup = (corres[:, None] >= crops[None, :, 0:2])
218
+ is_inf = (corres[:, None] < crops[None, :, 2:4])
219
+ return (is_sup & is_inf).all(axis=-1)
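
A minimal sketch of how `gen_random_crops` and `in2d_rect` combine to pick the pair of crops sharing the most correspondences, as done in `generate_crops_from_pair`; the correspondences here are random stand-ins:

    rng = np.random.default_rng(0)
    imsize = (640, 480)                           # (W, H)
    crops1 = gen_random_crops(imsize, 4, (512, 384), aug_crop=64, rng=rng)
    crops2 = gen_random_crops(imsize, 4, (512, 384), aug_crop=64, rng=rng)

    corres1 = rng.random((100, 2)) * imsize       # fake matching pixels in image 1
    corres2 = rng.random((100, 2)) * imsize       # ... and their matches in image 2

    msk1 = in2d_rect(corres1, crops1)             # (100, 4) visibility masks
    msk2 = in2d_rect(corres2, crops2)
    shared = np.float32(msk1).T @ np.float32(msk2)     # (4, 4) co-visible counts
    best1, best2 = np.unravel_index(shared.argmax(), shared.shape)
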
mast3r/fast_nn.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # MASt3R Fast Nearest Neighbor
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import numpy as np
9
+ import math
10
+ from scipy.spatial import KDTree
11
+
12
+ import mast3r.utils.path_to_dust3r # noqa
13
+ from dust3r.utils.device import to_numpy, todevice # noqa
14
+
15
+
16
+ @torch.no_grad()
17
+ def bruteforce_reciprocal_nns(A, B, device='cuda', block_size=None, dist='l2'):
18
+ if isinstance(A, np.ndarray):
19
+ A = torch.from_numpy(A).to(device)
20
+ if isinstance(B, np.ndarray):
21
+ B = torch.from_numpy(B).to(device)
22
+
23
+ A = A.to(device)
24
+ B = B.to(device)
25
+
26
+ if dist == 'l2':
27
+ dist_func = torch.cdist
28
+ argmin = torch.min
29
+ elif dist == 'dot':
30
+ def dist_func(A, B):
31
+ return A @ B.T
32
+
33
+ def argmin(X, dim):
34
+ sim, nn = torch.max(X, dim=dim)
35
+ return sim.neg_(), nn
36
+ else:
37
+ raise ValueError(f'Unknown {dist=}')
38
+
39
+ if block_size is None or len(A) * len(B) <= block_size**2:
40
+ dists = dist_func(A, B)
41
+ _, nn_A = argmin(dists, dim=1)
42
+ _, nn_B = argmin(dists, dim=0)
43
+ else:
44
+ dis_A = torch.full((A.shape[0],), float('inf'), device=device, dtype=A.dtype)
45
+ dis_B = torch.full((B.shape[0],), float('inf'), device=device, dtype=B.dtype)
46
+ nn_A = torch.full((A.shape[0],), -1, device=device, dtype=torch.int64)
47
+ nn_B = torch.full((B.shape[0],), -1, device=device, dtype=torch.int64)
48
+ number_of_iteration_A = math.ceil(A.shape[0] / block_size)
49
+ number_of_iteration_B = math.ceil(B.shape[0] / block_size)
50
+
51
+ for i in range(number_of_iteration_A):
52
+ A_i = A[i * block_size:(i + 1) * block_size]
53
+ for j in range(number_of_iteration_B):
54
+ B_j = B[j * block_size:(j + 1) * block_size]
55
+ dists_blk = dist_func(A_i, B_j)  # (block_A, block_B) pairwise distances
56
+ # dists_blk = dists[i * block_size:(i+1)*block_size, j * block_size:(j+1)*block_size]
57
+ min_A_i, argmin_A_i = argmin(dists_blk, dim=1)
58
+ min_B_j, argmin_B_j = argmin(dists_blk, dim=0)
59
+
60
+ col_mask = min_A_i < dis_A[i * block_size:(i + 1) * block_size]
61
+ line_mask = min_B_j < dis_B[j * block_size:(j + 1) * block_size]
62
+
63
+ dis_A[i * block_size:(i + 1) * block_size][col_mask] = min_A_i[col_mask]
64
+ dis_B[j * block_size:(j + 1) * block_size][line_mask] = min_B_j[line_mask]
65
+
66
+ nn_A[i * block_size:(i + 1) * block_size][col_mask] = argmin_A_i[col_mask] + (j * block_size)
67
+ nn_B[j * block_size:(j + 1) * block_size][line_mask] = argmin_B_j[line_mask] + (i * block_size)
68
+ nn_A = nn_A.cpu().numpy()
69
+ nn_B = nn_B.cpu().numpy()
70
+ return nn_A, nn_B
71
+
72
+
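
A quick sanity-check sketch of the block-wise brute-force matcher on random descriptors (CPU here to stay hardware-agnostic; `block_size=512` forces the blocked code path for these sizes):

    A = torch.randn(1000, 24)
    B = torch.randn(1200, 24)
    nn_A, nn_B = bruteforce_reciprocal_nns(A, B, device='cpu', block_size=512)
    mutual = nn_B[nn_A] == np.arange(len(A))   # True where the match is reciprocal
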
73
+ class cdistMatcher:
74
+ def __init__(self, db_pts, device='cuda'):
75
+ self.db_pts = db_pts.to(device)
76
+ self.device = device
77
+
78
+ def query(self, queries, k=1, **kw):
79
+ assert k == 1
80
+ if queries.numel() == 0:
81
+ return None, []
82
+ nnA, nnB = bruteforce_reciprocal_nns(queries, self.db_pts, device=self.device, **kw)
83
+ dis = None
84
+ return dis, nnA
85
+
86
+
87
+ def merge_corres(idx1, idx2, shape1=None, shape2=None, ret_xy=True, ret_index=False):
88
+ assert idx1.dtype == idx2.dtype == np.int32
89
+
90
+ # unique and sort along idx1
91
+ corres = np.unique(np.c_[idx2, idx1].view(np.int64), return_index=ret_index)
92
+ if ret_index:
93
+ corres, indices = corres
94
+ xy2, xy1 = corres[:, None].view(np.int32).T
95
+
96
+ if ret_xy:
97
+ assert shape1 and shape2
98
+ xy1 = np.unravel_index(xy1, shape1)
99
+ xy2 = np.unravel_index(xy2, shape2)
100
+ if ret_xy != 'y_x':
101
+ xy1 = xy1[0].base[:, ::-1]
102
+ xy2 = xy2[0].base[:, ::-1]
103
+
104
+ if ret_index:
105
+ return xy1, xy2, indices
106
+ return xy1, xy2
107
+
108
+
109
+ def fast_reciprocal_NNs(pts1, pts2, subsample_or_initxy1=8, ret_xy=True, pixel_tol=0, ret_basin=False,
110
+ device='cuda', **matcher_kw):
111
+ H1, W1, DIM1 = pts1.shape
112
+ H2, W2, DIM2 = pts2.shape
113
+ assert DIM1 == DIM2
114
+
115
+ pts1 = pts1.reshape(-1, DIM1)
116
+ pts2 = pts2.reshape(-1, DIM2)
117
+
118
+ if isinstance(subsample_or_initxy1, int) and pixel_tol == 0:
119
+ S = subsample_or_initxy1
120
+ y1, x1 = np.mgrid[S // 2:H1:S, S // 2:W1:S].reshape(2, -1)
121
+ max_iter = 10
122
+ else:
123
+ x1, y1 = subsample_or_initxy1
124
+ if isinstance(x1, torch.Tensor):
125
+ x1 = x1.cpu().numpy()
126
+ if isinstance(y1, torch.Tensor):
127
+ y1 = y1.cpu().numpy()
128
+ max_iter = 1
129
+
130
+ xy1 = np.int32(np.unique(x1 + W1 * y1)) # make sure there are no duplicates
131
+ xy2 = np.full_like(xy1, -1)
132
+ old_xy1 = xy1.copy()
133
+ old_xy2 = xy2.copy()
134
+
135
+ if (isinstance(device, str) and device.startswith('cuda')) or (isinstance(device, torch.device) and device.type.startswith('cuda')):
136
+ pts1 = pts1.to(device)
137
+ pts2 = pts2.to(device)
138
+ tree1 = cdistMatcher(pts1, device=device)
139
+ tree2 = cdistMatcher(pts2, device=device)
140
+ else:
141
+ pts1, pts2 = to_numpy((pts1, pts2))
142
+ tree1 = KDTree(pts1)
143
+ tree2 = KDTree(pts2)
144
+
145
+ notyet = np.ones(len(xy1), dtype=bool)
146
+ basin = np.full((H1 * W1 + 1,), -1, dtype=np.int32) if ret_basin else None
147
+
148
+ niter = 0
149
+ # n_notyet = [len(notyet)]
150
+ while notyet.any():
151
+ _, xy2[notyet] = to_numpy(tree2.query(pts1[xy1[notyet]], **matcher_kw))
152
+ if not ret_basin:
153
+ notyet &= (old_xy2 != xy2) # remove points that have converged
154
+
155
+ _, xy1[notyet] = to_numpy(tree1.query(pts2[xy2[notyet]], **matcher_kw))
156
+ if ret_basin:
157
+ basin[old_xy1[notyet]] = xy1[notyet]
158
+ notyet &= (old_xy1 != xy1) # remove points that have converged
159
+
160
+ # n_notyet.append(notyet.sum())
161
+ niter += 1
162
+ if niter >= max_iter:
163
+ break
164
+
165
+ old_xy2[:] = xy2
166
+ old_xy1[:] = xy1
167
+
168
+ # print('notyet_stats:', ' '.join(map(str, (n_notyet+[0]*10)[:max_iter])))
169
+
170
+ if pixel_tol > 0:
171
+ # in case we only want to match some specific points
172
+ # and still have some way of checking reciprocity
173
+ old_yx1 = np.unravel_index(old_xy1, (H1, W1))[0].base
174
+ new_yx1 = np.unravel_index(xy1, (H1, W1))[0].base
175
+ dis = np.linalg.norm(old_yx1 - new_yx1, axis=-1)
176
+ converged = dis < pixel_tol
177
+ if not isinstance(subsample_or_initxy1, int):
178
+ xy1 = old_xy1 # replace new points by old ones
179
+ else:
180
+ converged = ~notyet # converged correspondences
181
+
182
+ # keep only unique correspondences, and sort on xy1
183
+ xy1, xy2 = merge_corres(xy1[converged], xy2[converged], (H1, W1), (H2, W2), ret_xy=ret_xy)
184
+ if ret_basin:
185
+ return xy1, xy2, basin
186
+ return xy1, xy2
187
+
188
+
189
+ def extract_correspondences_nonsym(A, B, confA, confB, subsample=8, device=None, ptmap_key='pred_desc', pixel_tol=0):
190
+ if '3d' in ptmap_key:
191
+ opt = dict(device='cpu', workers=32)
192
+ else:
193
+ opt = dict(device=device, dist='dot', block_size=2**13)
194
+
195
+ # matching the two pairs
196
+ idx1 = []
197
+ idx2 = []
198
+ # merge corres from opposite pairs
199
+ HA, WA = A.shape[:2]
200
+ HB, WB = B.shape[:2]
201
+ if pixel_tol == 0:
202
+ nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=subsample, ret_xy=False, **opt)
203
+ nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=subsample, ret_xy=False, **opt)
204
+ else:
205
+ S = subsample
206
+ yA, xA = np.mgrid[S // 2:HA:S, S // 2:WA:S].reshape(2, -1)
207
+ yB, xB = np.mgrid[S // 2:HB:S, S // 2:WB:S].reshape(2, -1)
208
+
209
+ nn1to2 = fast_reciprocal_NNs(A, B, subsample_or_initxy1=(xA, yA), ret_xy=False, pixel_tol=pixel_tol, **opt)
210
+ nn2to1 = fast_reciprocal_NNs(B, A, subsample_or_initxy1=(xB, yB), ret_xy=False, pixel_tol=pixel_tol, **opt)
211
+
212
+ idx1 = np.r_[nn1to2[0], nn2to1[1]]
213
+ idx2 = np.r_[nn1to2[1], nn2to1[0]]
214
+
215
+ c1 = confA.ravel()[idx1]
216
+ c2 = confB.ravel()[idx2]
217
+
218
+ xy1, xy2, idx = merge_corres(idx1, idx2, (HA, WA), (HB, WB), ret_xy=True, ret_index=True)
219
+ conf = np.minimum(c1[idx], c2[idx])
220
+ corres = (xy1.copy(), xy2.copy(), conf)
221
+ return todevice(corres, device)
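
A minimal sketch of `fast_reciprocal_NNs` on two random feature maps standing in for MASt3R's dense descriptors (with device='cpu', the matching goes through scipy's KDTree):

    feat1 = torch.randn(48, 64, 24)   # H, W, D
    feat2 = torch.randn(48, 64, 24)
    xy1, xy2 = fast_reciprocal_NNs(feat1, feat2, subsample_or_initxy1=8, device='cpu')
    # xy1[i] in image 1 and xy2[i] in image 2 form one reciprocal (x, y) match
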
mast3r/losses.py ADDED
@@ -0,0 +1,514 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Implementation of MASt3R training losses
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+ from sklearn.metrics import average_precision_score
11
+
12
+ import mast3r.utils.path_to_dust3r # noqa
13
+ from dust3r.losses import BaseCriterion, Criterion, MultiLoss, Sum, ConfLoss
14
+ from dust3r.losses import Regr3D as Regr3D_dust3r
15
+ from dust3r.utils.geometry import (geotrf, inv, normalize_pointcloud)
16
+ from dust3r.inference import get_pred_pts3d
17
+ from dust3r.utils.geometry import get_joint_pointcloud_depth, get_joint_pointcloud_center_scale
18
+
19
+
20
+ def apply_log_to_norm(xyz):
21
+ d = xyz.norm(dim=-1, keepdim=True)
22
+ xyz = xyz / d.clip(min=1e-8)
23
+ xyz = xyz * torch.log1p(d)
24
+ return xyz
25
+
26
+
27
+ class Regr3D (Regr3D_dust3r):
28
+ def __init__(self, criterion, norm_mode='avg_dis', gt_scale=False, opt_fit_gt=False,
29
+ sky_loss_value=2, max_metric_scale=False, loss_in_log=False):
30
+ self.loss_in_log = loss_in_log
31
+ if norm_mode.startswith('?'):
32
+ # do no norm pts from metric scale datasets
33
+ self.norm_all = False
34
+ self.norm_mode = norm_mode[1:]
35
+ else:
36
+ self.norm_all = True
37
+ self.norm_mode = norm_mode
38
+ super().__init__(criterion, self.norm_mode, gt_scale)
39
+
40
+ self.sky_loss_value = sky_loss_value
41
+ self.max_metric_scale = max_metric_scale
42
+
43
+ def get_all_pts3d(self, gt1, gt2, pred1, pred2, dist_clip=None):
44
+ # everything is normalized w.r.t. camera of view1
45
+ in_camera1 = inv(gt1['camera_pose'])
46
+ gt_pts1 = geotrf(in_camera1, gt1['pts3d']) # B,H,W,3
47
+ gt_pts2 = geotrf(in_camera1, gt2['pts3d']) # B,H,W,3
48
+
49
+ valid1 = gt1['valid_mask'].clone()
50
+ valid2 = gt2['valid_mask'].clone()
51
+
52
+ if dist_clip is not None:
53
+ # points that are too far-away == invalid
54
+ dis1 = gt_pts1.norm(dim=-1) # (B, H, W)
55
+ dis2 = gt_pts2.norm(dim=-1) # (B, H, W)
56
+ valid1 = valid1 & (dis1 <= dist_clip)
57
+ valid2 = valid2 & (dis2 <= dist_clip)
58
+
59
+ if self.loss_in_log == 'before':
60
+ # this only makes sense when depth_mode == 'linear'
61
+ gt_pts1 = apply_log_to_norm(gt_pts1)
62
+ gt_pts2 = apply_log_to_norm(gt_pts2)
63
+
64
+ pr_pts1 = get_pred_pts3d(gt1, pred1, use_pose=False).clone()
65
+ pr_pts2 = get_pred_pts3d(gt2, pred2, use_pose=True).clone()
66
+
67
+ if not self.norm_all:
68
+ if self.max_metric_scale:
69
+ B = valid1.shape[0]
70
+ # valid1: B, H, W
71
+ # torch.linalg.norm(gt_pts1, dim=-1) -> B, H, W
72
+ # dist1_to_cam1 -> reshape to B, H*W
73
+ dist1_to_cam1 = torch.where(valid1, torch.linalg.norm(gt_pts1, dim=-1), 0).view(B, -1)
74
+ dist2_to_cam1 = torch.where(valid2, torch.linalg.norm(gt_pts2, dim=-1), 0).view(B, -1)
75
+
76
+ # is_metric_scale: B
77
+ # dist1_to_cam1.max(dim=-1).values -> B
78
+ gt1['is_metric_scale'] = gt1['is_metric_scale'] \
79
+ & (dist1_to_cam1.max(dim=-1).values < self.max_metric_scale) \
80
+ & (dist2_to_cam1.max(dim=-1).values < self.max_metric_scale)
81
+ gt2['is_metric_scale'] = gt1['is_metric_scale']
82
+
83
+ mask = ~gt1['is_metric_scale']
84
+ else:
85
+ mask = torch.ones_like(gt1['is_metric_scale'])
86
+ # normalize 3d points
87
+ if self.norm_mode and mask.any():
88
+ pr_pts1[mask], pr_pts2[mask] = normalize_pointcloud(pr_pts1[mask], pr_pts2[mask], self.norm_mode,
89
+ valid1[mask], valid2[mask])
90
+
91
+ if self.norm_mode and not self.gt_scale:
92
+ gt_pts1, gt_pts2, norm_factor = normalize_pointcloud(gt_pts1, gt_pts2, self.norm_mode,
93
+ valid1, valid2, ret_factor=True)
94
+ # apply the same normalization to prediction
95
+ pr_pts1[~mask] = pr_pts1[~mask] / norm_factor[~mask]
96
+ pr_pts2[~mask] = pr_pts2[~mask] / norm_factor[~mask]
97
+
98
+ # return sky segmentation, making sure they don't include any labelled 3d points
99
+ sky1 = gt1['sky_mask'] & (~valid1)
100
+ sky2 = gt2['sky_mask'] & (~valid2)
101
+ return gt_pts1, gt_pts2, pr_pts1, pr_pts2, valid1, valid2, sky1, sky2, {}
102
+
103
+ def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
104
+ gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
105
+ self.get_all_pts3d(gt1, gt2, pred1, pred2, **kw)
106
+
107
+ if self.sky_loss_value > 0:
108
+ assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss'
109
+ # add the sky pixel as "valid" pixels...
110
+ mask1 = mask1 | sky1
111
+ mask2 = mask2 | sky2
112
+
113
+ # loss on img1 side
114
+ pred_pts1 = pred_pts1[mask1]
115
+ gt_pts1 = gt_pts1[mask1]
116
+ if self.loss_in_log and self.loss_in_log != 'before':
117
+ # this only make sense when depth_mode == 'exp'
118
+ pred_pts1 = apply_log_to_norm(pred_pts1)
119
+ gt_pts1 = apply_log_to_norm(gt_pts1)
120
+ l1 = self.criterion(pred_pts1, gt_pts1)
121
+
122
+ # loss on gt2 side
123
+ pred_pts2 = pred_pts2[mask2]
124
+ gt_pts2 = gt_pts2[mask2]
125
+ if self.loss_in_log and self.loss_in_log != 'before':
126
+ pred_pts2 = apply_log_to_norm(pred_pts2)
127
+ gt_pts2 = apply_log_to_norm(gt_pts2)
128
+ l2 = self.criterion(pred_pts2, gt_pts2)
129
+
130
+ if self.sky_loss_value > 0:
131
+ assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss'
132
+ # ... but force the loss to be high there
133
+ l1 = torch.where(sky1[mask1], self.sky_loss_value, l1)
134
+ l2 = torch.where(sky2[mask2], self.sky_loss_value, l2)
135
+ self_name = type(self).__name__
136
+ details = {self_name + '_pts3d_1': float(l1.mean()), self_name + '_pts3d_2': float(l2.mean())}
137
+ return Sum((l1, mask1), (l2, mask2)), (details | monitoring)
138
+
139
+
140
+ class Regr3D_ShiftInv (Regr3D):
141
+ """ Same than Regr3D but invariant to depth shift.
142
+ """
143
+
144
+ def get_all_pts3d(self, gt1, gt2, pred1, pred2):
145
+ # compute unnormalized points
146
+ gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
147
+ super().get_all_pts3d(gt1, gt2, pred1, pred2)
148
+
149
+ # compute median depth
150
+ gt_z1, gt_z2 = gt_pts1[..., 2], gt_pts2[..., 2]
151
+ pred_z1, pred_z2 = pred_pts1[..., 2], pred_pts2[..., 2]
152
+ gt_shift_z = get_joint_pointcloud_depth(gt_z1, gt_z2, mask1, mask2)[:, None, None]
153
+ pred_shift_z = get_joint_pointcloud_depth(pred_z1, pred_z2, mask1, mask2)[:, None, None]
154
+
155
+ # subtract the median depth
156
+ gt_z1 -= gt_shift_z
157
+ gt_z2 -= gt_shift_z
158
+ pred_z1 -= pred_shift_z
159
+ pred_z2 -= pred_shift_z
160
+
161
+ # monitoring = dict(monitoring, gt_shift_z=gt_shift_z.mean().detach(), pred_shift_z=pred_shift_z.mean().detach())
162
+ return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring
163
+
164
+
165
+ class Regr3D_ScaleInv (Regr3D):
166
+ """ Same than Regr3D but invariant to depth scale.
167
+ if gt_scale == True: enforce the prediction to take the same scale than GT
168
+ """
169
+
170
+ def get_all_pts3d(self, gt1, gt2, pred1, pred2):
171
+ # compute depth-normalized points
172
+ gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring = \
173
+ super().get_all_pts3d(gt1, gt2, pred1, pred2)
174
+
175
+ # measure scene scale
176
+ _, gt_scale = get_joint_pointcloud_center_scale(gt_pts1, gt_pts2, mask1, mask2)
177
+ _, pred_scale = get_joint_pointcloud_center_scale(pred_pts1, pred_pts2, mask1, mask2)
178
+
179
+ # prevent predictions to be in a ridiculous range
180
+ pred_scale = pred_scale.clip(min=1e-3, max=1e3)
181
+
182
+ # subtract the median depth
183
+ if self.gt_scale:
184
+ pred_pts1 *= gt_scale / pred_scale
185
+ pred_pts2 *= gt_scale / pred_scale
186
+ # monitoring = dict(monitoring, pred_scale=(pred_scale/gt_scale).mean())
187
+ else:
188
+ gt_pts1 /= gt_scale
189
+ gt_pts2 /= gt_scale
190
+ pred_pts1 /= pred_scale
191
+ pred_pts2 /= pred_scale
192
+ # monitoring = dict(monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach())
193
+
194
+ return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, sky1, sky2, monitoring
195
+
196
+
197
+ class Regr3D_ScaleShiftInv (Regr3D_ScaleInv, Regr3D_ShiftInv):
198
+ # calls Regr3D_ShiftInv first, then Regr3D_ScaleInv
199
+ pass
200
+
201
+
202
+ def get_similarities(desc1, desc2, euc=False):
203
+ if euc: # euclidean distance mapped to the same range as similarities
204
+ dists = (desc1[:, :, None] - desc2[:, None]).norm(dim=-1)
205
+ sim = 1 / (1 + dists)
206
+ else:
207
+ # Compute similarities
208
+ sim = desc1 @ desc2.transpose(-2, -1)
209
+ return sim
210
+
211
+
212
+ class MatchingCriterion(BaseCriterion):
213
+ def __init__(self, reduction='mean', fp=torch.float32):
214
+ super().__init__(reduction)
215
+ self.fp = fp
216
+
217
+ def forward(self, a, b, valid_matches=None, euc=False):
218
+ assert a.ndim >= 2 and 1 <= a.shape[-1], f'Bad shape = {a.shape}'
219
+ dist = self.loss(a.to(self.fp), b.to(self.fp), valid_matches, euc=euc)
220
+ # one dimension less or reduction to single value
221
+ assert (valid_matches is None and dist.ndim == a.ndim -
222
+ 1) or self.reduction in ['mean', 'sum', '1-mean', 'none']
223
+ if self.reduction == 'none':
224
+ return dist
225
+ if self.reduction == 'sum':
226
+ return dist.sum()
227
+ if self.reduction == 'mean':
228
+ return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
229
+ if self.reduction == '1-mean':
230
+ return 1. - dist.mean() if dist.numel() > 0 else dist.new_ones(())
231
+ raise ValueError(f'bad {self.reduction=} mode')
232
+
233
+ def loss(self, a, b, valid_matches=None):
234
+ raise NotImplementedError
235
+
236
+
237
+ class InfoNCE(MatchingCriterion):
238
+ def __init__(self, temperature=0.07, eps=1e-8, mode='all', **kwargs):
239
+ super().__init__(**kwargs)
240
+ self.temperature = temperature
241
+ self.eps = eps
242
+ assert mode in ['all', 'proper', 'dual']
243
+ self.mode = mode
244
+
245
+ def loss(self, desc1, desc2, valid_matches=None, euc=False):
246
+ # valid positives are along diagonals
247
+ B, N, D = desc1.shape
248
+ B2, N2, D2 = desc2.shape
249
+ assert B == B2 and D == D2
250
+ if valid_matches is None:
251
+ valid_matches = torch.ones([B, N], dtype=bool)
252
+ # note: torch.all(valid_matches.sum(dim=-1) > 0) may not hold, since some pairs can have no matches
253
+ assert valid_matches.shape == torch.Size([B, N]) and valid_matches.sum() > 0
254
+
255
+ # Tempered similarities
256
+ sim = get_similarities(desc1, desc2, euc) / self.temperature
257
+ sim[sim.isnan()] = -torch.inf # ignore nans
258
+ # Softmax of positives with temperature
259
+ sim = sim.exp_() # save peak memory
260
+ positives = sim.diagonal(dim1=-2, dim2=-1)
261
+
262
+ # Loss
263
+ if self.mode == 'all': # Previous InfoNCE
264
+ loss = -torch.log((positives / sim.sum(dim=-1).sum(dim=-1, keepdim=True)).clip(self.eps))
265
+ elif self.mode == 'proper': # Proper InfoNCE
266
+ loss = -(torch.log((positives / sim.sum(dim=-2)).clip(self.eps)) +
267
+ torch.log((positives / sim.sum(dim=-1)).clip(self.eps)))
268
+ elif self.mode == 'dual': # Dual Softmax
269
+ loss = -(torch.log((positives**2 / sim.sum(dim=-1) / sim.sum(dim=-2)).clip(self.eps)))
270
+ else:
271
+ raise ValueError("This should not happen...")
272
+ return loss[valid_matches]
273
+
274
+
275
+ class APLoss (MatchingCriterion):
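
A toy sketch of the criterion on random unit-normalized descriptors, where the positives are the diagonal pairs:

    import torch.nn.functional as F

    crit = InfoNCE(temperature=0.07, mode='proper', reduction='mean')
    desc1 = F.normalize(torch.randn(2, 256, 24), dim=-1)   # B, N, D
    desc2 = F.normalize(torch.randn(2, 256, 24), dim=-1)
    loss = crit(desc1, desc2)   # scalar; the i-th descriptors of each view are positives
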
276
+ """ AP loss.
277
+
278
+ Input: (N, M) values in [min, max]
279
+ label: (N, M) values in {0, 1}
280
+
281
+ Returns: 1 - mAP (mean AP for each n in {1..N})
282
+ Note: typically, this is what you want to minimize
283
+ """
284
+
285
+ def __init__(self, nq='torch', min=0, max=1, euc=False, **kw):
286
+ super().__init__(**kw)
287
+ # Exact/True AP loss (not differentiable)
288
+ if nq == 0:
289
+ nq = 'sklearn' # special case
290
+ try:
291
+ self.compute_AP = eval('self.compute_true_AP_' + nq)
292
+ except:
293
+ raise ValueError("Unknown mode %s for AP loss" % nq)
294
+
295
+ @staticmethod
296
+ def compute_true_AP_sklearn(scores, labels):
297
+ def compute_AP(label, score):
298
+ return average_precision_score(label, score)
299
+
300
+ aps = scores.new_zeros((scores.shape[0], scores.shape[1]))
301
+ label_np = labels.cpu().numpy().astype(bool)
302
+ scores_np = scores.cpu().numpy()
303
+ for bi in range(scores_np.shape[0]):
304
+ for i in range(scores_np.shape[1]):
305
+ labels = label_np[bi, i, :]
306
+ if labels.sum() < 1:
307
+ continue
308
+ aps[bi, i] = compute_AP(labels, scores_np[bi, i, :])
309
+ return aps
310
+
311
+ @staticmethod
312
+ def compute_true_AP_torch(scores, labels):
313
+ assert scores.shape == labels.shape
314
+ B, N, M = labels.shape
315
+ dev = labels.device
316
+ with torch.no_grad():
317
+ # sort scores
318
+ _, order = scores.sort(dim=-1, descending=True)
319
+ # sort labels accordingly
320
+ labels = labels[torch.arange(B, device=dev)[:, None, None].expand(order.shape),
321
+ torch.arange(N, device=dev)[None, :, None].expand(order.shape),
322
+ order]
323
+ # compute number of positives per query
324
+ npos = labels.sum(dim=-1)
325
+ assert torch.all(torch.isclose(npos, npos[0, 0])
326
+ ), "only implemented for constant number of positives per query"
327
+ npos = int(npos[0, 0])
328
+ # compute precision at each recall point
329
+ posrank = labels.nonzero()[:, -1].view(B, N, npos)
330
+ recall = torch.arange(1, 1 + npos, dtype=torch.float32, device=dev)[None, None, :].expand(B, N, npos)
331
+ precision = recall / (1 + posrank).float()
332
+ # average precision values at all recall points
333
+ aps = precision.mean(dim=-1)
334
+
335
+ return aps
336
+
337
+ def loss(self, desc1, desc2, valid_matches=None, euc=False): # if matches is None, positives are the diagonal
338
+ B, N1, D = desc1.shape
339
+ B2, N2, D2 = desc2.shape
340
+ assert B == B2 and D == D2
341
+
342
+ scores = get_similarities(desc1, desc2, euc)
343
+
344
+ labels = torch.zeros([B, N1, N2], dtype=scores.dtype, device=scores.device)
345
+
346
+ # allow all diagonal positives and only mask afterwards
347
+ labels.diagonal(dim1=-2, dim2=-1)[...] = 1.
348
+ apscore = self.compute_AP(scores, labels)
349
+ if valid_matches is not None:
350
+ apscore = apscore[valid_matches]
351
+ return apscore
352
+
353
+
354
+ class MatchingLoss (Criterion, MultiLoss):
355
+ """
356
+ Matching loss per image
357
+ only compares pixels within each image pair, not across the whole batch as is usually done
358
+ """
359
+
360
+ def __init__(self, criterion, withconf=False, use_pts3d=False, negatives_padding=0, blocksize=4096):
361
+ super().__init__(criterion)
362
+ self.negatives_padding = negatives_padding
363
+ self.use_pts3d = use_pts3d
364
+ self.blocksize = blocksize
365
+ self.withconf = withconf
366
+
367
+ def add_negatives(self, outdesc2, desc2, batchid, x2, y2):
368
+ if self.negatives_padding:
369
+ B, H, W, D = desc2.shape
370
+ negatives = torch.ones([B, H, W], device=desc2.device, dtype=bool)
371
+ negatives[batchid, y2, x2] = False
372
+ sel = negatives & (negatives.view([B, -1]).cumsum(dim=-1).view(B, H, W)
373
+ <= self.negatives_padding) # take the N-first negatives
374
+ outdesc2 = torch.cat([outdesc2, desc2[sel].view([B, -1, D])], dim=1)
375
+ return outdesc2
376
+
377
+ def get_confs(self, pred1, pred2, sel1, sel2):
378
+ if self.withconf:
379
+ if self.use_pts3d:
380
+ outconfs1 = pred1['conf'][sel1]
381
+ outconfs2 = pred2['conf'][sel2]
382
+ else:
383
+ outconfs1 = pred1['desc_conf'][sel1]
384
+ outconfs2 = pred2['desc_conf'][sel2]
385
+ else:
386
+ outconfs1 = outconfs2 = None
387
+ return outconfs1, outconfs2
388
+
389
+ def get_descs(self, pred1, pred2):
390
+ if self.use_pts3d:
391
+ desc1, desc2 = pred1['pts3d'], pred2['pts3d_in_other_view']
392
+ else:
393
+ desc1, desc2 = pred1['desc'], pred2['desc']
394
+ return desc1, desc2
395
+
396
+ def get_matching_descs(self, gt1, gt2, pred1, pred2, **kw):
397
+ outdesc1 = outdesc2 = outconfs1 = outconfs2 = None
398
+ # Recover descs, GT corres and valid mask
399
+ desc1, desc2 = self.get_descs(pred1, pred2)
400
+
401
+ (x1, y1), (x2, y2) = gt1['corres'].unbind(-1), gt2['corres'].unbind(-1)
402
+ valid_matches = gt1['valid_corres']
403
+
404
+ # Select descs that have GT matches
405
+ B, N = x1.shape
406
+ batchid = torch.arange(B)[:, None].repeat(1, N) # B, N
407
+ outdesc1, outdesc2 = desc1[batchid, y1, x1], desc2[batchid, y2, x2] # B, N, D
408
+
409
+ # Padd with unused negatives
410
+ outdesc2 = self.add_negatives(outdesc2, desc2, batchid, x2, y2)
411
+
412
+ # Gather confs if needed
413
+ sel1 = batchid, y1, x1
414
+ sel2 = batchid, y2, x2
415
+ outconfs1, outconfs2 = self.get_confs(pred1, pred2, sel1, sel2)
416
+
417
+ return outdesc1, outdesc2, outconfs1, outconfs2, valid_matches, {'use_euclidean_dist': self.use_pts3d}
418
+
419
+ def blockwise_criterion(self, descs1, descs2, confs1, confs2, valid_matches, euc, rng=np.random, shuffle=True):
420
+ loss = None
421
+ details = {}
422
+ B, N, D = descs1.shape
423
+
424
+ if N <= self.blocksize: # Blocks are larger than provided descs, compute regular loss
425
+ loss = self.criterion(descs1, descs2, valid_matches, euc=euc)
426
+ else: # Compute criterion on the blockdiagonal only, after shuffling
427
+ # Shuffle if necessary
428
+ matches_perm = slice(None)
429
+ if shuffle:
430
+ matches_perm = np.stack([rng.choice(range(N), size=N, replace=False) for _ in range(B)])
431
+ batchid = torch.tile(torch.arange(B), (N, 1)).T
432
+ matches_perm = batchid, matches_perm
433
+
434
+ descs1 = descs1[matches_perm]
435
+ descs2 = descs2[matches_perm]
436
+ valid_matches = valid_matches[matches_perm]
437
+
438
+ assert N % self.blocksize == 0, "Error, can't chunk block-diagonal, please check blocksize"
439
+ n_chunks = N // self.blocksize
440
+ descs1 = descs1.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D]
441
+ descs2 = descs2.reshape([B * n_chunks, self.blocksize, D]) # [B*(N//blocksize), blocksize, D]
442
+ valid_matches = valid_matches.view([B * n_chunks, self.blocksize])
443
+ loss = self.criterion(descs1, descs2, valid_matches, euc=euc)
444
+ if self.withconf:
445
+ confs1, confs2 = map(lambda x: x[matches_perm], (confs1, confs2)) # apply perm to confidences if needed
446
+
447
+ if self.withconf:
448
+ # split confidences between positives/negatives for loss computation
449
+ details['conf_pos'] = map(lambda x: x[valid_matches.view(B, -1)], (confs1, confs2))
450
+ details['conf_neg'] = map(lambda x: x[~valid_matches.view(B, -1)], (confs1, confs2))
451
+ details['Conf1_std'] = confs1.std()
452
+ details['Conf2_std'] = confs2.std()
453
+
454
+ return loss, details
455
+
456
+ def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
457
+ # Gather preds and GT
458
+ descs1, descs2, confs1, confs2, valid_matches, monitoring = self.get_matching_descs(
459
+ gt1, gt2, pred1, pred2, **kw)
460
+
461
+ # loss on matches
462
+ loss, details = self.blockwise_criterion(descs1, descs2, confs1, confs2,
463
+ valid_matches, euc=monitoring.pop('use_euclidean_dist', False))
464
+
465
+ details[type(self).__name__] = float(loss.mean())
466
+ return loss, (details | monitoring)
467
+
468
+
469
+ class ConfMatchingLoss(ConfLoss):
470
+ """ Weight matching by learned confidence. Same as ConfLoss but for a matching criterion
471
+ Assuming the input matching_loss is a match-level loss.
472
+ """
473
+
474
+ def __init__(self, pixel_loss, alpha=1., confmode='prod', neg_conf_loss_quantile=False):
475
+ super().__init__(pixel_loss, alpha)
476
+ self.pixel_loss.withconf = True
477
+ self.confmode = confmode
478
+ self.neg_conf_loss_quantile = neg_conf_loss_quantile
479
+
480
+ def aggregate_confs(self, confs1, confs2): # get the confidences resulting from the two view predictions
481
+ if self.confmode == 'prod':
482
+ confs = confs1 * confs2 if confs1 is not None and confs2 is not None else 1.
483
+ elif self.confmode == 'mean':
484
+ confs = .5 * (confs1 + confs2) if confs1 is not None and confs2 is not None else 1.
485
+ else:
486
+ raise ValueError(f"Unknown conf mode {self.confmode}")
487
+ return confs
488
+
489
+ def compute_loss(self, gt1, gt2, pred1, pred2, **kw):
490
+ # compute per-pixel loss
491
+ loss, details = self.pixel_loss(gt1, gt2, pred1, pred2, **kw)
492
+ # Recover confidences for positive and negative samples
493
+ conf1_pos, conf2_pos = details.pop('conf_pos')
494
+ conf1_neg, conf2_neg = details.pop('conf_neg')
495
+ conf_pos = self.aggregate_confs(conf1_pos, conf2_pos)
496
+
497
+ # weight Matching loss by confidence on positives
498
+ conf_pos, log_conf_pos = self.get_conf_log(conf_pos)
499
+ conf_loss = loss * conf_pos - self.alpha * log_conf_pos
500
+ # average + nan protection (in case of no valid pixels at all)
501
+ conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else 0
502
+ # Add negative confs loss to give some supervision signal to confidences for pixels that are not matched in GT
503
+ if self.neg_conf_loss_quantile:
504
+ conf_neg = torch.cat([conf1_neg, conf2_neg])
505
+ conf_neg, log_conf_neg = self.get_conf_log(conf_neg)
506
+
507
+ # recover quantile that will be used for negatives loss value assignment
508
+ neg_loss_value = torch.quantile(loss, self.neg_conf_loss_quantile).detach()
509
+ neg_loss = neg_loss_value * conf_neg - self.alpha * log_conf_neg
510
+
511
+ neg_loss = neg_loss.mean() if neg_loss.numel() > 0 else 0
512
+ conf_loss = conf_loss + neg_loss
513
+
514
+ return conf_loss, dict(matching_conf_loss=float(conf_loss), **details)
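
For context, these pieces compose into a full training criterion via the `MultiLoss` arithmetic inherited from DUSt3R; the weights and hyper-parameters below are illustrative rather than canonical (see the training recipes in the README for the released values):

    from dust3r.losses import L21

    criterion = ConfLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2) \
        + 0.075 * ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05,
                                                        reduction='none'),
                                                negatives_padding=0, blocksize=8192),
                                   alpha=10.0, confmode='mean')
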
mast3r/model.py ADDED
@@ -0,0 +1,68 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # MASt3R model class
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import os
10
+
11
+ from mast3r.catmlp_dpt_head import mast3r_head_factory
12
+
13
+ import mast3r.utils.path_to_dust3r # noqa
14
+ from dust3r.model import AsymmetricCroCo3DStereo # noqa
15
+ from dust3r.utils.misc import transpose_to_landscape # noqa
16
+
17
+
18
+ inf = float('inf')
19
+
20
+
21
+ def load_model(model_path, device, verbose=True):
22
+ if verbose:
23
+ print('... loading model from', model_path)
24
+ ckpt = torch.load(model_path, map_location='cpu')
25
+ args = ckpt['args'].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R")
26
+ if 'landscape_only' not in args:
27
+ args = args[:-1] + ', landscape_only=False)'
28
+ else:
29
+ args = args.replace(" ", "").replace('landscape_only=True', 'landscape_only=False')
30
+ assert "landscape_only=False" in args
31
+ if verbose:
32
+ print(f"instantiating : {args}")
33
+ net = eval(args)
34
+ s = net.load_state_dict(ckpt['model'], strict=False)
35
+ if verbose:
36
+ print(s)
37
+ return net.to(device)
38
+
39
+
40
+ class AsymmetricMASt3R(AsymmetricCroCo3DStereo):
41
+ def __init__(self, desc_mode=('norm'), two_confs=False, desc_conf_mode=None, **kwargs):
42
+ self.desc_mode = desc_mode
43
+ self.two_confs = two_confs
44
+ self.desc_conf_mode = desc_conf_mode
45
+ super().__init__(**kwargs)
46
+
47
+ @classmethod
48
+ def from_pretrained(cls, pretrained_model_name_or_path, **kw):
49
+ if os.path.isfile(pretrained_model_name_or_path):
50
+ return load_model(pretrained_model_name_or_path, device='cpu')
51
+ else:
52
+ return super(AsymmetricMASt3R, cls).from_pretrained(pretrained_model_name_or_path, **kw)
53
+
54
+ def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, **kw):
55
+ assert img_size[0] % patch_size == 0 and img_size[
56
+ 1] % patch_size == 0, f'{img_size=} must be multiple of {patch_size=}'
57
+ self.output_mode = output_mode
58
+ self.head_type = head_type
59
+ self.depth_mode = depth_mode
60
+ self.conf_mode = conf_mode
61
+ if self.desc_conf_mode is None:
62
+ self.desc_conf_mode = conf_mode
63
+ # allocate heads
64
+ self.downstream_head1 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
65
+ self.downstream_head2 = mast3r_head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
66
+ # magic wrapper
67
+ self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only)
68
+ self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only)
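
A minimal sketch of instantiating the model; the checkpoint name below is assumed to match the publicly released MASt3R weights on Hugging Face:

    import torch
    from mast3r.model import AsymmetricMASt3R

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = AsymmetricMASt3R.from_pretrained(
        "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").to(device)
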
mast3r/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
mast3r/utils/coarse_to_fine.py ADDED
@@ -0,0 +1,214 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # coarse to fine utilities
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+
9
+
10
+ def crop_tag(cell):
11
+ return f'[{cell[1]}:{cell[3]},{cell[0]}:{cell[2]}]'
12
+
13
+
14
+ def crop_slice(cell):
15
+ return slice(cell[1], cell[3]), slice(cell[0], cell[2])
16
+
17
+
18
+ def _start_pos(total_size, win_size, overlap):
19
+ # we must have AT LEAST overlap between segments
20
+ # first segment starts at 0, last segment starts at total_size-win_size
21
+ assert 0 <= overlap < 1
22
+ assert total_size >= win_size
23
+ spacing = win_size * (1 - overlap)
24
+ last_pt = total_size - win_size
25
+ n_windows = 2 + int((last_pt - 1) // spacing)
26
+ return np.linspace(0, last_pt, n_windows).round().astype(int)
27
+
28
+
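
For intuition, a quick worked example of `_start_pos`: with total_size=100, win_size=40 and overlap=0.5, the spacing is 20 pixels, the last start is 60, and four window starts are placed:

    >>> _start_pos(100, 40, overlap=0.5)
    array([ 0, 20, 40, 60])
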
29
+ def multiple_of_16(x):
30
+ return (x // 16) * 16
31
+
32
+
33
+ def _make_overlapping_grid(H, W, size, overlap):
34
+ H_win = multiple_of_16(H * size // max(H, W))
35
+ W_win = multiple_of_16(W * size // max(H, W))
36
+ x = _start_pos(W, W_win, overlap)
37
+ y = _start_pos(H, H_win, overlap)
38
+ grid = np.stack(np.meshgrid(x, y, indexing='xy'), axis=-1)
39
+ grid = np.concatenate((grid, grid + (W_win, H_win)), axis=-1)
40
+ return grid.reshape(-1, 4)
41
+
42
+
43
+ def _cell_size(cell2):
44
+ width, height = cell2[:, 2] - cell2[:, 0], cell2[:, 3] - cell2[:, 1]
45
+ assert width.min() >= 0
46
+ assert height.min() >= 0
47
+ return width, height
48
+
49
+
50
+ def _norm_windows(cell2, H2, W2, forced_resolution=None):
51
+ # make sure the window aspect ratio is 3/4, or the output resolution is forced_resolution if defined
52
+ outcell = cell2.copy()
53
+ width, height = _cell_size(cell2)
54
+ width2, height2 = width.clip(max=W2), height.clip(max=H2)
55
+ if forced_resolution is None:
56
+ width2[width < height] = (height2[width < height] * 3.01 / 4).clip(max=W2)
57
+ height2[width >= height] = (width2[width >= height] * 3.01 / 4).clip(max=H2)
58
+ else:
59
+ forced_H, forced_W = forced_resolution
60
+ width2[:] = forced_W
61
+ height2[:] = forced_H
62
+
63
+ half = (width2 - width) / 2
64
+ outcell[:, 0] -= half
65
+ outcell[:, 2] += half
66
+ half = (height2 - height) / 2
67
+ outcell[:, 1] -= half
68
+ outcell[:, 3] += half
69
+
70
+ # proj to integers
71
+ outcell = np.floor(outcell).astype(int)
72
+ # Take care of flooring errors
73
+ tmpw, tmph = _cell_size(outcell)
74
+ outcell[:, 0] += tmpw.astype(tmpw.dtype) - width2.astype(tmpw.dtype)
75
+ outcell[:, 1] += tmph.astype(tmpw.dtype) - height2.astype(tmpw.dtype)
76
+
77
+ # make sure 0 <= x < W2 and 0 <= y < H2
78
+ outcell[:, 0::2] -= outcell[:, [0]].clip(max=0)
79
+ outcell[:, 1::2] -= outcell[:, [1]].clip(max=0)
80
+ outcell[:, 0::2] -= outcell[:, [2]].clip(min=W2) - W2
81
+ outcell[:, 1::2] -= outcell[:, [3]].clip(min=H2) - H2
82
+
83
+ width, height = _cell_size(outcell)
84
+ assert np.all(width == width2.astype(width.dtype)) and np.all(
85
+ height == height2.astype(height.dtype)), "Error, output is not of the expected shape."
86
+ assert np.all(width <= W2)
87
+ assert np.all(height <= H2)
88
+ return outcell
89
+
90
+
91
+ def _weight_pixels(cell, pix, assigned, gauss_var=2):
92
+ center = cell.reshape(-1, 2, 2).mean(axis=1)
93
+ width, height = _cell_size(cell)
94
+
95
+ # square distance between each cell center and each point
96
+ dist = (center[:, None] - pix[None]) / np.c_[width, height][:, None]
97
+ dist2 = np.square(dist).sum(axis=-1)
98
+
99
+ assert assigned.shape == dist2.shape
100
+ res = np.where(assigned, np.exp(-gauss_var * dist2), 0)
101
+ return res
102
+
103
+
104
+ def pos2d_in_rect(p1, cell1):
105
+ x, y = p1.T
106
+ l, t, r, b = cell1
107
+ assigned = (l <= x) & (x < r) & (t <= y) & (y < b)
108
+ return assigned
109
+
110
+
111
+ def _score_cell(cell1, H2, W2, p1, p2, min_corres=10, forced_resolution=None):
112
+ assert p1.shape == p2.shape
113
+
114
+ # compute keypoint assignment
115
+ assigned = pos2d_in_rect(p1, cell1[None].T)
116
+ assert assigned.shape == (len(cell1), len(p1))
117
+
118
+ # remove cells without correspondences
119
+ valid_cells = assigned.sum(axis=1) >= min_corres
120
+ cell1 = cell1[valid_cells]
121
+ assigned = assigned[valid_cells]
122
+ if not valid_cells.any():
123
+ return cell1, cell1, assigned
124
+
125
+ # fill-in the assigned points in both image
126
+ assigned_p1 = np.empty((len(cell1), len(p1), 2), dtype=np.float32)
127
+ assigned_p2 = np.empty((len(cell1), len(p2), 2), dtype=np.float32)
128
+ assigned_p1[:] = p1[None]
129
+ assigned_p2[:] = p2[None]
130
+ assigned_p1[~assigned] = np.nan
131
+ assigned_p2[~assigned] = np.nan
132
+
133
+ # find the median center and scale of assigned points in each cell
134
+ # cell_center1 = np.nanmean(assigned_p1, axis=1)
135
+ cell_center2 = np.nanmean(assigned_p2, axis=1)
136
+ im1_q25, im1_q75 = np.nanquantile(assigned_p1, (0.1, 0.9), axis=1)
137
+ im2_q25, im2_q75 = np.nanquantile(assigned_p2, (0.1, 0.9), axis=1)
138
+
139
+ robust_std1 = (im1_q75 - im1_q25).clip(20.)
140
+ robust_std2 = (im2_q75 - im2_q25).clip(20.)
141
+
142
+ cell_size1 = (cell1[:, 2:4] - cell1[:, 0:2])
143
+ cell_size2 = cell_size1 * robust_std2 / robust_std1
144
+ cell2 = np.c_[cell_center2 - cell_size2 / 2, cell_center2 + cell_size2 / 2]
145
+
146
+ # make sure cell bounds are valid
147
+ cell2 = _norm_windows(cell2, H2, W2, forced_resolution=forced_resolution)
148
+
149
+ # compute correspondence weights
150
+ corres_weights = _weight_pixels(cell1, p1, assigned) * _weight_pixels(cell2, p2, assigned)
151
+
152
+ # return a list of window pairs and assigned correspondences
153
+ return cell1, cell2, corres_weights
154
+
155
+
156
+ def greedy_selection(corres_weights, target=0.9):
157
+ # corres_weight = (n_cell_pair, n_corres) matrix.
158
+ # If corres_weight[c,p]>0, means that correspondence p is visible in cell pair p
159
+ assert 0 < target <= 1
160
+ corres_weights = corres_weights.copy()
161
+
162
+ total = corres_weights.max(axis=0).sum()
163
+ target *= total
164
+
165
+ # init = empty
166
+ res = []
167
+ cur = np.zeros(corres_weights.shape[1]) # current selection
168
+
169
+ while cur.sum() < target:
170
+ # pick the nex best cell pair
171
+ best = corres_weights.sum(axis=1).argmax()
172
+ res.append(best)
173
+
174
+ # update current
175
+ cur += corres_weights[best]
176
+ # print('appending', best, 'with score', corres_weights[best].sum(), '-->', cur.sum())
177
+
178
+ # remove from all other views
179
+ corres_weights = (corres_weights - corres_weights[best]).clip(min=0)
180
+
181
+ return res
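
A worked toy example of the greedy covering above (a minimal, self-contained sketch with a synthetic weight matrix):

    import numpy as np
    from mast3r.utils.coarse_to_fine import greedy_selection

    # 3 cell pairs x 4 correspondences: w[c, p] > 0 means correspondence p is visible in cell pair c
    w = np.array([[1., 1., 0., 0.],
                  [0., 1., 1., 0.],
                  [0., 0., 1., 1.]])
    print(greedy_selection(w, target=1.0))  # [0, 2]: two cell pairs cover all four correspondences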
+ 
+ 
+ def select_pairs_of_crops(img_q, img_b, pos2d_in_query, pos2d_in_ref, maxdim=512, overlap=.5, forced_resolution=None):
+     # prepare the overlapping cells
+     grid_q = _make_overlapping_grid(*img_q.shape[:2], maxdim, overlap)
+     grid_b = _make_overlapping_grid(*img_b.shape[:2], maxdim, overlap)
+ 
+     assert forced_resolution is None or len(forced_resolution) == 2
+     if forced_resolution is None:  # guard: indexing forced_resolution[0] would raise on None
+         forced_resolution1 = forced_resolution2 = None
+     elif isinstance(forced_resolution[0], int) or not len(forced_resolution[0]) == 2:
+         forced_resolution1 = forced_resolution2 = forced_resolution
+     else:
+         assert len(forced_resolution[1]) == 2
+         forced_resolution1 = forced_resolution[0]
+         forced_resolution2 = forced_resolution[1]
+ 
+     # make sure crops respect constraints
+     grid_q = _norm_windows(grid_q.astype(float), *img_q.shape[:2], forced_resolution=forced_resolution1)
+     grid_b = _norm_windows(grid_b.astype(float), *img_b.shape[:2], forced_resolution=forced_resolution2)
+ 
+     # score cells
+     pairs_q = _score_cell(grid_q, *img_b.shape[:2], pos2d_in_query, pos2d_in_ref, forced_resolution=forced_resolution2)
+     pairs_b = _score_cell(grid_b, *img_q.shape[:2], pos2d_in_ref, pos2d_in_query, forced_resolution=forced_resolution1)
+     pairs_b = pairs_b[1], pairs_b[0], pairs_b[2]  # cellq, cellb, corres_weights
+ 
+     # greedy selection until enough correspondences are covered
+     cell1, cell2, corres_weights = map(np.concatenate, zip(pairs_q, pairs_b))
+     if len(corres_weights) == 0:
+         return  # tolerated for empty generators
+     order = greedy_selection(corres_weights, target=0.9)
+ 
+     for i in order:
+         def pair_tag(qi, bi, i=i):  # bind i at definition time, in case the closure outlives the loop step
+             return (str(qi) + crop_tag(cell1[i]), str(bi) + crop_tag(cell2[i]))
+         yield cell1[i], cell2[i], pair_tag
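
End to end, the generator can be exercised on dummy inputs (a minimal sketch with synthetic, random correspondences; with the default forced_resolution=None, crops follow the 3/4 aspect-ratio rule of _norm_windows):

    import numpy as np
    from mast3r.utils.coarse_to_fine import select_pairs_of_crops, crop_slice

    img_q = np.zeros((768, 1024, 3), dtype=np.float32)  # (H, W, 3) query image
    img_b = np.zeros((900, 1200, 3), dtype=np.float32)  # (H, W, 3) reference image
    pts_q = np.random.rand(200, 2) * (1024, 768)        # synthetic (x, y) points in img_q
    pts_b = np.random.rand(200, 2) * (1200, 900)        # their synthetic counterparts in img_b

    for cell_q, cell_b, pair_tag in select_pairs_of_crops(img_q, img_b, pts_q, pts_b, maxdim=512, overlap=.5):
        # each cell is (x0, y0, x1, y1); crop_slice turns it into numpy slices
        print(pair_tag('q', 'b'), img_q[crop_slice(cell_q)].shape, img_b[crop_slice(cell_b)].shape)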
mast3r/utils/collate.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # Collate extensions
+ # --------------------------------------------------------
+ 
+ import torch
+ import collections
+ from torch.utils.data._utils.collate import default_collate_fn_map, default_collate_err_msg_format
+ from typing import Callable, Dict, Optional, Tuple, Type, Union, List
+ 
+ 
+ def cat_collate_tensor_fn(batch, *, collate_fn_map):
+     return torch.cat(batch, dim=0)
+ 
+ 
+ def cat_collate_list_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None):
+     return [item for bb in batch for item in bb]  # concatenate all lists
+ 
+ 
+ cat_collate_fn_map = default_collate_fn_map.copy()
+ cat_collate_fn_map[torch.Tensor] = cat_collate_tensor_fn
+ cat_collate_fn_map[List] = cat_collate_list_fn
+ cat_collate_fn_map[type(None)] = lambda _, **kw: None  # when elements are None, simply return a single None
+ 
+ 
+ def cat_collate(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None):
+     r"""Custom collate function that concatenates items instead of stacking them, and handles None values."""
+     elem = batch[0]
+     elem_type = type(elem)
+ 
+     if collate_fn_map is not None:
+         if elem_type in collate_fn_map:
+             return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
+ 
+         for collate_type in collate_fn_map:
+             if isinstance(elem, collate_type):
+                 return collate_fn_map[collate_type](batch, collate_fn_map=collate_fn_map)
+ 
+     if isinstance(elem, collections.abc.Mapping):
+         try:
+             return elem_type({key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
+         except TypeError:
+             # The mapping type may not support `__init__(iterable)`.
+             return {key: cat_collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
+     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+         return elem_type(*(cat_collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
+     elif isinstance(elem, collections.abc.Sequence):
+         transposed = list(zip(*batch))  # It may be accessed twice, so we use a list.
+ 
+         if isinstance(elem, tuple):
+             # Backwards compatibility.
+             return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
+         else:
+             try:
+                 return elem_type([cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed])
+             except TypeError:
+                 # The sequence type may not support `__init__(iterable)` (e.g., `range`).
+                 return [cat_collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
+ 
+     raise TypeError(default_collate_err_msg_format.format(elem_type))
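
For illustration, a minimal sketch (not part of the commit) of what cat_collate does differently from torch's default collate: tensors are concatenated along their existing first dimension instead of being stacked along a new one, lists are flattened, and None fields stay None. This is what crops_inference in visloc.py relies on to merge per-chunk predictions:

    import torch
    from mast3r.utils.collate import cat_collate, cat_collate_fn_map

    chunks = [dict(desc=torch.zeros(4, 64), tags=['a', 'b'], extra=None),
              dict(desc=torch.ones(2, 64), tags=['c'], extra=None)]
    merged = cat_collate(chunks, collate_fn_map=cat_collate_fn_map)
    print(merged['desc'].shape)  # torch.Size([6, 64]): concatenated, not stacked
    print(merged['tags'])        # ['a', 'b', 'c']
    print(merged['extra'])       # None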
mast3r/utils/misc.py ADDED
@@ -0,0 +1,17 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # utility functions for MASt3R
+ # --------------------------------------------------------
+ import os
+ import hashlib
+ 
+ 
+ def mkdir_for(f):
+     # create the parent directory of file f if needed, and return f unchanged
+     os.makedirs(os.path.dirname(f), exist_ok=True)
+     return f
+ 
+ 
+ def hash_md5(s):
+     return hashlib.md5(s.encode('utf-8')).hexdigest()
mast3r/utils/path_to_dust3r.py ADDED
@@ -0,0 +1,19 @@
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # dust3r submodule import
+ # --------------------------------------------------------
+ 
+ import sys
+ import os.path as path
+ HERE_PATH = path.normpath(path.dirname(__file__))
+ DUSt3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../dust3r'))
+ DUSt3R_LIB_PATH = path.join(DUSt3R_REPO_PATH, 'dust3r')
+ # check for the presence of the dust3r package directory, to be sure the submodule has been cloned
+ if path.isdir(DUSt3R_LIB_PATH):
+     # workaround for sibling import
+     sys.path.insert(0, DUSt3R_REPO_PATH)
+ else:
+     raise ImportError(f"dust3r is not initialized, could not find: {DUSt3R_LIB_PATH}.\n "
+                       "Did you forget to run 'git submodule update --init --recursive' ?")
requirements.txt ADDED
@@ -0,0 +1 @@
+ scikit-learn
train.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env python3
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # training executable for MASt3R
+ # --------------------------------------------------------
+ from mast3r.model import AsymmetricMASt3R
+ from mast3r.losses import ConfMatchingLoss, MatchingLoss, APLoss, Regr3D, InfoNCE, Regr3D_ScaleShiftInv
+ from mast3r.datasets import ARKitScenes, BlendedMVS, Co3d, MegaDepth, ScanNetpp, StaticThings3D, Waymo, WildRGBD
+ 
+ import mast3r.utils.path_to_dust3r  # noqa
+ # add mast3r classes to dust3r imports
+ import dust3r.training
+ dust3r.training.AsymmetricMASt3R = AsymmetricMASt3R
+ dust3r.training.Regr3D = Regr3D
+ dust3r.training.Regr3D_ScaleShiftInv = Regr3D_ScaleShiftInv
+ dust3r.training.MatchingLoss = MatchingLoss
+ dust3r.training.ConfMatchingLoss = ConfMatchingLoss
+ dust3r.training.InfoNCE = InfoNCE
+ dust3r.training.APLoss = APLoss
+ 
+ import dust3r.datasets
+ dust3r.datasets.ARKitScenes = ARKitScenes
+ dust3r.datasets.BlendedMVS = BlendedMVS
+ dust3r.datasets.Co3d = Co3d
+ dust3r.datasets.MegaDepth = MegaDepth
+ dust3r.datasets.ScanNetpp = ScanNetpp
+ dust3r.datasets.StaticThings3D = StaticThings3D
+ dust3r.datasets.Waymo = Waymo
+ dust3r.datasets.WildRGBD = WildRGBD
+ 
+ from dust3r.training import get_args_parser as dust3r_get_args_parser  # noqa
+ from dust3r.training import train  # noqa
+ 
+ 
+ def get_args_parser():
+     parser = dust3r_get_args_parser()
+     # change defaults
+     parser.prog = 'MASt3R training'
+     parser.set_defaults(model="AsymmetricMASt3R(patch_embed_cls='ManyAR_PatchEmbed')")
+     return parser
+ 
+ 
+ if __name__ == '__main__':
+     args = get_args_parser()
+     args = args.parse_args()
+     train(args)
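
The assignments above work because dust3r instantiates models, losses and datasets by evaluating config strings inside its own modules. A minimal sketch of the mechanism (assuming, as in the dust3r training code, that the --model string is eval()-uated in the dust3r.training namespace):

    import mast3r.utils.path_to_dust3r  # noqa
    import dust3r.training
    from mast3r.model import AsymmetricMASt3R

    # same injection as in train.py above: the name becomes resolvable when the
    # trainer eval()-uates the --model config string in this namespace
    dust3r.training.AsymmetricMASt3R = AsymmetricMASt3R
    spec = "AsymmetricMASt3R(patch_embed_cls='ManyAR_PatchEmbed')"
    model = eval(spec, vars(dust3r.training))  # resolves via the injected attribute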
visloc.py ADDED
@@ -0,0 +1,538 @@
+ #!/usr/bin/env python3
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ #
+ # --------------------------------------------------------
+ # visloc script with support for coarse to fine
+ # --------------------------------------------------------
+ import os
+ import numpy as np
+ import random
+ import torch
+ import torchvision.transforms as tvf
+ import argparse
+ from tqdm import tqdm
+ from PIL import Image
+ import math
+ 
+ from mast3r.model import AsymmetricMASt3R
+ from mast3r.fast_nn import fast_reciprocal_NNs
+ from mast3r.utils.coarse_to_fine import select_pairs_of_crops, crop_slice
+ from mast3r.utils.collate import cat_collate, cat_collate_fn_map
+ from mast3r.utils.misc import mkdir_for
+ from mast3r.datasets.utils.cropping import crop_to_homography
+ 
+ import mast3r.utils.path_to_dust3r  # noqa
+ from dust3r.inference import inference, loss_of_one_batch
+ from dust3r.utils.geometry import geotrf, colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics
+ from dust3r.datasets.utils.transforms import ImgNorm
+ from dust3r_visloc.datasets import *
+ from dust3r_visloc.localization import run_pnp
+ from dust3r_visloc.evaluation import get_pose_error, aggregate_stats, export_results
+ from dust3r_visloc.datasets.utils import get_HW_resolution, rescale_points3d
+ 
+ 
+ def get_args_parser():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--dataset", type=str, required=True, help="visloc dataset to eval")
+     parser_weights = parser.add_mutually_exclusive_group(required=True)
+     parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None)
+     parser_weights.add_argument("--model_name", type=str, help="name of the model weights",
+                                 choices=["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"])
+ 
+     parser.add_argument("--confidence_threshold", type=float, default=1.001,
+                         help="matches with confidence values below this threshold are discarded")
+     parser.add_argument('--pixel_tol', default=5, type=int)
+ 
+     parser.add_argument("--coarse_to_fine", action='store_true', default=False,
+                         help="do the matching from coarse to fine")
+     parser.add_argument("--max_image_size", type=int, default=None,
+                         help="max image size for the fine resolution")
+     parser.add_argument("--c2f_crop_with_homography", action='store_true', default=False,
+                         help="when using coarse to fine, crop with homographies to keep cx, cy centered")
+ 
+     parser.add_argument("--device", type=str, default='cuda', help="pytorch device")
+     parser.add_argument("--pnp_mode", type=str, default="cv2", choices=['cv2', 'poselib', 'pycolmap'],
+                         help="pnp lib to use")
+     parser_reproj = parser.add_mutually_exclusive_group()
+     parser_reproj.add_argument("--reprojection_error", type=float, default=5.0, help="pnp reprojection error")
+     parser_reproj.add_argument("--reprojection_error_diag_ratio", type=float, default=None,
+                                help="pnp reprojection error as a ratio of the diagonal of the image")
+ 
+     parser.add_argument("--max_batch_size", type=int, default=48,
+                         help="max batch size for inference on crops when using coarse to fine")
+     parser.add_argument("--pnp_max_points", type=int, default=100_000, help="pnp maximum number of points kept")
+     parser.add_argument("--viz_matches", type=int, default=0, help="debug matches")
+ 
+     parser.add_argument("--output_dir", type=str, default=None, help="output path")
+     parser.add_argument("--output_label", type=str, default='', help="prefix for results files")
+     return parser
+ 
+ 
+ @torch.no_grad()
+ def coarse_matching(query_view, map_view, model, device, pixel_tol, fast_nn_params):
+     # prepare batch
+     imgs = []
+     for idx, img in enumerate([query_view['rgb_rescaled'], map_view['rgb_rescaled']]):
+         imgs.append(dict(img=img.unsqueeze(0), true_shape=np.int32([img.shape[1:]]),
+                          idx=idx, instance=str(idx)))
+     output = inference([tuple(imgs)], model, device, batch_size=1, verbose=False)
+     pred1, pred2 = output['pred1'], output['pred2']
+     conf_list = [pred1['desc_conf'].squeeze(0).cpu().numpy(), pred2['desc_conf'].squeeze(0).cpu().numpy()]
+     desc_list = [pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()]
+ 
+     # find 2D-2D matches between the two images
+     PQ, PM = desc_list[0], desc_list[1]
+     if len(PQ) == 0 or len(PM) == 0:
+         return [], [], [], []
+ 
+     if pixel_tol == 0:
+         matches_im_map, matches_im_query = fast_reciprocal_NNs(PM, PQ, subsample_or_initxy1=8, **fast_nn_params)
+         HM, WM = map_view['rgb_rescaled'].shape[1:]
+         HQ, WQ = query_view['rgb_rescaled'].shape[1:]
+         # ignore a small border around the edges
+         valid_matches_map = (matches_im_map[:, 0] >= 3) & (matches_im_map[:, 0] < WM - 3) & (
+             matches_im_map[:, 1] >= 3) & (matches_im_map[:, 1] < HM - 3)
+         valid_matches_query = (matches_im_query[:, 0] >= 3) & (matches_im_query[:, 0] < WQ - 3) & (
+             matches_im_query[:, 1] >= 3) & (matches_im_query[:, 1] < HQ - 3)
+         valid_matches = valid_matches_map & valid_matches_query
+         matches_im_map = matches_im_map[valid_matches]
+         matches_im_query = matches_im_query[valid_matches]
+         valid_pts3d = []
+         matches_confs = []
+     else:
+         yM, xM = torch.where(map_view['valid_rescaled'])
+         matches_im_map, matches_im_query = fast_reciprocal_NNs(PM, PQ, (xM, yM), pixel_tol=pixel_tol, **fast_nn_params)
+         valid_pts3d = map_view['pts3d_rescaled'].cpu().numpy()[matches_im_map[:, 1], matches_im_map[:, 0]]
+         matches_confs = np.minimum(
+             conf_list[1][matches_im_map[:, 1], matches_im_map[:, 0]],
+             conf_list[0][matches_im_query[:, 1], matches_im_query[:, 0]]
+         )
+     # from cv2 to colmap pixel convention
+     matches_im_query = matches_im_query.astype(np.float64)
+     matches_im_map = matches_im_map.astype(np.float64)
+     matches_im_query[:, 0] += 0.5
+     matches_im_query[:, 1] += 0.5
+     matches_im_map[:, 0] += 0.5
+     matches_im_map[:, 1] += 0.5
+     # rescale coordinates
+     matches_im_query = geotrf(query_view['to_orig'], matches_im_query, norm=True)
+     matches_im_map = geotrf(map_view['to_orig'], matches_im_map, norm=True)
+     # from colmap back to cv2
+     matches_im_query[:, 0] -= 0.5
+     matches_im_query[:, 1] -= 0.5
+     matches_im_map[:, 0] -= 0.5
+     matches_im_map[:, 1] -= 0.5
+     return valid_pts3d, matches_im_query, matches_im_map, matches_confs
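
The ±0.5 offsets in coarse_matching encode a pixel-center convention change: OpenCV-style coordinates put the center of the top-left pixel at (0, 0) while COLMAP-style coordinates put it at (0.5, 0.5), so rescaling has to happen in the latter frame for pixel centers to map correctly. A minimal sketch of why (assuming these conventions):

    import numpy as np

    def cv2_to_colmap(pts):   # pixel centers: (0, 0) -> (0.5, 0.5)
        return pts + 0.5

    def colmap_to_cv2(pts):
        return pts - 0.5

    S = np.diag([2.0, 2.0])      # hypothetical 2x upsampling
    p = np.array([[0.0, 0.0]])   # top-left pixel center, cv2 convention
    print(colmap_to_cv2(cv2_to_colmap(p) @ S.T))  # [[0.5 0.5]]: the correct center in the 2x image
    # scaling directly in the cv2 convention would wrongly give [[0. 0.]]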
+ 
+ 
+ @torch.no_grad()
+ def crops_inference(pairs, model, device, batch_size=48, verbose=True):
+     assert len(pairs) == 2, "Error, data should be a tuple of dicts containing the batch of image pairs"
+     # Forward a possibly big bunch of data, by blocks of batch_size
+     B = pairs[0]['img'].shape[0]
+     if B < batch_size:
+         return loss_of_one_batch(pairs, model, None, device=device, symmetrize_batch=False)
+     preds = []
+     for ii in range(0, B, batch_size):
+         sel = slice(ii, ii + min(B - ii, batch_size))
+         temp_data = [{}, {}]
+         for di in [0, 1]:
+             temp_data[di] = {kk: pairs[di][kk][sel]
+                              for kk in pairs[di].keys() if pairs[di][kk] is not None}  # copy chunk for forward
+         preds.append(loss_of_one_batch(temp_data, model,
+                                        None, device=device, symmetrize_batch=False))  # sequential forward
+     # Merge all preds
+     return cat_collate(preds, collate_fn_map=cat_collate_fn_map)
+ 
+ 
+ @torch.no_grad()
+ def fine_matching(query_views, map_views, model, device, max_batch_size, pixel_tol, fast_nn_params):
+     assert pixel_tol > 0
+     output = crops_inference([query_views, map_views],
+                              model, device, batch_size=max_batch_size, verbose=False)
+     pred1, pred2 = output['pred1'], output['pred2']
+     descs1 = pred1['desc'].clone()
+     descs2 = pred2['desc'].clone()
+     confs1 = pred1['desc_conf'].clone()
+     confs2 = pred2['desc_conf'].clone()
+ 
+     # Compute matches
+     valid_pts3d, matches_im_map, matches_im_query, matches_confs = [], [], [], []
+     for ppi, (pp1, pp2, cc11, cc21) in enumerate(zip(descs1, descs2, confs1, confs2)):
+         valid_ppi = map_views['valid'][ppi]
+         pts3d_ppi = map_views['pts3d'][ppi].cpu().numpy()
+         conf_list_ppi = [cc11.cpu().numpy(), cc21.cpu().numpy()]
+ 
+         y_ppi, x_ppi = torch.where(valid_ppi)
+         matches_im_map_ppi, matches_im_query_ppi = fast_reciprocal_NNs(pp2, pp1, (x_ppi, y_ppi),
+                                                                        pixel_tol=pixel_tol, **fast_nn_params)
+ 
+         valid_pts3d_ppi = pts3d_ppi[matches_im_map_ppi[:, 1], matches_im_map_ppi[:, 0]]
+         matches_confs_ppi = np.minimum(
+             conf_list_ppi[1][matches_im_map_ppi[:, 1], matches_im_map_ppi[:, 0]],
+             conf_list_ppi[0][matches_im_query_ppi[:, 1], matches_im_query_ppi[:, 0]]
+         )
+         # inverse operation where we uncrop pixel coordinates
+         matches_im_map_ppi = geotrf(map_views['to_orig'][ppi].cpu().numpy(), matches_im_map_ppi.copy(), norm=True)
+         matches_im_query_ppi = geotrf(query_views['to_orig'][ppi].cpu().numpy(), matches_im_query_ppi.copy(), norm=True)
+ 
+         matches_im_map.append(matches_im_map_ppi)
+         matches_im_query.append(matches_im_query_ppi)
+         valid_pts3d.append(valid_pts3d_ppi)
+         matches_confs.append(matches_confs_ppi)
+ 
+     if len(valid_pts3d) == 0:
+         return [], [], [], []
+ 
+     matches_im_map = np.concatenate(matches_im_map, axis=0)
+     matches_im_query = np.concatenate(matches_im_query, axis=0)
+     valid_pts3d = np.concatenate(valid_pts3d, axis=0)
+     matches_confs = np.concatenate(matches_confs, axis=0)
+     return valid_pts3d, matches_im_query, matches_im_map, matches_confs
+ 
+ 
+ def crop(img, mask, pts3d, crop, intrinsics=None):
+     out_cropped_img = img.clone()
+     if mask is not None:
+         out_cropped_mask = mask.clone()
+     else:
+         out_cropped_mask = None
+     if pts3d is not None:
+         out_cropped_pts3d = pts3d.clone()
+     else:
+         out_cropped_pts3d = None
+     to_orig = torch.eye(3, device=img.device)
+ 
+     # If intrinsics are available, crop and apply a rectifying homography. Otherwise, just crop
+     if intrinsics is not None:
+         K_old = intrinsics
+         imsize, K_new, R, H = crop_to_homography(K_old, crop)
+         # apply homography to image
+         H /= H[2, 2]
+         homo8 = H.ravel().tolist()[:8]
+         # From float tensor to uint8 PIL Image
+         pilim = Image.fromarray((255 * (img + 1.) / 2).to(torch.uint8).numpy())
+         pilout_cropped_img = pilim.transform(imsize, Image.Transform.PERSPECTIVE,
+                                              homo8, resample=Image.Resampling.BICUBIC)
+ 
+         # From uint8 PIL Image to float tensor
+         out_cropped_img = 2. * torch.tensor(np.array(pilout_cropped_img)).to(img) / 255. - 1.
+         if out_cropped_mask is not None:
+             pilmask = Image.fromarray((255 * out_cropped_mask).to(torch.uint8).numpy())
+             pilout_cropped_mask = pilmask.transform(
+                 imsize, Image.Transform.PERSPECTIVE, homo8, resample=Image.Resampling.NEAREST)
+             out_cropped_mask = torch.from_numpy(np.array(pilout_cropped_mask) > 0).to(out_cropped_mask.dtype)
+         if out_cropped_pts3d is not None:
+             out_cropped_pts3d = out_cropped_pts3d.numpy()
+             out_cropped_X = np.array(Image.fromarray(out_cropped_pts3d[:, :, 0]).transform(imsize,
+                                                                                            Image.Transform.PERSPECTIVE,
+                                                                                            homo8,
+                                                                                            resample=Image.Resampling.NEAREST))
+             out_cropped_Y = np.array(Image.fromarray(out_cropped_pts3d[:, :, 1]).transform(imsize,
+                                                                                            Image.Transform.PERSPECTIVE,
+                                                                                            homo8,
+                                                                                            resample=Image.Resampling.NEAREST))
+             out_cropped_Z = np.array(Image.fromarray(out_cropped_pts3d[:, :, 2]).transform(imsize,
+                                                                                            Image.Transform.PERSPECTIVE,
+                                                                                            homo8,
+                                                                                            resample=Image.Resampling.NEAREST))
+ 
+             out_cropped_pts3d = torch.from_numpy(np.stack([out_cropped_X, out_cropped_Y, out_cropped_Z], axis=-1))
+ 
+         to_orig = torch.tensor(H, device=img.device)
+     else:
+         out_cropped_img = img[crop_slice(crop)]
+         if out_cropped_mask is not None:
+             out_cropped_mask = out_cropped_mask[crop_slice(crop)]
+         if out_cropped_pts3d is not None:
+             out_cropped_pts3d = out_cropped_pts3d[crop_slice(crop)]
+         to_orig[:2, -1] = torch.tensor(crop[:2])
+ 
+     return out_cropped_img, out_cropped_mask, out_cropped_pts3d, to_orig
+ 
+ 
+ def resize_image_to_max(max_image_size, rgb, K):
+     W, H = rgb.size
+     if max_image_size and max(W, H) > max_image_size:
+         islandscape = (W >= H)
+         if islandscape:
+             WMax = max_image_size
+             HMax = int(H * (WMax / W))
+         else:
+             HMax = max_image_size
+             WMax = int(W * (HMax / H))
+         resize_op = tvf.Compose([ImgNorm, tvf.Resize(size=[HMax, WMax])])
+         rgb_tensor = resize_op(rgb).permute(1, 2, 0)
+         to_orig_max = np.array([[W / WMax, 0, 0],
+                                 [0, H / HMax, 0],
+                                 [0, 0, 1]])
+         to_resize_max = np.array([[WMax / W, 0, 0],
+                                   [0, HMax / H, 0],
+                                   [0, 0, 1]])
+ 
+         # Generate new camera parameters
+         new_K = opencv_to_colmap_intrinsics(K)
+         new_K[0, :] *= WMax / W
+         new_K[1, :] *= HMax / H
+         new_K = colmap_to_opencv_intrinsics(new_K)
+     else:
+         rgb_tensor = ImgNorm(rgb).permute(1, 2, 0)
+         to_orig_max = np.eye(3)
+         to_resize_max = np.eye(3)
+         HMax, WMax = H, W
+         new_K = K
+     return rgb_tensor, new_K, to_orig_max, to_resize_max, (HMax, WMax)
+ 
+ 
+ if __name__ == '__main__':
+     parser = get_args_parser()
+     args = parser.parse_args()
+     conf_thr = args.confidence_threshold
+     device = args.device
+     pnp_mode = args.pnp_mode
+     assert args.pixel_tol > 0
+     reprojection_error = args.reprojection_error
+     reprojection_error_diag_ratio = args.reprojection_error_diag_ratio
+     pnp_max_points = args.pnp_max_points
+     viz_matches = args.viz_matches
+ 
+     if args.weights is not None:
+         weights_path = args.weights
+     else:
+         weights_path = "naver/" + args.model_name
+     model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
+     fast_nn_params = dict(device=device, dist='dot', block_size=2**13)
+     dataset = eval(args.dataset)
+     dataset.set_resolution(model)
+ 
+     query_names = []
+     poses_pred = []
+     pose_errors = []
+     angular_errors = []
+     params_str = f'tol_{args.pixel_tol}' + ("_c2f" if args.coarse_to_fine else '')
+     if args.max_image_size is not None:
+         params_str = params_str + f'_{args.max_image_size}'
+     if args.coarse_to_fine and args.c2f_crop_with_homography:
+         params_str = params_str + '_with_homography'
+     for idx in tqdm(range(len(dataset))):
+         views = dataset[idx]  # 0 is the query
+         query_view = views[0]
+         map_views = views[1:]
+         query_names.append(query_view['image_name'])
+ 
+         query_pts2d = []
+         query_pts3d = []
+         maxdim = max(model.patch_embed.img_size)
+         query_rgb_tensor, query_K, query_to_orig_max, query_to_resize_max, (HQ, WQ) = resize_image_to_max(
+             args.max_image_size, query_view['rgb'], query_view['intrinsics'])
+ 
+         # pairs of crops have the same resolution
+         query_resolution = get_HW_resolution(HQ, WQ, maxdim=maxdim, patchsize=model.patch_embed.patch_size)
+         for map_view in map_views:
+             if args.output_dir is not None:
+                 cache_file = os.path.join(args.output_dir, 'matches', params_str,
+                                           query_view['image_name'], map_view['image_name'] + '.npz')
+             else:
+                 cache_file = None
+ 
+             if cache_file is not None and os.path.isfile(cache_file):
+                 matches = np.load(cache_file)
+                 valid_pts3d = matches['valid_pts3d']
+                 matches_im_query = matches['matches_im_query']
+                 matches_im_map = matches['matches_im_map']
+                 matches_conf = matches['matches_conf']
+             else:
+                 # coarse matching
+                 if args.coarse_to_fine and (maxdim < max(WQ, HQ)):
+                     # use all points
+                     _, coarse_matches_im0, coarse_matches_im1, _ = coarse_matching(query_view, map_view, model, device,
+                                                                                    0, fast_nn_params)
+ 
+                     # visualize a few matches
+                     if viz_matches > 0:
+                         num_matches = coarse_matches_im1.shape[0]
+                         print(f'found {num_matches} matches')
+ 
+                         viz_imgs = [np.array(query_view['rgb']), np.array(map_view['rgb'])]
+                         from matplotlib import pyplot as pl
+                         n_viz = viz_matches
+                         match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int)
+                         viz_matches_im_query = coarse_matches_im0[match_idx_to_viz]
+                         viz_matches_im_map = coarse_matches_im1[match_idx_to_viz]
+ 
+                         H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2]
+                         img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)),
+                                       'constant', constant_values=0)
+                         img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)),
+                                       'constant', constant_values=0)
+                         img = np.concatenate((img0, img1), axis=1)
+                         pl.figure()
+                         pl.imshow(img)
+                         cmap = pl.get_cmap('jet')
+                         for i in range(n_viz):
+                             (x0, y0), (x1, y1) = viz_matches_im_query[i].T, viz_matches_im_map[i].T
+                             pl.plot([x0, x1 + W0], [y0, y1], '-+',
+                                     color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
+                         pl.show(block=True)
+ 
+                     valid_all = map_view['valid']
+                     pts3d = map_view['pts3d']
+ 
+                     WM_full, HM_full = map_view['rgb'].size
+                     map_rgb_tensor, map_K, map_to_orig_max, map_to_resize_max, (HM, WM) = resize_image_to_max(
+                         args.max_image_size, map_view['rgb'], map_view['intrinsics'])
+                     if WM_full != WM or HM_full != HM:
+                         y_full, x_full = torch.where(valid_all)
+                         pos2d_cv2 = torch.stack([x_full, y_full], dim=-1).cpu().numpy().astype(np.float64)
+                         sparse_pts3d = pts3d[y_full, x_full].cpu().numpy()
+                         _, _, pts3d_max, valid_max = rescale_points3d(
+                             pos2d_cv2, sparse_pts3d, map_to_resize_max, HM, WM)
+                         pts3d = torch.from_numpy(pts3d_max)
+                         valid_all = torch.from_numpy(valid_max)
+ 
+                     coarse_matches_im0 = geotrf(query_to_resize_max, coarse_matches_im0, norm=True)
+                     coarse_matches_im1 = geotrf(map_to_resize_max, coarse_matches_im1, norm=True)
+ 
+                     crops1, crops2 = [], []
+                     crops_v1, crops_p1 = [], []
+                     to_orig1, to_orig2 = [], []
+                     map_resolution = get_HW_resolution(HM, WM, maxdim=maxdim, patchsize=model.patch_embed.patch_size)
+ 
+                     for crop_q, crop_b, pair_tag in select_pairs_of_crops(map_rgb_tensor,
+                                                                           query_rgb_tensor,
+                                                                           coarse_matches_im1,
+                                                                           coarse_matches_im0,
+                                                                           maxdim=maxdim,
+                                                                           overlap=.5,
+                                                                           forced_resolution=[map_resolution,
+                                                                                              query_resolution]):
+                         # Per crop processing
+                         if not args.c2f_crop_with_homography:
+                             map_K = None
+                             query_K = None
+ 
+                         c1, v1, p1, trf1 = crop(map_rgb_tensor, valid_all, pts3d, crop_q, map_K)
+                         c2, _, _, trf2 = crop(query_rgb_tensor, None, None, crop_b, query_K)
+                         crops1.append(c1)
+                         crops2.append(c2)
+                         crops_v1.append(v1)
+                         crops_p1.append(p1)
+                         to_orig1.append(trf1)
+                         to_orig2.append(trf2)
+ 
+                     if len(crops1) == 0 or len(crops2) == 0:
+                         valid_pts3d, matches_im_query, matches_im_map, matches_conf = [], [], [], []
+                     else:
+                         crops1, crops2 = torch.stack(crops1), torch.stack(crops2)
+                         if len(crops1.shape) == 3:
+                             crops1, crops2 = crops1[None], crops2[None]
+                         crops_v1 = torch.stack(crops_v1)
+                         crops_p1 = torch.stack(crops_p1)
+                         to_orig1, to_orig2 = torch.stack(to_orig1), torch.stack(to_orig2)
+                         map_crop_view = dict(img=crops1.permute(0, 3, 1, 2),
+                                              instance=['1' for _ in range(crops1.shape[0])],
+                                              valid=crops_v1, pts3d=crops_p1,
+                                              to_orig=to_orig1)
+                         query_crop_view = dict(img=crops2.permute(0, 3, 1, 2),
+                                                instance=['2' for _ in range(crops2.shape[0])],
+                                                to_orig=to_orig2)
+ 
+                         # Inference and Matching
+                         valid_pts3d, matches_im_query, matches_im_map, matches_conf = fine_matching(query_crop_view,
+                                                                                                     map_crop_view,
+                                                                                                     model, device,
+                                                                                                     args.max_batch_size,
+                                                                                                     args.pixel_tol,
+                                                                                                     fast_nn_params)
+                         matches_im_query = geotrf(query_to_orig_max, matches_im_query, norm=True)
+                         matches_im_map = geotrf(map_to_orig_max, matches_im_map, norm=True)
+                 else:
+                     # use only valid 2d points
+                     valid_pts3d, matches_im_query, matches_im_map, matches_conf = coarse_matching(query_view, map_view,
+                                                                                                   model, device,
+                                                                                                   args.pixel_tol,
+                                                                                                   fast_nn_params)
+                 if cache_file is not None:
+                     mkdir_for(cache_file)
+                     np.savez(cache_file, valid_pts3d=valid_pts3d, matches_im_query=matches_im_query,
+                              matches_im_map=matches_im_map, matches_conf=matches_conf)
+ 
+             # apply confidence thresholding
+             if len(matches_conf) > 0:
+                 mask = matches_conf >= conf_thr
+                 valid_pts3d = valid_pts3d[mask]
+                 matches_im_query = matches_im_query[mask]
+                 matches_im_map = matches_im_map[mask]
+                 matches_conf = matches_conf[mask]
+ 
+             # visualize a few matches
+             if viz_matches > 0:
+                 num_matches = matches_im_map.shape[0]
+                 print(f'found {num_matches} matches')
+ 
+                 viz_imgs = [np.array(query_view['rgb']), np.array(map_view['rgb'])]
+                 from matplotlib import pyplot as pl
+                 n_viz = viz_matches
+                 match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int)
+                 viz_matches_im_query = matches_im_query[match_idx_to_viz]
+                 viz_matches_im_map = matches_im_map[match_idx_to_viz]
+ 
+                 H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2]
+                 img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+                 img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
+                 img = np.concatenate((img0, img1), axis=1)
+                 pl.figure()
+                 pl.imshow(img)
+                 cmap = pl.get_cmap('jet')
+                 for i in range(n_viz):
+                     (x0, y0), (x1, y1) = viz_matches_im_query[i].T, viz_matches_im_map[i].T
+                     pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
+                 pl.show(block=True)
+ 
+             if len(valid_pts3d) > 0:
+                 query_pts3d.append(valid_pts3d)
+                 query_pts2d.append(matches_im_query)
+ 
+         if len(query_pts2d) == 0:
+             success = False
+             pr_querycam_to_world = None
+         else:
+             query_pts2d = np.concatenate(query_pts2d, axis=0).astype(np.float32)
+             query_pts3d = np.concatenate(query_pts3d, axis=0)
+             if len(query_pts2d) > pnp_max_points:
+                 idxs = random.sample(range(len(query_pts2d)), pnp_max_points)
+                 query_pts3d = query_pts3d[idxs]
+                 query_pts2d = query_pts2d[idxs]
+ 
+             W, H = query_view['rgb'].size
+             if reprojection_error_diag_ratio is not None:
+                 reprojection_error_img = reprojection_error_diag_ratio * math.sqrt(W**2 + H**2)
+             else:
+                 reprojection_error_img = reprojection_error
+             success, pr_querycam_to_world = run_pnp(query_pts2d, query_pts3d,
+                                                     query_view['intrinsics'], query_view['distortion'],
+                                                     pnp_mode, reprojection_error_img, img_size=[W, H])
+ 
+         if not success:
+             abs_transl_error = float('inf')
+             abs_angular_error = float('inf')
+         else:
+             abs_transl_error, abs_angular_error = get_pose_error(pr_querycam_to_world, query_view['cam_to_world'])
+ 
+         pose_errors.append(abs_transl_error)
+         angular_errors.append(abs_angular_error)
+         poses_pred.append(pr_querycam_to_world)
+ 
+     xp_label = params_str + f'_conf_{conf_thr}'
+     if args.output_label:
+         xp_label = args.output_label + "_" + xp_label
+     if reprojection_error_diag_ratio is not None:
+         xp_label = xp_label + f'_reproj_diag_{reprojection_error_diag_ratio}'
+     else:
+         xp_label = xp_label + f'_reproj_err_{reprojection_error}'
+     export_results(args.output_dir, xp_label, query_names, poses_pred)
+     out_string = aggregate_stats(f'{args.dataset}', pose_errors, angular_errors)
+     print(out_string)
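
A hypothetical invocation of the script above (the --dataset expression is illustrative only; it is eval()-uated by the script and the real class names live in dust3r_visloc.datasets, while all other flags come from the parser defined above):

    import subprocess

    subprocess.run([
        'python3', 'visloc.py',
        '--model_name', 'MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric',
        '--dataset', "VislocAachenDayNight('/path/to/aachen', pairsfile='...', topk=20)",  # hypothetical
        '--pixel_tol', '5',
        '--coarse_to_fine', '--max_image_size', '1200',
        '--reprojection_error_diag_ratio', '0.008',
        '--output_dir', '/path/to/output',
    ], check=True)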