sengerchen committed
Commit 1bb1365 · verified · 1 Parent(s): e4c6df8

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +17 -0
  2. .gitignore +48 -0
  3. .gitmodules +3 -0
  4. .pre-commit-config.yaml +22 -0
  5. LICENSE +124 -0
  6. README.md +79 -8
  7. assets/advance/backyard-7_0.jpg +0 -0
  8. assets/advance/backyard-7_1.jpg +0 -0
  9. assets/advance/backyard-7_2.jpg +0 -0
  10. assets/advance/backyard-7_3.jpg +0 -0
  11. assets/advance/backyard-7_4.jpg +0 -0
  12. assets/advance/backyard-7_5.jpg +0 -0
  13. assets/advance/backyard-7_6.jpg +0 -0
  14. assets/advance/blue-car.jpg +3 -0
  15. assets/advance/garden-4_0.jpg +3 -0
  16. assets/advance/garden-4_1.jpg +3 -0
  17. assets/advance/garden-4_2.jpg +3 -0
  18. assets/advance/garden-4_3.jpg +3 -0
  19. assets/advance/telebooth-2_0.jpg +0 -0
  20. assets/advance/telebooth-2_1.jpg +0 -0
  21. assets/advance/vgg-lab-4_0.png +3 -0
  22. assets/advance/vgg-lab-4_1.png +3 -0
  23. assets/advance/vgg-lab-4_2.png +3 -0
  24. assets/advance/vgg-lab-4_3.png +3 -0
  25. assets/basic/blue-car.jpg +3 -0
  26. assets/basic/hilly-countryside.jpg +3 -0
  27. assets/basic/lily-dragon.png +3 -0
  28. assets/basic/llff-room.jpg +0 -0
  29. assets/basic/mountain-lake.jpg +0 -0
  30. assets/basic/vasedeck.jpg +0 -0
  31. assets/basic/vgg-lab-4_0.png +3 -0
  32. benchmark/README.md +156 -0
  33. benchmark/export_reconfusion_example.py +137 -0
  34. demo.py +407 -0
  35. demo_gr.py +1248 -0
  36. docs/CLI_USAGE.md +169 -0
  37. docs/GR_USAGE.md +76 -0
  38. docs/INSTALL.md +39 -0
  39. pyproject.toml +39 -0
  40. seva/__init__.py +0 -0
  41. seva/data_io.py +553 -0
  42. seva/eval.py +1990 -0
  43. seva/geometry.py +811 -0
  44. seva/gui.py +975 -0
  45. seva/model.py +234 -0
  46. seva/modules/__init__.py +0 -0
  47. seva/modules/autoencoder.py +51 -0
  48. seva/modules/conditioner.py +39 -0
  49. seva/modules/layers.py +139 -0
  50. seva/modules/preprocessor.py +116 -0
.gitattributes CHANGED
@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/advance/blue-car.jpg filter=lfs diff=lfs merge=lfs -text
37
+ assets/advance/garden-4_0.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/advance/garden-4_1.jpg filter=lfs diff=lfs merge=lfs -text
39
+ assets/advance/garden-4_2.jpg filter=lfs diff=lfs merge=lfs -text
40
+ assets/advance/garden-4_3.jpg filter=lfs diff=lfs merge=lfs -text
41
+ assets/advance/vgg-lab-4_0.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/advance/vgg-lab-4_1.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/advance/vgg-lab-4_2.png filter=lfs diff=lfs merge=lfs -text
44
+ assets/advance/vgg-lab-4_3.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/basic/blue-car.jpg filter=lfs diff=lfs merge=lfs -text
46
+ assets/basic/hilly-countryside.jpg filter=lfs diff=lfs merge=lfs -text
47
+ assets/basic/lily-dragon.png filter=lfs diff=lfs merge=lfs -text
48
+ assets/basic/vgg-lab-4_0.png filter=lfs diff=lfs merge=lfs -text
49
+ third_party/dust3r/assets/demo.jpg filter=lfs diff=lfs merge=lfs -text
50
+ third_party/dust3r/assets/matching.jpg filter=lfs diff=lfs merge=lfs -text
51
+ third_party/dust3r/croco/assets/Chateau1.png filter=lfs diff=lfs merge=lfs -text
52
+ third_party/dust3r/croco/assets/Chateau2.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,48 @@
1
+ .envrc
2
+ .venv/
3
+ .gradio/
4
+ work_dirs*
5
+
6
+ # Byte-compiled files
7
+ __pycache__/
8
+ *.py[cod]
9
+
10
+ # Virtual environments
11
+ env/
12
+ venv/
13
+ ENV/
14
+ .VENV/
15
+
16
+ # Distribution files
17
+ build/
18
+ dist/
19
+ *.egg-info/
20
+
21
+ # Logs and temporary files
22
+ *.log
23
+ *.tmp
24
+ *.bak
25
+ *.swp
26
+
27
+ # IDE files
28
+ .idea/
29
+ .vscode/
30
+ *.sublime-workspace
31
+ *.sublime-project
32
+
33
+ # OS files
34
+ .DS_Store
35
+ Thumbs.db
36
+
37
+ # Testing and coverage
38
+ htmlcov/
39
+ .coverage
40
+ *.cover
41
+ *.py,cover
42
+ .cache/
43
+
44
+ # Jupyter Notebook checkpoints
45
+ .ipynb_checkpoints/
46
+
47
+ # Pre-commit hooks
48
+ .pre-commit-config.yaml~
.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "third_party/dust3r"]
2
+ path = third_party/dust3r
3
+ url = https://github.com/jensenstability/dust3r
.pre-commit-config.yaml ADDED
@@ -0,0 +1,22 @@
1
+ default_language_version:
2
+ python: python3
3
+ default_stages: [pre-commit]
4
+ repos:
5
+ - repo: https://github.com/pre-commit/pre-commit-hooks
6
+ rev: v5.0.0
7
+ hooks:
8
+ - id: trailing-whitespace
9
+ - id: end-of-file-fixer
10
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
11
+ rev: v0.8.3
12
+ hooks:
13
+ - id: ruff
14
+ types_or: [python, pyi, jupyter]
15
+ args: [--fix, --extend-ignore=E402]
16
+ - id: ruff-format
17
+ types_or: [python, pyi, jupyter]
18
+ - repo: https://github.com/pre-commit/mirrors-prettier
19
+ rev: v3.1.0
20
+ hooks:
21
+ - id: prettier
22
+ types_or: [markdown]
LICENSE ADDED
@@ -0,0 +1,124 @@
1
+ Stability AI Non-Commercial License Agreement
2
+ Last Updated: February 20, 2025
3
+
4
+ I. INTRODUCTION
5
+
6
+ This Stability AI Non-Commercial License Agreement (the “Agreement”) applies to any individual person or entity
7
+ (“You”, “Your” or “Licensee”) that uses or distributes any portion or element of the Stability AI Materials or
8
+ Derivative Works thereof for any Research & Non-Commercial use. Capitalized terms not otherwise defined herein
9
+ are defined in Section IV below.
10
+
11
+ This Agreement is intended to allow research and non-commercial uses of the Model free of charge.
12
+
13
+ By clicking “I Accept” or by using or distributing any portion or element of the Stability Materials
14
+ or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement.
15
+
16
+ If You are acting on behalf of a company, organization, or other entity, then “You” includes you and that entity,
17
+ and You agree that You:
18
+ (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and
19
+ (ii) You agree to the terms of this Agreement on that entity’s behalf.
20
+
21
+ ---
22
+
23
+ II. RESEARCH & NON-COMMERCIAL USE LICENSE
24
+
25
+ Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable,
26
+ non-sublicensable, revocable, and royalty-free limited license under Stability AI’s intellectual property or other
27
+ rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create
28
+ Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose.
29
+
30
+ - **“Research Purpose”** means academic or scientific advancement, and in each case, is not primarily intended
31
+ for commercial advantage or monetary compensation to You or others.
32
+ - **“Non-Commercial Purpose”** means any purpose other than a Research Purpose that is not primarily intended
33
+ for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist)
34
+ or evaluation and testing.
35
+
36
+ ---
37
+
38
+ III. GENERAL TERMS
39
+
40
+ Your Research or Non-Commercial license under this Agreement is subject to the following terms.
41
+
42
+ ### a. Distribution & Attribution
43
+ If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product
44
+ or service that uses any portion of them, You shall:
45
+ 1. Provide a copy of this Agreement to that third party.
46
+ 2. Retain the following attribution notice within a **"Notice"** text file distributed as a part of such copies:
47
+
48
+ **"This Stability AI Model is licensed under the Stability AI Non-Commercial License,
49
+ Copyright © Stability AI Ltd. All Rights Reserved."**
50
+
51
+ 3. Prominently display **“Powered by Stability AI”** on a related website, user interface, blog post,
52
+ about page, or product documentation.
53
+ 4. If You create a Derivative Work, You may add your own attribution notice(s) to the **"Notice"** text file
54
+ included with that Derivative Work, provided that You clearly indicate which attributions apply to the
55
+ Stability AI Materials and state in the **"Notice"** text file that You changed the Stability AI Materials
56
+ and how it was modified.
57
+
58
+ ### b. Use Restrictions
59
+ Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability
60
+ AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control
61
+ Laws and equivalent regulations) and adhere to the Documentation and Stability AI’s AUP, which is hereby
62
+ incorporated by reference.
63
+
64
+ Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the
65
+ Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model
66
+ (excluding the Model or Derivative Works).
67
+
68
+ ### c. Intellectual Property
69
+
70
+ #### (i) Trademark License
71
+ No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials
72
+ or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of
73
+ its Affiliates, except as required under Section IV(a) herein.
74
+
75
+ #### (ii) Ownership of Derivative Works
76
+ As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI’s
77
+ ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.
78
+
79
+ #### (iii) Ownership of Outputs
80
+ As between You and Stability AI, You own any outputs generated from the Model or Derivative Works to the extent
81
+ permitted by applicable law.
82
+
83
+ #### (iv) Disputes
84
+ If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works, or
86
+ associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual
87
+ property or other rights owned or licensable by You, then any licenses granted to You under this Agreement
88
+ shall terminate as of the date such litigation or claim is filed or instituted.
89
+
90
+ You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out
91
+ of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of
92
+ this Agreement.
93
+
94
+ #### (v) Feedback
95
+ From time to time, You may provide Stability AI with verbal and/or written suggestions, comments, or other
96
+ feedback related to Stability AI’s existing or prospective technology, products, or services (collectively,
97
+ “Feedback”).
98
+
99
+ You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant
100
+ Stability AI a **perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive,
101
+ worldwide right and license** to exploit the Feedback in any manner without restriction.
102
+
103
+ Your Feedback is provided **“AS IS”** and You make no warranties whatsoever about any Feedback.
104
+
105
+ ---
106
+
107
+ IV. DEFINITIONS
108
+
109
+ - **“Affiliate(s)”** means any entity that directly or indirectly controls, is controlled by, or is under common
110
+ control with the subject entity. For purposes of this definition, “control” means direct or indirect ownership
111
+ or control of more than 50% of the voting interests of the subject entity.
112
+ - **“AUP”** means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may
113
+ be updated from time to time.
114
+ - **"Derivative Work(s)"** means:
115
+ (a) Any derivative work of the Stability AI Materials as recognized by U.S. copyright laws.
116
+ (b) Any modifications to a Model, and any other model created which is based on or derived from the Model or
117
+ the Model’s output, including **fine-tune** and **low-rank adaptation** models derived from a Model or
118
+ a Model’s output, but does not include the output of any Model.
119
+ - **“Model”** means Stability AI’s Stable Virtual Camera model.
120
+ - **"Stability AI" or "we"** means Stability AI Ltd. and its Affiliates.
121
+ - **"Software"** means Stability AI’s proprietary software made available under this Agreement now or in the future.
122
+ - **“Stability AI Materials”** means, collectively, Stability’s proprietary Model, Software, and Documentation
123
+ (and any portion or combination thereof) made available under this Agreement.
124
+ - **“Trade Control Laws”** means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
README.md CHANGED
@@ -1,12 +1,83 @@
1
  ---
2
- title: Stable Virtual Camera
3
- emoji: 💻
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.22.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: stable-virtual-camera
3
+ app_file: demo_gr.py
4
  sdk: gradio
5
+ sdk_version: 5.20.1
6
  ---
7
+ # Stable Virtual Camera
8
 
9
+ <a href="https://stable-virtual-camera.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a>
10
+ <a href="http://arxiv.org/abs/2503.14489"><img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv-2503.14489-B31B1B.svg"></a>
11
+ <a href="https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control"><img src="https://img.shields.io/badge/%F0%9F%93%83%20Blog-Stability%20AI-orange.svg"></a>
12
+ <a href="https://huggingface.co/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a>
13
+ <a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a>
14
+ <a href="https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ"><img src="https://img.shields.io/badge/%F0%9F%8E%AC%20Video-YouTube-orange"></a>
15
+
16
+ `Stable Virtual Camera (Seva)` is a 1.3B generalist diffusion model for Novel View Synthesis (NVS), generating 3D consistent novel views of a scene, given any number of input views and target cameras.
17
+
18
+ # :tada: News
19
+
20
+ - March 2025 - `Stable Virtual Camera` is out everywhere.
21
+
22
+ # :wrench: Installation
23
+
24
+ ```bash
25
+ git clone --recursive https://github.com/Stability-AI/stable-virtual-camera
26
+ cd stable-virtual-camera
27
+ pip install -e .
28
+ ```
29
+
30
+ Please note that you will need `python>=3.10` and `torch>=2.6.0`.
31
+
32
+ Check [INSTALL.md](docs/INSTALL.md) for other dependencies if you want to use our demos or develop from this repo.
33
+ For Windows users, please use WSL, as flash attention isn't supported on native Windows [yet](https://github.com/pytorch/pytorch/issues/108175).
34
+
35
+ # :open_book: Usage
36
+
37
+ You need to authenticate with Hugging Face to download our model weights. Once set up, our code will handle the download automatically on your first run. You can authenticate by running
38
+
39
+ ```bash
40
+ # This will prompt you to enter your Hugging Face credentials.
41
+ huggingface-cli login
42
+ ```
43
+
44
+ Once authenticated, go to our model card [here](https://huggingface.co/stabilityai/stable-virtual-camera) and enter your information for access.
45
+
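If you prefer non-interactive authentication (e.g., on a remote machine or in CI), `huggingface_hub` also accepts a token programmatically; this is an alternative to the command above, not part of the original instructions. A minimal sketch, assuming your access token is exported as `HF_TOKEN`:

```python
# Minimal sketch: non-interactive Hugging Face authentication.
# Assumes an access token is stored in the HF_TOKEN environment variable.
import os

from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # same effect as `huggingface-cli login`
```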
46
+ We provide two demos for interacting with `Stable Virtual Camera`.
47
+
48
+ ### :rocket: Gradio demo
49
+
50
+ This Gradio demo is a GUI that requires no specialized knowledge and is suitable for general users. Simply run
51
+
52
+ ```bash
53
+ python demo_gr.py
54
+ ```
55
+
56
+ For a more detailed guide, follow [GR_USAGE.md](docs/GR_USAGE.md).
57
+
58
+ ### :computer: CLI demo
59
+
60
+ This CLI demo lets you pass in more options and control the model in a fine-grained way, making it suitable for power users and academic researchers. An example command looks as simple as
61
+
62
+ ```bash
63
+ python demo.py --data_path <data_path> [additional arguments]
64
+ ```
65
+
66
+ For a more detailed guide, follow [CLI_USAGE.md](docs/CLI_USAGE.md).
67
+
68
+ For users interested in benchmarking NVS models from the command line, check [`benchmark`](benchmark/), which contains details about the scenes, splits, and input/target views reported in the <a href="http://arxiv.org/abs/2503.14489">paper</a>.
69
+
70
+ # :books: Citing
71
+
72
+ If you find this repository useful, please consider giving it a star :star: and a citation.
73
+
74
+ ```
75
+ @article{zhou2025stable,
76
+ title={Stable Virtual Camera: Generative View Synthesis with Diffusion Models},
77
+ author={Jensen (Jinghao) Zhou and Hang Gao and Vikram Voleti and Aaryaman Vasishta and Chun-Han Yao and Mark Boss and
78
+ Philip Torr and Christian Rupprecht and Varun Jampani
79
+ },
80
+ journal={arXiv preprint},
81
+ year={2025}
82
+ }
83
+ ```
assets/advance/backyard-7_0.jpg ADDED
assets/advance/backyard-7_1.jpg ADDED
assets/advance/backyard-7_2.jpg ADDED
assets/advance/backyard-7_3.jpg ADDED
assets/advance/backyard-7_4.jpg ADDED
assets/advance/backyard-7_5.jpg ADDED
assets/advance/backyard-7_6.jpg ADDED
assets/advance/blue-car.jpg ADDED

Git LFS Details

  • SHA256: 0cf493d0f738830223949fd24bb3ab0a1c078804fdb744efa95a1fdcfcfb5332
  • Pointer size: 131 Bytes
  • Size of remote file: 106 kB
assets/advance/garden-4_0.jpg ADDED

Git LFS Details

  • SHA256: 38fbe78f699fc84a1f4268ef8bacef9ddacfd32e9eb8fbcb605e46cfd52b988e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/advance/garden-4_1.jpg ADDED

Git LFS Details

  • SHA256: 1975effeffc9b2011a28f6eb04d1b0bd2f37f765c194249c95e6b3783d698a42
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/advance/garden-4_2.jpg ADDED

Git LFS Details

  • SHA256: 4112ff5f2ceaa3b469bb402853e7cde10396f858e5a2ceba93b095e1e3d8d335
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/advance/garden-4_3.jpg ADDED

Git LFS Details

  • SHA256: a750b648c389f78f2f6b26d78f753eace13a41d355f725850c2667f864f709cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.06 MB
assets/advance/telebooth-2_0.jpg ADDED
assets/advance/telebooth-2_1.jpg ADDED
assets/advance/vgg-lab-4_0.png ADDED

Git LFS Details

  • SHA256: d1442eb509af02273cf7168f5212b3221142df4db99991b38395f42f8b239960
  • Pointer size: 131 Bytes
  • Size of remote file: 412 kB
assets/advance/vgg-lab-4_1.png ADDED

Git LFS Details

  • SHA256: c2bb10b9574247ceb0948aa00afea588f001f0271f51908b8132d63587fc43d0
  • Pointer size: 131 Bytes
  • Size of remote file: 443 kB
assets/advance/vgg-lab-4_2.png ADDED

Git LFS Details

  • SHA256: 7fa884bb6d783fd9385bd38042f3461f430bec8311e7b2171474b6a906538030
  • Pointer size: 131 Bytes
  • Size of remote file: 410 kB
assets/advance/vgg-lab-4_3.png ADDED

Git LFS Details

  • SHA256: 99469f816604c92c9c27a7cff119cb3649d3dfa4c41dcef89525b7b3cbd885a4
  • Pointer size: 131 Bytes
  • Size of remote file: 475 kB
assets/basic/blue-car.jpg ADDED

Git LFS Details

  • SHA256: 0cf493d0f738830223949fd24bb3ab0a1c078804fdb744efa95a1fdcfcfb5332
  • Pointer size: 131 Bytes
  • Size of remote file: 106 kB
assets/basic/hilly-countryside.jpg ADDED

Git LFS Details

  • SHA256: 4ae3b8cb5d989b62ceaf4930afea55790048657fa459f383f8bd809b3bdcfca0
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
assets/basic/lily-dragon.png ADDED

Git LFS Details

  • SHA256: c545057ee2feeced73566f708311bf758350ef0ded844d7bd438e48fca7f5bd2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.57 MB
assets/basic/llff-room.jpg ADDED
assets/basic/mountain-lake.jpg ADDED
assets/basic/vasedeck.jpg ADDED
assets/basic/vgg-lab-4_0.png ADDED

Git LFS Details

  • SHA256: d1442eb509af02273cf7168f5212b3221142df4db99991b38395f42f8b239960
  • Pointer size: 131 Bytes
  • Size of remote file: 412 kB
benchmark/README.md ADDED
@@ -0,0 +1,156 @@
1
+ # :bar_chart: Benchmark
2
+
3
+ We provide <a href="https://github.com/Stability-AI/stable-virtual-camera/releases/tag/benchmark">in this release</a> (`benchmark.zip`) the following 17 entries as a benchmark to evaluate NVS models.
4
+ We hope this will help standardize the evaluation of NVS models and facilitate fair comparison between different methods.
5
+
6
+ <table>
7
+ <thead>
8
+ <tr>
9
+ <th align="center">Dataset</th>
10
+ <th align="center">Split</th>
11
+ <th align="center">Path</th>
12
+ <th align="center">Content</th>
13
+ <th align="center">Image Preprocessing</th>
14
+ <th align="center">Image Postprocessing</th>
15
+ </tr>
16
+ </thead>
17
+ <tbody>
18
+ <tr>
19
+ <td align="center">OmniObject3D</td>
20
+ <td align="center"><code>S</code> (SV3D), <code>O</code> (Ours) </td>
21
+ <td align="center"><code>omniobject3d</code></td>
22
+ <td align="center"><code>train_test_split_*.json</code></td>
23
+ <td align="center">center crop to 576</td>
24
+ <td align="center">\</td>
25
+ </tr>
26
+ <tr>
27
+ <td align="center">GSO</td>
28
+ <td align="center"><code>S</code> (SV3D), <code>O</code> (Ours) </td>
29
+ <td align="center"><code>gso</code></td>
30
+ <td align="center"><code>train_test_split_*.json</code></td>
31
+ <td align="center">center crop to 576</td>
32
+ <td align="center">\</td>
33
+ </tr>
34
+ <tr>
35
+ <td align="center" rowspan="4">RealEstate10K</td>
36
+ <td align="center"><code>D</code> (4DiM) </td>
37
+ <td align="center"><code>re10k-4dim</code></td>
38
+ <td align="center"><code>train_test_split_*.json</code></td>
39
+ <td align="center">center crop to 576</td>
40
+ <td align="center">resize to 256</td>
41
+ </tr>
42
+ <tr>
43
+ <td align="center"><code>R</code> (ReconFusion) </td>
44
+ <td align="center"><code>re10k</code></td>
45
+ <td align="center"><code>train_test_split_*.json</code></td>
46
+ <td align="center">center crop to 576</td>
47
+ <td align="center">\</td>
48
+ </tr>
49
+ <tr>
50
+ <td align="center"><code>P</code> (pixelSplat) </td>
51
+ <td align="center"><code>re10k-pixelsplat</code></td>
52
+ <td align="center"><code>train_test_split_*.json</code></td>
53
+ <td align="center">center crop to 576</td>
54
+ <td align="center">resize to 256</td>
55
+ </tr>
56
+ <tr>
57
+ <td align="center"><code>V</code> (ViewCrafter) </td>
58
+ <td align="center"><code>re10k-viewcrafter</code></td>
59
+ <td align="center"><code>images/*.png</code>,<code>transforms.json</code>,<code>train_test_split_*.json</code></td>
60
+ <td align="center">resize the shortest side to 576 (<code>--L_short 576</code>)</td>
61
+ <td align="center">center crop</td>
62
+ </tr>
63
+ <tr>
64
+ <td align="center">LLFF</td>
65
+ <td align="center"><code>R</code> (ReconFusion) </td>
66
+ <td align="center"><code>llff</code></td>
67
+ <td align="center"><code>train_test_split_*.json</code></td>
68
+ <td align="center">center crop to 576</td>
69
+ <td align="center">\</td>
70
+ </tr>
71
+ <tr>
72
+ <td align="center">DTU</td>
73
+ <td align="center"><code>R</code> (ReconFusion) </td>
74
+ <td align="center"><code>dtu</code></td>
75
+ <td align="center"><code>train_test_split_*.json</code></td>
76
+ <td align="center">center crop to 576</td>
77
+ <td align="center">\</td>
78
+ </tr>
79
+ <tr>
80
+ <td align="center" rowspan="2">CO3D</td>
81
+ <td align="center"><code>R</code> (ReconFusion) </td>
82
+ <td align="center"><code>co3d</code></td>
83
+ <td align="center"><code>train_test_split_*.json</code></td>
84
+ <td align="center">center crop to 576</td>
85
+ <td align="center">\</td>
86
+ </tr>
87
+ <tr>
88
+ <td align="center"><code>V</code> (ViewCrafter) </td>
89
+ <td align="center"><code>co3d-viewcrafter</code></td>
90
+ <td align="center"><code>images/*.png</code>,<code>transforms.json</code>,<code>train_test_split_*.json</code></td>
91
+ <td align="center">resize the shortest side to 576 (<code>--L_short 576</code>)</td>
92
+ <td align="center">center crop</td>
93
+ </tr>
94
+ <tr>
95
+ <td align="center" rowspan="2" >WildRGB-D</td>
96
+ <td align="center"><code>Oₑ</code> (Ours, easy) </td>
97
+ <td align="center"><code>wildgbd/easy</code></td>
98
+ <td align="center"><code>train_test_split_*.json</code></td>
99
+ <td align="center">center crop to 576</td>
100
+ <td align="center">\</td>
101
+ </tr>
102
+ <tr>
103
+ <td align="center"><code>Oₕ</code> (Ours, hard) </td>
104
+ <td align="center"><code>wildgbd/hard</code></td>
105
+ <td align="center"><code>train_test_split_*.json</code></td>
106
+ <td align="center">center crop to 576</td>
107
+ <td align="center">\</td>
108
+ </tr>
109
+ <tr>
110
+ <td align="center">Mip-NeRF360</td>
111
+ <td align="center"><code>R</code> (ReconFusion) </td>
112
+ <td align="center"><code>mipnerf360</code></td>
113
+ <td align="center"><code>train_test_split_*.json</code></td>
114
+ <td align="center">center crop to 576</td>
115
+ <td align="center">\</td>
116
+ </tr>
117
+ <tr>
118
+ <td align="center" rowspan="2">DL3DV-140</td>
119
+ <td align="center"><code>O</code> (Ours) </td>
120
+ <td align="center"><code>dl3dv10</code></td>
121
+ <td align="center"><code>train_test_split_*.json</code></td>
122
+ <td align="center">center crop to 576</td>
123
+ <td align="center">\</td>
124
+ </tr>
125
+ <tr>
126
+ <td align="center"><code>L</code> (Long-LRM) </td>
127
+ <td align="center"><code>dl3dv140</code></td>
128
+ <td align="center"><code>train_test_split_*.json</code></td>
129
+ <td align="center">center crop to 576</td>
130
+ <td align="center">\</td>
131
+ </tr>
132
+ <tr>
133
+ <td align="center" rowspan="2">Tanks and Temples</td>
134
+ <td align="center"><code>V</code> (ViewCrafter) </td>
135
+ <td align="center"><code>tnt-viewcrafter</code></td>
136
+ <td align="center"><code>images/*.png</code>,<code>transforms.json</code>,<code>train_test_split_*.json</code></td>
137
+ <td align="center">resize the shortest side to 576 (<code>--L_short 576</code>)</td>
138
+ <td align="center">center crop</td>
139
+ </tr>
140
+ <tr>
141
+ <td align="center"><code>L</code> (Long-LRM) </td>
142
+ <td align="center"><code>tnt-longlrm</code></td>
143
+ <td align="center"><code>train_test_split_*.json</code></td>
144
+ <td align="center">center crop to 576</td>
145
+ <td align="center">\</td>
146
+ </tr>
147
+ </tbody>
148
+ </table>
149
+
150
+ - For entries without `images/*.png` and `transforms.json`, we use the images from the original dataset after converting them into the `reconfusion` format, which is then parsable by `ReconfusionParser` (`seva/data_io.py`).
151
+ Please note that during this conversion, you should sort the images by `sorted(image_paths)` so that the sorted list is directly indexable by our train/test ids (see the sketch after this list). We provide in `benchmark/export_reconfusion_example.py` an example script converting an existing academic dataset into the scene folders.
152
+ - For evaluation and benchmarking, we first apply the operations in the `Image Preprocessing` column to the model input and then the operations in the `Image Postprocessing` column to the model output. The final processed samples are used for metric computation.
153
+
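For illustration, here is a minimal sketch of consuming a converted scene folder. It assumes the layout described above (an `images/` directory plus a `train_test_split_*.json` containing `train_ids`/`test_ids`, as written by `benchmark/export_reconfusion_example.py`); the scene path and split filename below are placeholders.

```python
# Minimal sketch (hypothetical paths): index sorted images by train/test ids.
import glob
import json
import os

scene_dir = "/path/to/scene"  # placeholder scene folder
split_path = os.path.join(scene_dir, "train_test_split_3.json")  # e.g. 3 input views

with open(split_path) as f:
    split = json.load(f)

# Images must be sorted so that the split ids index them consistently.
image_paths = sorted(glob.glob(os.path.join(scene_dir, "images", "*.png")))

input_paths = [image_paths[i] for i in split["train_ids"]]
target_paths = [image_paths[i] for i in split["test_ids"]]
print(f"{len(input_paths)} input views, {len(target_paths)} target views")
```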
154
+ ## Acknowledgment
155
+
156
+ We would like to thank Wangbo Yu, Aleksander Hołyński, Saurabh Saxena, and Ziwen Chen for their kind clarification on experiment settings.
benchmark/export_reconfusion_example.py ADDED
@@ -0,0 +1,137 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import numpy as np
6
+ from PIL import Image
7
+
8
+ try:
9
+ from sklearn.cluster import KMeans # type: ignore[import]
10
+ except ImportError:
11
+ print("Please install scikit-learn to use this script.")
12
+ exit(1)
13
+
14
+ # Define the folder containing the image and JSON files
15
+ subfolder = "/path/to/your/dataset"
16
+ output_file = os.path.join(subfolder, "transforms.json")
17
+
18
+ # List to hold the frames
19
+ frames = []
20
+
21
+ # Iterate over the files in the folder
22
+ for file in sorted(os.listdir(subfolder)):
23
+ if file.endswith(".json"):
24
+ # Read the JSON file containing camera extrinsics and intrinsics
25
+ json_path = os.path.join(subfolder, file)
26
+ with open(json_path, "r") as f:
27
+ data = json.load(f)
28
+
29
+ # Read the corresponding image file
30
+ image_file = file.replace(".json", ".png")
31
+ image_path = os.path.join(subfolder, image_file)
32
+ if not os.path.exists(image_path):
33
+ print(f"Image file not found for {file}, skipping...")
34
+ continue
35
+ with Image.open(image_path) as img:
36
+ w, h = img.size
37
+
38
+ # Extract and normalize intrinsic matrix K
39
+ K = data["K"]
40
+ fx = K[0][0] * w
41
+ fy = K[1][1] * h
42
+ cx = K[0][2] * w
43
+ cy = K[1][2] * h
44
+
45
+ # Extract the transformation matrix
46
+ transform_matrix = np.array(data["c2w"])
47
+ # Adjust for OpenGL convention
48
+ transform_matrix[..., [1, 2]] *= -1
49
+
50
+ # Add the frame data
51
+ frames.append(
52
+ {
53
+ "fl_x": fx,
54
+ "fl_y": fy,
55
+ "cx": cx,
56
+ "cy": cy,
57
+ "w": w,
58
+ "h": h,
59
+ "file_path": f"./{os.path.relpath(image_path, subfolder)}",
60
+ "transform_matrix": transform_matrix.tolist(),
61
+ }
62
+ )
63
+
64
+ # Create the output dictionary
65
+ transforms_data = {"orientation_override": "none", "frames": frames}
66
+
67
+ # Write to the transforms.json file
68
+ with open(output_file, "w") as f:
69
+ json.dump(transforms_data, f, indent=4)
70
+
71
+ print(f"transforms.json generated at {output_file}")
72
+
73
+
74
+ # Train-test split function using K-means clustering with stride
75
+ def create_train_test_split(frames, n, output_path, stride):
76
+ # Prepare the data for K-means
77
+ positions = []
78
+ for frame in frames:
79
+ transform_matrix = np.array(frame["transform_matrix"])
80
+ position = transform_matrix[:3, 3] # 3D camera position
81
+ direction = transform_matrix[:3, 2] / np.linalg.norm(
82
+ transform_matrix[:3, 2]
83
+ ) # Normalized 3D direction
84
+ positions.append(np.concatenate([position, direction]))
85
+
86
+ positions = np.array(positions)
87
+
88
+ # Apply K-means clustering
89
+ kmeans = KMeans(n_clusters=n, random_state=42)
90
+ kmeans.fit(positions)
91
+ centers = kmeans.cluster_centers_
92
+
93
+ # Find the index closest to each cluster center
94
+ train_ids = []
95
+ for center in centers:
96
+ distances = np.linalg.norm(positions - center, axis=1)
97
+ train_ids.append(int(np.argmin(distances))) # Convert to Python int
98
+
99
+ # Remaining indices as test_ids, applying stride
100
+ all_indices = set(range(len(frames)))
101
+ remaining_indices = sorted(all_indices - set(train_ids))
102
+ test_ids = [
103
+ int(idx) for idx in remaining_indices[::stride]
104
+ ] # Convert to Python int
105
+
106
+ # Create the split data
107
+ split_data = {"train_ids": sorted(train_ids), "test_ids": test_ids}
108
+
109
+ with open(output_path, "w") as f:
110
+ json.dump(split_data, f, indent=4)
111
+
112
+ print(f"Train-test split file generated at {output_path}")
113
+
114
+
115
+ # Parse arguments
116
+ if __name__ == "__main__":
117
+ parser = argparse.ArgumentParser(
118
+ description="Generate train-test split JSON file using K-means clustering."
119
+ )
120
+ parser.add_argument(
121
+ "--n",
122
+ type=int,
123
+ required=True,
124
+ help="Number of frames to include in the training set.",
125
+ )
126
+ parser.add_argument(
127
+ "--stride",
128
+ type=int,
129
+ default=1,
130
+ help="Stride for selecting test frames from the remaining non-train frames (does not affect the K-means train selection).",
131
+ )
132
+
133
+ args = parser.parse_args()
134
+
135
+ # Create train-test split
136
+ train_test_split_path = os.path.join(subfolder, f"train_test_split_{args.n}.json")
137
+ create_train_test_split(frames, args.n, train_test_split_path, args.stride)
demo.py ADDED
@@ -0,0 +1,407 @@
1
+ import glob
2
+ import os
3
+ import os.path as osp
4
+
5
+ import fire
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+
12
+ from seva.data_io import get_parser
13
+ from seva.eval import (
14
+ IS_TORCH_NIGHTLY,
15
+ compute_relative_inds,
16
+ create_transforms_simple,
17
+ infer_prior_inds,
18
+ infer_prior_stats,
19
+ run_one_scene,
20
+ )
21
+ from seva.geometry import (
22
+ generate_interpolated_path,
23
+ generate_spiral_path,
24
+ get_arc_horizontal_w2cs,
25
+ get_default_intrinsics,
26
+ get_lookat,
27
+ get_preset_pose_fov,
28
+ )
29
+ from seva.model import SGMWrapper
30
+ from seva.modules.autoencoder import AutoEncoder
31
+ from seva.modules.conditioner import CLIPConditioner
32
+ from seva.sampling import DDPMDiscretization, DiscreteDenoiser
33
+ from seva.utils import load_model
34
+
35
+ device = "cuda:0"
36
+
37
+
38
+ # Constants.
39
+ WORK_DIR = "work_dirs/demo"
40
+
41
+ if IS_TORCH_NIGHTLY:
42
+ COMPILE = True
43
+ os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "1"
44
+ os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
45
+ else:
46
+ COMPILE = False
47
+
48
+ MODEL = SGMWrapper(load_model(device="cpu", verbose=True).eval()).to(device)
49
+ AE = AutoEncoder(chunk_size=1).to(device)
50
+ CONDITIONER = CLIPConditioner().to(device)
51
+ DISCRETIZATION = DDPMDiscretization()
52
+ DENOISER = DiscreteDenoiser(discretization=DISCRETIZATION, num_idx=1000, device=device)
53
+ VERSION_DICT = {
54
+ "H": 576,
55
+ "W": 576,
56
+ "T": 21,
57
+ "C": 4,
58
+ "f": 8,
59
+ "options": {},
60
+ }
61
+
62
+ if COMPILE:
63
+ MODEL = torch.compile(MODEL, dynamic=False)
64
+ CONDITIONER = torch.compile(CONDITIONER, dynamic=False)
65
+ AE = torch.compile(AE, dynamic=False)
66
+
67
+
68
+ def parse_task(
69
+ task,
70
+ scene,
71
+ num_inputs,
72
+ T,
73
+ version_dict,
74
+ ):
75
+ options = version_dict["options"]
76
+
77
+ anchor_indices = None
78
+ anchor_c2ws = None
79
+ anchor_Ks = None
80
+
81
+ if task == "img2trajvid_s-prob":
82
+ if num_inputs is not None:
83
+ assert (
84
+ num_inputs == 1
85
+ ), "Task `img2trajvid_s-prob` only support 1-view conditioning..."
86
+ else:
87
+ num_inputs = 1
88
+ num_targets = options.get("num_targets", T - 1)
89
+ num_anchors = infer_prior_stats(
90
+ T,
91
+ num_inputs,
92
+ num_total_frames=num_targets,
93
+ version_dict=version_dict,
94
+ )
95
+
96
+ input_indices = [0]
97
+ anchor_indices = np.linspace(1, num_targets, num_anchors).tolist()
98
+
99
+ all_imgs_path = [scene] + [None] * num_targets
100
+
101
+ c2ws, fovs = get_preset_pose_fov(
102
+ option=options.get("traj_prior", "orbit"),
103
+ num_frames=num_targets + 1,
104
+ start_w2c=torch.eye(4),
105
+ look_at=torch.Tensor([0, 0, 10]),
106
+ )
107
+
108
+ with Image.open(scene) as img:
109
+ W, H = img.size
110
+ aspect_ratio = W / H
111
+ Ks = get_default_intrinsics(fovs, aspect_ratio=aspect_ratio) # normalized
112
+ Ks[:, :2] *= (
113
+ torch.tensor([W, H]).reshape(1, -1, 1).repeat(Ks.shape[0], 1, 1)
114
+ ) # unnormalized (pixel units)
115
+ Ks = Ks.numpy()
116
+
117
+ anchor_c2ws = c2ws[[round(ind) for ind in anchor_indices]]
118
+ anchor_Ks = Ks[[round(ind) for ind in anchor_indices]]
119
+
120
+ else:
121
+ parser = get_parser(
122
+ parser_type="reconfusion",
123
+ data_dir=scene,
124
+ normalize=False,
125
+ )
126
+ all_imgs_path = parser.image_paths
127
+ c2ws = parser.camtoworlds
128
+ camera_ids = parser.camera_ids
129
+ Ks = np.concatenate([parser.Ks_dict[cam_id][None] for cam_id in camera_ids], 0)
130
+
131
+ if num_inputs is None:
132
+ assert len(parser.splits_per_num_input_frames.keys()) == 1
133
+ num_inputs = list(parser.splits_per_num_input_frames.keys())[0]
134
+ split_dict = parser.splits_per_num_input_frames[num_inputs] # type: ignore
135
+ elif isinstance(num_inputs, str):
136
+ split_dict = parser.splits_per_num_input_frames[num_inputs] # type: ignore
137
+ num_inputs = int(num_inputs.split("-")[0]) # for example 1_from32
138
+ else:
139
+ split_dict = parser.splits_per_num_input_frames[num_inputs] # type: ignore
140
+
141
+ num_targets = len(split_dict["test_ids"])
142
+
143
+ if task == "img2img":
144
+ # Note: in this setting, we should refrain from using any camera info other
145
+ # than that from sampled_indices, and most importantly, preserve its order.
146
+ num_anchors = infer_prior_stats(
147
+ T,
148
+ num_inputs,
149
+ num_total_frames=num_targets,
150
+ version_dict=version_dict,
151
+ )
152
+
153
+ sampled_indices = np.sort(
154
+ np.array(split_dict["train_ids"] + split_dict["test_ids"])
155
+ ) # we always sort all indices first
156
+
157
+ traj_prior = options.get("traj_prior", None)
158
+ if traj_prior == "spiral":
159
+ assert parser.bounds is not None
160
+ anchor_c2ws = generate_spiral_path(
161
+ c2ws[sampled_indices] @ np.diagflat([1, -1, -1, 1]),
162
+ parser.bounds[sampled_indices],
163
+ n_frames=num_anchors + 1,
164
+ n_rots=2,
165
+ zrate=0.5,
166
+ endpoint=False,
167
+ )[1:] @ np.diagflat([1, -1, -1, 1])
168
+ elif traj_prior == "interpolated":
169
+ assert num_inputs > 1
170
+ anchor_c2ws = generate_interpolated_path(
171
+ c2ws[split_dict["train_ids"], :3],
172
+ round((num_anchors + 1) / (num_inputs - 1)),
173
+ endpoint=False,
174
+ )[1 : num_anchors + 1]
175
+ elif traj_prior == "orbit":
176
+ c2ws_th = torch.as_tensor(c2ws)
177
+ lookat = get_lookat(
178
+ c2ws_th[sampled_indices, :3, 3],
179
+ c2ws_th[sampled_indices, :3, 2],
180
+ )
181
+ anchor_c2ws = torch.linalg.inv(
182
+ get_arc_horizontal_w2cs(
183
+ torch.linalg.inv(c2ws_th[split_dict["train_ids"][0]]),
184
+ lookat,
185
+ -F.normalize(
186
+ c2ws_th[split_dict["train_ids"]][:, :3, 1].mean(0),
187
+ dim=-1,
188
+ ),
189
+ num_frames=num_anchors + 1,
190
+ endpoint=False,
191
+ )
192
+ ).numpy()[1:, :3]
193
+ else:
194
+ anchor_c2ws = None
195
+ # anchor_Ks is default to be the first from target_Ks
196
+
197
+ all_imgs_path = [all_imgs_path[i] for i in sampled_indices]
198
+ c2ws = c2ws[sampled_indices]
199
+ Ks = Ks[sampled_indices]
200
+
201
+ # absolute to relative indices
202
+ input_indices = compute_relative_inds(
203
+ sampled_indices,
204
+ np.array(split_dict["train_ids"]),
205
+ )
206
+ anchor_indices = np.arange(
207
+ sampled_indices.shape[0],
208
+ sampled_indices.shape[0] + num_anchors,
209
+ ).tolist() # the order has no meaning here
210
+
211
+ elif task == "img2vid":
212
+ num_targets = len(all_imgs_path) - num_inputs
213
+ num_anchors = infer_prior_stats(
214
+ T,
215
+ num_inputs,
216
+ num_total_frames=num_targets,
217
+ version_dict=version_dict,
218
+ )
219
+
220
+ input_indices = split_dict["train_ids"]
221
+ anchor_indices = infer_prior_inds(
222
+ c2ws,
223
+ num_prior_frames=num_anchors,
224
+ input_frame_indices=input_indices,
225
+ options=options,
226
+ ).tolist()
227
+ num_anchors = len(anchor_indices)
228
+ anchor_c2ws = c2ws[anchor_indices, :3]
229
+ anchor_Ks = Ks[anchor_indices]
230
+
231
+ elif task == "img2trajvid":
232
+ num_anchors = infer_prior_stats(
233
+ T,
234
+ num_inputs,
235
+ num_total_frames=num_targets,
236
+ version_dict=version_dict,
237
+ )
238
+
239
+ target_c2ws = c2ws[split_dict["test_ids"], :3]
240
+ target_Ks = Ks[split_dict["test_ids"]]
241
+ anchor_c2ws = target_c2ws[
242
+ np.linspace(0, num_targets - 1, num_anchors).round().astype(np.int64)
243
+ ]
244
+ anchor_Ks = target_Ks[
245
+ np.linspace(0, num_targets - 1, num_anchors).round().astype(np.int64)
246
+ ]
247
+
248
+ sampled_indices = split_dict["train_ids"] + split_dict["test_ids"]
249
+ all_imgs_path = [all_imgs_path[i] for i in sampled_indices]
250
+ c2ws = c2ws[sampled_indices]
251
+ Ks = Ks[sampled_indices]
252
+
253
+ input_indices = np.arange(num_inputs).tolist()
254
+ anchor_indices = np.linspace(
255
+ num_inputs, num_inputs + num_targets - 1, num_anchors
256
+ ).tolist()
257
+
258
+ else:
259
+ raise ValueError(f"Unknown task: {task}")
260
+
261
+ return (
262
+ all_imgs_path,
263
+ num_inputs,
264
+ num_targets,
265
+ input_indices,
266
+ anchor_indices,
267
+ torch.tensor(c2ws[:, :3]).float(),
268
+ torch.tensor(Ks).float(),
269
+ (torch.tensor(anchor_c2ws[:, :3]).float() if anchor_c2ws is not None else None),
270
+ (torch.tensor(anchor_Ks).float() if anchor_Ks is not None else None),
271
+ )
272
+
273
+
274
+ def main(
275
+ data_path,
276
+ data_items=None,
277
+ task="img2img",
278
+ save_subdir="",
279
+ H=None,
280
+ W=None,
281
+ T=None,
282
+ use_traj_prior=False,
283
+ **overwrite_options,
284
+ ):
285
+ if H is not None:
286
+ VERSION_DICT["H"] = H
287
+ if W is not None:
288
+ VERSION_DICT["W"] = W
289
+ if T is not None:
290
+ VERSION_DICT["T"] = [int(t) for t in T.split(",")] if isinstance(T, str) else T
291
+
292
+ options = VERSION_DICT["options"]
293
+ options["chunk_strategy"] = "nearest-gt"
294
+ options["video_save_fps"] = 30.0
295
+ options["beta_linear_start"] = 5e-6
296
+ options["log_snr_shift"] = 2.4
297
+ options["guider_types"] = 1
298
+ options["cfg"] = 2.0
299
+ options["camera_scale"] = 2.0
300
+ options["num_steps"] = 50
301
+ options["cfg_min"] = 1.2
302
+ options["encoding_t"] = 1
303
+ options["decoding_t"] = 1
304
+ options["num_inputs"] = None
305
+ options["seed"] = 23
306
+ options.update(overwrite_options)
307
+
308
+ num_inputs = options["num_inputs"]
309
+ seed = options["seed"]
310
+
311
+ if data_items is not None:
312
+ if not isinstance(data_items, (list, tuple)):
313
+ data_items = data_items.split(",")
314
+ scenes = [os.path.join(data_path, item) for item in data_items]
315
+ else:
316
+ scenes = glob.glob(osp.join(data_path, "*"))
317
+
318
+ for scene in tqdm(scenes):
319
+ save_path_scene = os.path.join(
320
+ WORK_DIR, task, save_subdir, os.path.splitext(os.path.basename(scene))[0]
321
+ )
322
+ if options.get("skip_saved", False) and os.path.exists(
323
+ os.path.join(save_path_scene, "transforms.json")
324
+ ):
325
+ print(f"Skipping {scene} as it is already sampled.")
326
+ continue
327
+
328
+ # parse_task -> infer_prior_stats modifies VERSION_DICT["T"] in-place.
329
+ (
330
+ all_imgs_path,
331
+ num_inputs,
332
+ num_targets,
333
+ input_indices,
334
+ anchor_indices,
335
+ c2ws,
336
+ Ks,
337
+ anchor_c2ws,
338
+ anchor_Ks,
339
+ ) = parse_task(
340
+ task,
341
+ scene,
342
+ num_inputs,
343
+ VERSION_DICT["T"],
344
+ VERSION_DICT,
345
+ )
346
+ assert num_inputs is not None
347
+ # Create image conditioning.
348
+ image_cond = {
349
+ "img": all_imgs_path,
350
+ "input_indices": input_indices,
351
+ "prior_indices": anchor_indices,
352
+ }
353
+ # Create camera conditioning.
354
+ camera_cond = {
355
+ "c2w": c2ws.clone(),
356
+ "K": Ks.clone(),
357
+ "input_indices": list(range(num_inputs + num_targets)),
358
+ }
359
+ # run_one_scene -> transform_img_and_K modifies VERSION_DICT["H"] and VERSION_DICT["W"] in-place.
360
+ video_path_generator = run_one_scene(
361
+ task,
362
+ VERSION_DICT, # H, W may be updated in run_one_scene
363
+ model=MODEL,
364
+ ae=AE,
365
+ conditioner=CONDITIONER,
366
+ denoiser=DENOISER,
367
+ image_cond=image_cond,
368
+ camera_cond=camera_cond,
369
+ save_path=save_path_scene,
370
+ use_traj_prior=use_traj_prior,
371
+ traj_prior_Ks=anchor_Ks,
372
+ traj_prior_c2ws=anchor_c2ws,
373
+ seed=seed, # to ensure the sampled video can be reproduced regardless of start and i
374
+ )
375
+ for _ in video_path_generator:
376
+ pass
377
+
378
+ # Convert from OpenCV to OpenGL camera format.
379
+ c2ws = c2ws @ torch.tensor(np.diag([1, -1, -1, 1])).float()
380
+ img_paths = sorted(glob.glob(osp.join(save_path_scene, "samples-rgb", "*.png")))
381
+ if len(img_paths) != len(c2ws):
382
+ input_img_paths = sorted(
383
+ glob.glob(osp.join(save_path_scene, "input", "*.png"))
384
+ )
385
+ assert len(img_paths) == num_targets
386
+ assert len(input_img_paths) == num_inputs
387
+ assert c2ws.shape[0] == num_inputs + num_targets
388
+ target_indices = [i for i in range(c2ws.shape[0]) if i not in input_indices]
389
+ img_paths = [
390
+ input_img_paths[input_indices.index(i)]
391
+ if i in input_indices
392
+ else img_paths[target_indices.index(i)]
393
+ for i in range(c2ws.shape[0])
394
+ ]
395
+ create_transforms_simple(
396
+ save_path=save_path_scene,
397
+ img_paths=img_paths,
398
+ img_whs=np.array([VERSION_DICT["W"], VERSION_DICT["H"]])[None].repeat(
399
+ num_inputs + num_targets, 0
400
+ ),
401
+ c2ws=c2ws,
402
+ Ks=Ks,
403
+ )
404
+
405
+
406
+ if __name__ == "__main__":
407
+ fire.Fire(main)
demo_gr.py ADDED
@@ -0,0 +1,1248 @@
1
+ import copy
2
+ import json
3
+ import os
4
+ import os.path as osp
5
+ import queue
6
+ import secrets
7
+ import threading
8
+ import time
9
+ from datetime import datetime
10
+ from glob import glob
11
+ from pathlib import Path
12
+ from typing import Literal
13
+
14
+ import gradio as gr
15
+ import httpx
16
+ import imageio.v3 as iio
17
+ import numpy as np
18
+ import torch
19
+ import torch.nn.functional as F
20
+ import tyro
21
+ import viser
22
+ import viser.transforms as vt
23
+ from einops import rearrange
24
+ from gradio import networking
25
+ from gradio.context import LocalContext
26
+ from gradio.tunneling import CERTIFICATE_PATH, Tunnel
27
+
28
+ from seva.eval import (
29
+ IS_TORCH_NIGHTLY,
30
+ chunk_input_and_test,
31
+ create_transforms_simple,
32
+ infer_prior_stats,
33
+ run_one_scene,
34
+ transform_img_and_K,
35
+ )
36
+ from seva.geometry import (
37
+ DEFAULT_FOV_RAD,
38
+ get_default_intrinsics,
39
+ get_preset_pose_fov,
40
+ normalize_scene,
41
+ )
42
+ from seva.gui import define_gui
43
+ from seva.model import SGMWrapper
44
+ from seva.modules.autoencoder import AutoEncoder
45
+ from seva.modules.conditioner import CLIPConditioner
46
+ from seva.modules.preprocessor import Dust3rPipeline
47
+ from seva.sampling import DDPMDiscretization, DiscreteDenoiser
48
+ from seva.utils import load_model
49
+
50
+ device = "cpu"
51
+
52
+
53
+ # Constants.
54
+ WORK_DIR = "work_dirs/demo_gr"
55
+ MAX_SESSIONS = 1
56
+ ADVANCE_EXAMPLE_MAP = [
57
+ (
58
+ "assets/advance/blue-car.jpg",
59
+ ["assets/advance/blue-car.jpg"],
60
+ ),
61
+ (
62
+ "assets/advance/garden-4_0.jpg",
63
+ [
64
+ "assets/advance/garden-4_0.jpg",
65
+ "assets/advance/garden-4_1.jpg",
66
+ "assets/advance/garden-4_2.jpg",
67
+ "assets/advance/garden-4_3.jpg",
68
+ ],
69
+ ),
70
+ (
71
+ "assets/advance/vgg-lab-4_0.png",
72
+ [
73
+ "assets/advance/vgg-lab-4_0.png",
74
+ "assets/advance/vgg-lab-4_1.png",
75
+ "assets/advance/vgg-lab-4_2.png",
76
+ "assets/advance/vgg-lab-4_3.png",
77
+ ],
78
+ ),
79
+ (
80
+ "assets/advance/telebooth-2_0.jpg",
81
+ [
82
+ "assets/advance/telebooth-2_0.jpg",
83
+ "assets/advance/telebooth-2_1.jpg",
84
+ ],
85
+ ),
86
+ (
87
+ "assets/advance/backyard-7_0.jpg",
88
+ [
89
+ "assets/advance/backyard-7_0.jpg",
90
+ "assets/advance/backyard-7_1.jpg",
91
+ "assets/advance/backyard-7_2.jpg",
92
+ "assets/advance/backyard-7_3.jpg",
93
+ "assets/advance/backyard-7_4.jpg",
94
+ "assets/advance/backyard-7_5.jpg",
95
+ "assets/advance/backyard-7_6.jpg",
96
+ ],
97
+ ),
98
+ ]
99
+
100
+ if IS_TORCH_NIGHTLY:
101
+ COMPILE = True
102
+ os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "1"
103
+ os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
104
+ else:
105
+ COMPILE = False
106
+
107
+ # Shared global variables across sessions.
108
+ DUST3R = Dust3rPipeline(device=device) # type: ignore
109
+ MODEL = SGMWrapper(load_model(device="cpu", verbose=True).eval()).to(device)
110
+ AE = AutoEncoder(chunk_size=1).to(device)
111
+ CONDITIONER = CLIPConditioner().to(device)
112
+ DISCRETIZATION = DDPMDiscretization()
113
+ DENOISER = DiscreteDenoiser(discretization=DISCRETIZATION, num_idx=1000, device=device)
114
+ VERSION_DICT = {
115
+ "H": 576,
116
+ "W": 576,
117
+ "T": 21,
118
+ "C": 4,
119
+ "f": 8,
120
+ "options": {},
121
+ }
122
+ SERVERS = {}
123
+ ABORT_EVENTS = {}
124
+
125
+ if COMPILE:
126
+ MODEL = torch.compile(MODEL)
127
+ CONDITIONER = torch.compile(CONDITIONER)
128
+ AE = torch.compile(AE)
129
+
130
+
131
+ class SevaRenderer(object):
132
+ def __init__(self, server: viser.ViserServer):
133
+ self.server = server
134
+ self.gui_state = None
135
+
136
+ def preprocess(
137
+ self, input_img_path_or_tuples: list[tuple[str, None]] | str
138
+ ) -> tuple[dict, dict, dict]:
139
+ # Simply hardcode these so that the aspect ratio is always kept and the
140
+ # shorter side is resized to 576. This only reduces the number of GUI
141
+ # options; changing it still works.
142
+ shorter: int = 576
143
+ # Has to be 64 multiple for the network.
144
+ shorter = round(shorter / 64) * 64
145
+
146
+ if isinstance(input_img_path_or_tuples, str):
147
+ # Assume `Basic` demo mode: just hardcode the camera parameters and ignore points.
148
+ input_imgs = torch.as_tensor(
149
+ iio.imread(input_img_path_or_tuples) / 255.0, dtype=torch.float32
150
+ )[None, ..., :3]
151
+ input_imgs = transform_img_and_K(
152
+ input_imgs.permute(0, 3, 1, 2),
153
+ shorter,
154
+ K=None,
155
+ size_stride=64,
156
+ )[0].permute(0, 2, 3, 1)
157
+ input_Ks = get_default_intrinsics(
158
+ aspect_ratio=input_imgs.shape[2] / input_imgs.shape[1]
159
+ )
160
+ input_c2ws = torch.eye(4)[None]
161
+ # Simulate a small time interval such that gradio can update
162
+ # propgress properly.
163
+ time.sleep(0.1)
164
+ return (
165
+ {
166
+ "input_imgs": input_imgs,
167
+ "input_Ks": input_Ks,
168
+ "input_c2ws": input_c2ws,
169
+ "input_wh": (input_imgs.shape[2], input_imgs.shape[1]),
170
+ "points": [np.zeros((0, 3))],
171
+ "point_colors": [np.zeros((0, 3))],
172
+ "scene_scale": 1.0,
173
+ },
174
+ gr.update(visible=False),
175
+ gr.update(),
176
+ )
177
+ else:
178
+ # Assume `Advance` demo mode: use dust3r to extract camera parameters and points.
179
+ img_paths = [p for (p, _) in input_img_path_or_tuples]
180
+ (
181
+ input_imgs,
182
+ input_Ks,
183
+ input_c2ws,
184
+ points,
185
+ point_colors,
186
+ ) = DUST3R.infer_cameras_and_points(img_paths)
187
+ num_inputs = len(img_paths)
188
+ if num_inputs == 1:
189
+ input_imgs, input_Ks, input_c2ws, points, point_colors = (
190
+ input_imgs[:1],
191
+ input_Ks[:1],
192
+ input_c2ws[:1],
193
+ points[:1],
194
+ point_colors[:1],
195
+ )
196
+ input_imgs = [img[..., :3] for img in input_imgs]
197
+ # Normalize the scene.
198
+ point_chunks = [p.shape[0] for p in points]
199
+ point_indices = np.cumsum(point_chunks)[:-1]
200
+ input_c2ws, points, _ = normalize_scene( # type: ignore
201
+ input_c2ws,
202
+ np.concatenate(points, 0),
203
+ camera_center_method="poses",
204
+ )
205
+ points = np.split(points, point_indices, 0)
206
+ # Scale camera and points for viewport visualization.
207
+ scene_scale = np.median(
208
+ np.ptp(np.concatenate([input_c2ws[:, :3, 3], *points], 0), -1)
209
+ )
210
+ input_c2ws[:, :3, 3] /= scene_scale
211
+ points = [point / scene_scale for point in points]
212
+ input_imgs = [
213
+ torch.as_tensor(img / 255.0, dtype=torch.float32) for img in input_imgs
214
+ ]
215
+ input_Ks = torch.as_tensor(input_Ks)
216
+ input_c2ws = torch.as_tensor(input_c2ws)
217
+ new_input_imgs, new_input_Ks = [], []
218
+ for img, K in zip(input_imgs, input_Ks):
219
+ img = rearrange(img, "h w c -> 1 c h w")
220
+ # If you don't want to keep aspect ratio and want to always center crop, use this:
221
+ # img, K = transform_img_and_K(img, (shorter, shorter), K=K[None])
222
+ img, K = transform_img_and_K(img, shorter, K=K[None], size_stride=64)
223
+ assert isinstance(K, torch.Tensor)
224
+ K = K / K.new_tensor([img.shape[-1], img.shape[-2], 1])[:, None]
225
+ new_input_imgs.append(img)
226
+ new_input_Ks.append(K)
227
+ input_imgs = torch.cat(new_input_imgs, 0)
228
+ input_imgs = rearrange(input_imgs, "b c h w -> b h w c")[..., :3]
229
+ input_Ks = torch.cat(new_input_Ks, 0)
230
+ return (
231
+ {
232
+ "input_imgs": input_imgs,
233
+ "input_Ks": input_Ks,
234
+ "input_c2ws": input_c2ws,
235
+ "input_wh": (input_imgs.shape[2], input_imgs.shape[1]),
236
+ "points": points,
237
+ "point_colors": point_colors,
238
+ "scene_scale": scene_scale,
239
+ },
240
+ gr.update(visible=False),
241
+ gr.update()
242
+ if num_inputs <= 10
243
+ else gr.update(choices=["interp"], value="interp"),
244
+ )
245
+
246
+ def visualize_scene(self, preprocessed: dict):
247
+ server = self.server
248
+ server.scene.reset()
249
+ server.gui.reset()
250
+ set_bkgd_color(server)
251
+
252
+ (
253
+ input_imgs,
254
+ input_Ks,
255
+ input_c2ws,
256
+ input_wh,
257
+ points,
258
+ point_colors,
259
+ scene_scale,
260
+ ) = (
261
+ preprocessed["input_imgs"],
262
+ preprocessed["input_Ks"],
263
+ preprocessed["input_c2ws"],
264
+ preprocessed["input_wh"],
265
+ preprocessed["points"],
266
+ preprocessed["point_colors"],
267
+ preprocessed["scene_scale"],
268
+ )
269
+ W, H = input_wh
270
+
271
+ server.scene.set_up_direction(-input_c2ws[..., :3, 1].mean(0).numpy())
272
+
273
+ # Use first image as default fov.
274
+ assert input_imgs[0].shape[:2] == (H, W)
275
+ if H > W:
276
+ init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 0, 0].item()))
277
+ else:
278
+ init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 1, 1].item()))
279
+ init_fov_deg = float(init_fov / np.pi * 180.0)
280
+
281
+ frustum_nodes, pcd_nodes = [], []
282
+ for i in range(len(input_imgs)):
283
+ K = input_Ks[i]
284
+ frustum = server.scene.add_camera_frustum(
285
+ f"/scene_assets/cameras/{i}",
286
+ fov=2 * np.arctan(1 / (2 * K[1, 1].item())),
287
+ aspect=W / H,
288
+ scale=0.1 * scene_scale,
289
+ image=(input_imgs[i].numpy() * 255.0).astype(np.uint8),
290
+ wxyz=vt.SO3.from_matrix(input_c2ws[i, :3, :3].numpy()).wxyz,
291
+ position=input_c2ws[i, :3, 3].numpy(),
292
+ )
293
+
294
+ def get_handler(frustum):
295
+ def handler(event: viser.GuiEvent) -> None:
296
+ assert event.client_id is not None
297
+ client = server.get_clients()[event.client_id]
298
+ with client.atomic():
299
+ client.camera.position = frustum.position
300
+ client.camera.wxyz = frustum.wxyz
301
+ # Set look_at as the projected origin onto the
302
+ # frustum's forward direction.
303
+ look_direction = vt.SO3(frustum.wxyz).as_matrix()[:, 2]
304
+ position_origin = -frustum.position
305
+ client.camera.look_at = (
306
+ frustum.position
307
+ + np.dot(look_direction, position_origin)
308
+ / np.linalg.norm(position_origin)
309
+ * look_direction
310
+ )
311
+
312
+ return handler
313
+
314
+ frustum.on_click(get_handler(frustum)) # type: ignore
315
+ frustum_nodes.append(frustum)
316
+
317
+ pcd = server.scene.add_point_cloud(
318
+ f"/scene_assets/points/{i}",
319
+ points[i],
320
+ point_colors[i],
321
+ point_size=0.01 * scene_scale,
322
+ point_shape="circle",
323
+ )
324
+ pcd_nodes.append(pcd)
325
+
326
+ with server.gui.add_folder("Scene scale", expand_by_default=False, order=200):
327
+ camera_scale_slider = server.gui.add_slider(
328
+ "Log camera scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1
329
+ )
330
+
331
+ @camera_scale_slider.on_update
332
+ def _(_) -> None:
333
+ for i in range(len(frustum_nodes)):
334
+ frustum_nodes[i].scale = (
335
+ 0.1 * scene_scale * 10**camera_scale_slider.value
336
+ )
337
+
338
+ point_scale_slider = server.gui.add_slider(
339
+ "Log point scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1
340
+ )
341
+
342
+ @point_scale_slider.on_update
343
+ def _(_) -> None:
344
+ for i in range(len(pcd_nodes)):
345
+ pcd_nodes[i].point_size = (
346
+ 0.01 * scene_scale * 10**point_scale_slider.value
347
+ )
348
+
349
+ self.gui_state = define_gui(
350
+ server,
351
+ init_fov=init_fov_deg,
352
+ img_wh=input_wh,
353
+ scene_scale=scene_scale,
354
+ )
355
+
356
+ def get_target_c2ws_and_Ks_from_gui(self, preprocessed: dict):
357
+ input_wh = preprocessed["input_wh"]
358
+ W, H = input_wh
359
+ gui_state = self.gui_state
360
+ assert gui_state is not None and gui_state.camera_traj_list is not None
361
+ target_c2ws, target_Ks = [], []
362
+ for item in gui_state.camera_traj_list:
363
+ target_c2ws.append(item["w2c"])
364
+ assert item["img_wh"] == input_wh
365
+ K = np.array(item["K"]).reshape(3, 3) / np.array([W, H, 1])[:, None]
366
+ target_Ks.append(K)
367
+ target_c2ws = torch.as_tensor(
368
+ np.linalg.inv(np.array(target_c2ws).reshape(-1, 4, 4))
369
+ )
370
+ target_Ks = torch.as_tensor(np.array(target_Ks).reshape(-1, 3, 3))
371
+ return target_c2ws, target_Ks
372
+
373
+ def get_target_c2ws_and_Ks_from_preset(
374
+ self,
375
+ preprocessed: dict,
376
+ preset_traj: Literal[
377
+ "orbit",
378
+ "spiral",
379
+ "lemniscate",
380
+ "zoom-in",
381
+ "zoom-out",
382
+ "dolly zoom-in",
383
+ "dolly zoom-out",
384
+ "move-forward",
385
+ "move-backward",
386
+ "move-up",
387
+ "move-down",
388
+ "move-left",
389
+ "move-right",
390
+ ],
391
+ num_frames: int,
392
+ zoom_factor: float | None,
393
+ ):
394
+ img_wh = preprocessed["input_wh"]
395
+ start_c2w = preprocessed["input_c2ws"][0]
396
+ start_w2c = torch.linalg.inv(start_c2w)
397
+ look_at = torch.tensor([0, 0, 10])
398
+ start_fov = DEFAULT_FOV_RAD
399
+ target_c2ws, target_fovs = get_preset_pose_fov(
400
+ preset_traj,
401
+ num_frames,
402
+ start_w2c,
403
+ look_at,
404
+ -start_c2w[:3, 1],
405
+ start_fov,
406
+ spiral_radii=[1.0, 1.0, 0.5],
407
+ zoom_factor=zoom_factor,
408
+ )
409
+ target_c2ws = torch.as_tensor(target_c2ws)
410
+ target_fovs = torch.as_tensor(target_fovs)
411
+ target_Ks = get_default_intrinsics(
412
+ target_fovs, # type: ignore
413
+ aspect_ratio=img_wh[0] / img_wh[1],
414
+ )
415
+ return target_c2ws, target_Ks
416
+
417
+ def export_output_data(self, preprocessed: dict, output_dir: str):
418
+ input_imgs, input_Ks, input_c2ws, input_wh = (
419
+ preprocessed["input_imgs"],
420
+ preprocessed["input_Ks"],
421
+ preprocessed["input_c2ws"],
422
+ preprocessed["input_wh"],
423
+ )
424
+ target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed)
425
+
426
+ num_inputs = len(input_imgs)
427
+ num_targets = len(target_c2ws)
428
+
429
+ input_imgs = (input_imgs.cpu().numpy() * 255.0).astype(np.uint8)
430
+ input_c2ws = input_c2ws.cpu().numpy()
431
+ input_Ks = input_Ks.cpu().numpy()
432
+ target_c2ws = target_c2ws.cpu().numpy()
433
+ target_Ks = target_Ks.cpu().numpy()
434
+ img_whs = np.array(input_wh)[None].repeat(len(input_imgs) + len(target_Ks), 0)
435
+
436
+ os.makedirs(output_dir, exist_ok=True)
437
+ img_paths = []
438
+ for i, img in enumerate(input_imgs):
439
+ iio.imwrite(img_path := osp.join(output_dir, f"{i:03d}.png"), img)
440
+ img_paths.append(img_path)
441
+ for i in range(num_targets):
442
+ iio.imwrite(
443
+ img_path := osp.join(output_dir, f"{i + num_inputs:03d}.png"),
444
+ np.zeros((input_wh[1], input_wh[0], 3), dtype=np.uint8),
445
+ )
446
+ img_paths.append(img_path)
447
+
448
+ # Convert from OpenCV to OpenGL camera format.
449
+ all_c2ws = np.concatenate([input_c2ws, target_c2ws])
450
+ all_Ks = np.concatenate([input_Ks, target_Ks])
451
+ all_c2ws = all_c2ws @ np.diag([1, -1, -1, 1])
452
+ create_transforms_simple(output_dir, img_paths, img_whs, all_c2ws, all_Ks)
453
+ split_dict = {
454
+ "train_ids": list(range(num_inputs)),
455
+ "test_ids": list(range(num_inputs, num_inputs + num_targets)),
456
+ }
457
+ with open(
458
+ osp.join(output_dir, f"train_test_split_{num_inputs}.json"), "w"
459
+ ) as f:
460
+ json.dump(split_dict, f, indent=4)
461
+ gr.Info(f"Output data saved to {output_dir}", duration=1)
462
+
463
+ def render(
464
+ self,
465
+ preprocessed: dict,
466
+ session_hash: str,
467
+ seed: int,
468
+ chunk_strategy: str,
469
+ cfg: float,
470
+ preset_traj: Literal[
471
+ "orbit",
472
+ "spiral",
473
+ "lemniscate",
474
+ "zoom-in",
475
+ "zoom-out",
476
+ "dolly zoom-in",
477
+ "dolly zoom-out",
478
+ "move-forward",
479
+ "move-backward",
480
+ "move-up",
481
+ "move-down",
482
+ "move-left",
483
+ "move-right",
484
+ ]
485
+ | None,
486
+ num_frames: int | None,
487
+ zoom_factor: float | None,
488
+ camera_scale: float,
489
+ ):
490
+ render_name = datetime.now().strftime("%Y%m%d_%H%M%S")
491
+ render_dir = osp.join(WORK_DIR, render_name)
492
+
493
+ input_imgs, input_Ks, input_c2ws, (W, H) = (
494
+ preprocessed["input_imgs"],
495
+ preprocessed["input_Ks"],
496
+ preprocessed["input_c2ws"],
497
+ preprocessed["input_wh"],
498
+ )
499
+ num_inputs = len(input_imgs)
500
+ if preset_traj is None:
501
+ target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed)
502
+ else:
503
+ assert num_frames is not None
504
+ assert num_inputs == 1
505
+ input_c2ws = torch.eye(4)[None].to(dtype=input_c2ws.dtype)
506
+ target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_preset(
507
+ preprocessed, preset_traj, num_frames, zoom_factor
508
+ )
509
+ all_c2ws = torch.cat([input_c2ws, target_c2ws], 0)
510
+ all_Ks = (
511
+ torch.cat([input_Ks, target_Ks], 0)
512
+ * input_Ks.new_tensor([W, H, 1])[:, None]
513
+ )
514
+ num_targets = len(target_c2ws)
515
+ input_indices = list(range(num_inputs))
516
+ target_indices = np.arange(num_inputs, num_inputs + num_targets).tolist()
517
+ # Get anchor cameras.
518
+ T = VERSION_DICT["T"]
519
+ version_dict = copy.deepcopy(VERSION_DICT)
520
+ num_anchors = infer_prior_stats(
521
+ T,
522
+ num_inputs,
523
+ num_total_frames=num_targets,
524
+ version_dict=version_dict,
525
+ )
526
+ # infer_prior_stats modifies T in-place.
527
+ T = version_dict["T"]
528
+ assert isinstance(num_anchors, int)
529
+ anchor_indices = np.linspace(
530
+ num_inputs,
531
+ num_inputs + num_targets - 1,
532
+ num_anchors,
533
+ ).tolist()
534
+ anchor_c2ws = all_c2ws[[round(ind) for ind in anchor_indices]]
535
+ anchor_Ks = all_Ks[[round(ind) for ind in anchor_indices]]
536
+ # Create image conditioning.
537
+ all_imgs_np = (
538
+ F.pad(input_imgs, (0, 0, 0, 0, 0, 0, 0, num_targets), value=0.0).numpy()
539
+ * 255.0
540
+ ).astype(np.uint8)
541
+ image_cond = {
542
+ "img": all_imgs_np,
543
+ "input_indices": input_indices,
544
+ "prior_indices": anchor_indices,
545
+ }
546
+ # Create camera conditioning (K is unnormalized).
547
+ camera_cond = {
548
+ "c2w": all_c2ws,
549
+ "K": all_Ks,
550
+ "input_indices": list(range(num_inputs + num_targets)),
551
+ }
552
+ # Run rendering.
553
+ num_steps = 50
554
+ options_ori = VERSION_DICT["options"]
555
+ options = copy.deepcopy(options_ori)
556
+ options["chunk_strategy"] = chunk_strategy
557
+ options["video_save_fps"] = 30.0
558
+ options["beta_linear_start"] = 5e-6
559
+ options["log_snr_shift"] = 2.4
560
+ options["guider_types"] = [1, 2]
561
+ options["cfg"] = [
562
+ float(cfg),
563
+ 3.0 if num_inputs >= 9 else 2.0,
564
+ ] # We define semi-dense-view regime to have 9 input views.
565
+ options["camera_scale"] = camera_scale
566
+ options["num_steps"] = num_steps
567
+ options["cfg_min"] = 1.2
568
+ options["encoding_t"] = 1
569
+ options["decoding_t"] = 1
570
+ assert session_hash in ABORT_EVENTS
571
+ abort_event = ABORT_EVENTS[session_hash]
572
+ abort_event.clear()
573
+ options["abort_event"] = abort_event
574
+ task = "img2trajvid"
575
+ # Get number of first pass chunks.
576
+ T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
577
+ chunk_strategy_first_pass = options.get(
578
+ "chunk_strategy_first_pass", "gt-nearest"
579
+ )
580
+ num_chunks_0 = len(
581
+ chunk_input_and_test(
582
+ T_first_pass,
583
+ input_c2ws,
584
+ anchor_c2ws,
585
+ input_indices,
586
+ image_cond["prior_indices"],
587
+ options={**options, "sampler_verbose": False},
588
+ task=task,
589
+ chunk_strategy=chunk_strategy_first_pass,
590
+ gt_input_inds=list(range(input_c2ws.shape[0])),
591
+ )[1]
592
+ )
593
+ # Get number of second pass chunks.
594
+ anchor_argsort = np.argsort(input_indices + anchor_indices).tolist()
595
+ anchor_indices = np.array(input_indices + anchor_indices)[
596
+ anchor_argsort
597
+ ].tolist()
598
+ gt_input_inds = [anchor_argsort.index(i) for i in range(input_c2ws.shape[0])]
599
+ anchor_c2ws_second_pass = torch.cat([input_c2ws, anchor_c2ws], dim=0)[
600
+ anchor_argsort
601
+ ]
602
+ T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
603
+ chunk_strategy = options.get("chunk_strategy", "nearest")
604
+ num_chunks_1 = len(
605
+ chunk_input_and_test(
606
+ T_second_pass,
607
+ anchor_c2ws_second_pass,
608
+ target_c2ws,
609
+ anchor_indices,
610
+ target_indices,
611
+ options={**options, "sampler_verbose": False},
612
+ task=task,
613
+ chunk_strategy=chunk_strategy,
614
+ gt_input_inds=gt_input_inds,
615
+ )[1]
616
+ )
617
+ second_pass_pbar = gr.Progress().tqdm(
618
+ iterable=None,
619
+ desc="Second pass sampling",
620
+ total=num_chunks_1 * num_steps,
621
+ )
622
+ first_pass_pbar = gr.Progress().tqdm(
623
+ iterable=None,
624
+ desc="First pass sampling",
625
+ total=num_chunks_0 * num_steps,
626
+ )
627
+ video_path_generator = run_one_scene(
628
+ task=task,
629
+ version_dict={
630
+ "H": H,
631
+ "W": W,
632
+ "T": T,
633
+ "C": VERSION_DICT["C"],
634
+ "f": VERSION_DICT["f"],
635
+ "options": options,
636
+ },
637
+ model=MODEL,
638
+ ae=AE,
639
+ conditioner=CONDITIONER,
640
+ denoiser=DENOISER,
641
+ image_cond=image_cond,
642
+ camera_cond=camera_cond,
643
+ save_path=render_dir,
644
+ use_traj_prior=True,
645
+ traj_prior_c2ws=anchor_c2ws,
646
+ traj_prior_Ks=anchor_Ks,
647
+ seed=seed,
648
+ gradio=True,
649
+ first_pass_pbar=first_pass_pbar,
650
+ second_pass_pbar=second_pass_pbar,
651
+ abort_event=abort_event,
652
+ )
653
+ output_queue = queue.Queue()
654
+
655
+ blocks = LocalContext.blocks.get()
656
+ event_id = LocalContext.event_id.get()
657
+
658
+ def worker():
659
+ # gradio intentionally doesn't support progress updates from worker threads, so
660
+ # we need to work around it by propagating the local context.
661
+ LocalContext.blocks.set(blocks)
662
+ LocalContext.event_id.set(event_id)
663
+ for i, video_path in enumerate(video_path_generator):
664
+ if i == 0:
665
+ output_queue.put(
666
+ (
667
+ video_path,
668
+ gr.update(),
669
+ gr.update(),
670
+ gr.update(),
671
+ )
672
+ )
673
+ elif i == 1:
674
+ output_queue.put(
675
+ (
676
+ video_path,
677
+ gr.update(visible=True),
678
+ gr.update(visible=False),
679
+ gr.update(visible=False),
680
+ )
681
+ )
682
+ else:
683
+ gr.Error("More than two passes during rendering.")
684
+
685
+ thread = threading.Thread(target=worker, daemon=True)
686
+ thread.start()
687
+
688
+ while thread.is_alive() or not output_queue.empty():
689
+ if abort_event.is_set():
690
+ thread.join()
691
+ abort_event.clear()
692
+ yield (
693
+ gr.update(),
694
+ gr.update(visible=True),
695
+ gr.update(visible=False),
696
+ gr.update(visible=False),
697
+ )
698
+ time.sleep(0.1)
699
+ while not output_queue.empty():
700
+ yield output_queue.get()
701
+
702
+
703
+ # This is basically a copy of the original `networking.setup_tunnel` function,
704
+ # but it also returns the tunnel object for proper cleanup.
705
+ def setup_tunnel(
706
+ local_host: str, local_port: int, share_token: str, share_server_address: str | None
707
+ ) -> tuple[str, Tunnel]:
708
+ share_server_address = (
709
+ networking.GRADIO_SHARE_SERVER_ADDRESS
710
+ if share_server_address is None
711
+ else share_server_address
712
+ )
713
+ if share_server_address is None:
714
+ try:
715
+ response = httpx.get(networking.GRADIO_API_SERVER, timeout=30)
716
+ payload = response.json()[0]
717
+ remote_host, remote_port = payload["host"], int(payload["port"])
718
+ certificate = payload["root_ca"]
719
+ Path(CERTIFICATE_PATH).parent.mkdir(parents=True, exist_ok=True)
720
+ with open(CERTIFICATE_PATH, "w") as f:
721
+ f.write(certificate)
722
+ except Exception as e:
723
+ raise RuntimeError(
724
+ "Could not get share link from Gradio API Server."
725
+ ) from e
726
+ else:
727
+ remote_host, remote_port = share_server_address.split(":")
728
+ remote_port = int(remote_port)
729
+ tunnel = Tunnel(remote_host, remote_port, local_host, local_port, share_token)
730
+ address = tunnel.start_tunnel()
731
+ return address, tunnel
732
+
733
+
734
+ def set_bkgd_color(server: viser.ViserServer | viser.ClientHandle):
735
+ server.scene.set_background_image(np.array([[[39, 39, 42]]], dtype=np.uint8))
736
+
737
+
738
+ def start_server_and_abort_event(request: gr.Request):
739
+ server = viser.ViserServer()
740
+
741
+ @server.on_client_connect
742
+ def _(client: viser.ClientHandle):
743
+ # Force dark mode that blends well with gradio's dark theme.
744
+ client.gui.configure_theme(
745
+ dark_mode=True,
746
+ show_share_button=False,
747
+ control_layout="collapsible",
748
+ )
749
+ set_bkgd_color(client)
750
+
751
+ print(f"Starting server {server.get_port()}")
752
+ server_url, tunnel = setup_tunnel(
753
+ local_host=server.get_host(),
754
+ local_port=server.get_port(),
755
+ share_token=secrets.token_urlsafe(32),
756
+ share_server_address=None,
757
+ )
758
+ SERVERS[request.session_hash] = (server, tunnel)
759
+ if server_url is None:
760
+ raise gr.Error(
761
+ "Failed to get a viewport URL. Please check your network connection."
762
+ )
763
+ # Give it enough time to start.
764
+ time.sleep(1)
765
+
766
+ ABORT_EVENTS[request.session_hash] = threading.Event()
767
+
768
+ return (
769
+ SevaRenderer(server),
770
+ gr.HTML(
771
+ f'<iframe src="{server_url}" style="display: block; margin: auto; width: 100%; height: min(60vh, 600px);" frameborder="0"></iframe>',
772
+ container=True,
773
+ ),
774
+ request.session_hash,
775
+ )
776
+
777
+
778
+ def stop_server_and_abort_event(request: gr.Request):
779
+ if request.session_hash in SERVERS:
780
+ print(f"Stopping server {request.session_hash}")
781
+ server, tunnel = SERVERS.pop(request.session_hash)
782
+ server.stop()
783
+ tunnel.kill()
784
+
785
+ if request.session_hash in ABORT_EVENTS:
786
+ print(f"Setting abort event {request.session_hash}")
787
+ ABORT_EVENTS[request.session_hash].set()
788
+ # Give it enough time to abort jobs.
789
+ time.sleep(5)
790
+ ABORT_EVENTS.pop(request.session_hash)
791
+
792
+
793
+ def set_abort_event(request: gr.Request):
794
+ if request.session_hash in ABORT_EVENTS:
795
+ print(f"Setting abort event {request.session_hash}")
796
+ ABORT_EVENTS[request.session_hash].set()
797
+
798
+
799
+ def get_advance_examples(selection: gr.SelectData):
800
+ index = selection.index
801
+ return (
802
+ gr.Gallery(ADVANCE_EXAMPLE_MAP[index][1], visible=True),
803
+ gr.update(visible=True),
804
+ gr.update(visible=True),
805
+ gr.Gallery(visible=False),
806
+ )
807
+
808
+
809
+ def get_preamble():
810
+ gr.Markdown("""
811
+ # Stable Virtual Camera
812
+ <span style="display: flex; flex-wrap: wrap; gap: 5px;">
813
+ <a href="https://stable-virtual-camera.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a>
814
+ <a href="http://arxiv.org/abs/2503.14489"><img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv-2503.14489-B31B1B.svg"></a>
815
+ <a href="https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control"><img src="https://img.shields.io/badge/%F0%9F%93%83%20Blog-Stability%20AI-orange.svg"></a>
816
+ <a href="https://huggingface.co/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a>
817
+ <a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a>
818
+ <a href="https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ"><img src="https://img.shields.io/badge/%F0%9F%8E%AC%20Video-YouTube-orange"></a>
819
+ </span>
820
+
821
+ Welcome to the demo of <strong>Stable Virtual Camera (Seva)</strong>! Given any number of input views and their cameras, this demo will allow you to generate novel views of a scene at any target camera of interest.
822
+
823
+ We provide two ways to use our demo (selected by the tab below, documented [here](https://github.com/Stability-AI/stable-virtual-camera/blob/main/docs/GR_USAGE.md)):
824
+ 1. **[Basic](https://github.com/user-attachments/assets/4d965fa6-d8eb-452c-b773-6e09c88ca705)**: Given a single image, you can generate a video following one of our preset camera trajectories.
825
+ 2. **[Advanced](https://github.com/user-attachments/assets/dcec1be0-bd10-441e-879c-d1c2b63091ba)**: Given any number of input images, you can generate a video following any camera trajectory of your choice by our key-frame-based interface.
826
+
827
+ > This is a research preview and comes with a few [limitations](https://stable-virtual-camera.github.io/#limitations):
828
+ > - Limited quality in certain subjects due to training data, including humans, animals, and dynamic textures.
829
+ > - Limited quality in some highly ambiguous scenes and camera trajectories, including extreme views and collision into objects.
830
+ """)
831
+
832
+
833
+ # Make sure that gradio uses dark theme.
834
+ _APP_JS = """
835
+ function refresh() {
836
+ const url = new URL(window.location);
837
+ if (url.searchParams.get('__theme') !== 'dark') {
838
+ url.searchParams.set('__theme', 'dark');
839
+ }
840
+ }
841
+ """
842
+
843
+
844
+ def main(server_port: int | None = None, share: bool = True):
845
+ with gr.Blocks(js=_APP_JS) as app:
846
+ renderer = gr.State()
847
+ session_hash = gr.State()
848
+ _ = get_preamble()
849
+ with gr.Tabs():
850
+ with gr.Tab("Basic"):
851
+ render_btn = gr.Button("Render video", interactive=False, render=False)
852
+ with gr.Row():
853
+ with gr.Column():
854
+ with gr.Group():
855
+ # Initially disable the Preprocess Images button until an image is selected.
856
+ preprocess_btn = gr.Button("Preprocess images", interactive=False)
857
+ preprocess_progress = gr.Textbox(
858
+ label="",
859
+ visible=False,
860
+ interactive=False,
861
+ )
862
+ with gr.Group():
863
+ input_imgs = gr.Image(
864
+ type="filepath",
865
+ label="Input",
866
+ height=200,
867
+ )
868
+ _ = gr.Examples(
869
+ examples=sorted(glob("assets/basic/*")),
870
+ inputs=[input_imgs],
871
+ label="Example",
872
+ )
873
+ chunk_strategy = gr.Dropdown(
874
+ ["interp", "interp-gt"],
875
+ label="Chunk strategy",
876
+ render=False,
877
+ )
878
+ preprocessed = gr.State()
879
+ # Enable the Preprocess Images button only if an image is selected.
880
+ input_imgs.change(
881
+ lambda img: gr.update(interactive=bool(img)),
882
+ inputs=input_imgs,
883
+ outputs=preprocess_btn,
884
+ )
885
+ preprocess_btn.click(
886
+ lambda r, *args: [
887
+ *r.preprocess(*args),
888
+ gr.update(interactive=True),
889
+ ],
890
+ inputs=[renderer, input_imgs],
891
+ outputs=[
892
+ preprocessed,
893
+ preprocess_progress,
894
+ chunk_strategy,
895
+ render_btn,
896
+ ],
897
+ show_progress_on=[preprocess_progress],
898
+ concurrency_limit=1,
899
+ concurrency_id="gpu_queue",
900
+ )
901
+ preprocess_btn.click(
902
+ lambda: gr.update(visible=True),
903
+ outputs=[preprocess_progress],
904
+ )
905
+ with gr.Row():
906
+ preset_traj = gr.Dropdown(
907
+ choices=[
908
+ "orbit",
909
+ "spiral",
910
+ "lemniscate",
911
+ "zoom-in",
912
+ "zoom-out",
913
+ "dolly zoom-in",
914
+ "dolly zoom-out",
915
+ "move-forward",
916
+ "move-backward",
917
+ "move-up",
918
+ "move-down",
919
+ "move-left",
920
+ "move-right",
921
+ ],
922
+ label="Preset trajectory",
923
+ value="orbit",
924
+ )
925
+ num_frames = gr.Slider(30, 150, 80, label="#Frames")
926
+ zoom_factor = gr.Slider(
927
+ step=0.01, label="Zoom factor", visible=False
928
+ )
929
+ with gr.Row():
930
+ seed = gr.Number(value=23, label="Random seed")
931
+ chunk_strategy.render()
932
+ cfg = gr.Slider(1.0, 7.0, value=4.0, label="CFG value")
933
+ with gr.Row():
934
+ camera_scale = gr.Slider(
935
+ 0.1,
936
+ 15.0,
937
+ value=2.0,
938
+ label="Camera scale",
939
+ )
940
+
941
+ def default_cfg_preset_traj(traj):
942
+ # These are just some hand-tuned values that we
943
+ # found work the best.
944
+ if traj in ["zoom-out", "move-down"]:
945
+ value = 5.0
946
+ elif traj in [
947
+ "orbit",
948
+ "dolly zoom-out",
949
+ "move-backward",
950
+ "move-up",
951
+ "move-left",
952
+ "move-right",
953
+ ]:
954
+ value = 4.0
955
+ else:
956
+ value = 3.0
957
+ return value
958
+
959
+ preset_traj.change(
960
+ default_cfg_preset_traj,
961
+ inputs=[preset_traj],
962
+ outputs=[cfg],
963
+ )
964
+ preset_traj.change(
965
+ lambda traj: gr.update(
966
+ value=(
967
+ 10.0 if "dolly" in traj or "pan" in traj else 2.0
968
+ )
969
+ ),
970
+ inputs=[preset_traj],
971
+ outputs=[camera_scale],
972
+ )
973
+
974
+ def zoom_factor_preset_traj(traj):
975
+ visible = traj in [
976
+ "zoom-in",
977
+ "zoom-out",
978
+ "dolly zoom-in",
979
+ "dolly zoom-out",
980
+ ]
981
+ is_zoomin = traj.endswith("zoom-in")
982
+ if is_zoomin:
983
+ minimum = 0.1
984
+ maximum = 0.5
985
+ value = 0.28
986
+ else:
987
+ minimum = 1.2
988
+ maximum = 3
989
+ value = 1.5
990
+ return gr.update(
991
+ visible=visible,
992
+ minimum=minimum,
993
+ maximum=maximum,
994
+ value=value,
995
+ )
996
+
997
+ preset_traj.change(
998
+ zoom_factor_preset_traj,
999
+ inputs=[preset_traj],
1000
+ outputs=[zoom_factor],
1001
+ )
1002
+ with gr.Column():
1003
+ with gr.Group():
1004
+ abort_btn = gr.Button("Abort rendering", visible=False)
1005
+ render_btn.render()
1006
+ render_progress = gr.Textbox(
1007
+ label="", visible=False, interactive=False
1008
+ )
1009
+ output_video = gr.Video(
1010
+ label="Output", interactive=False, autoplay=True, loop=True
1011
+ )
1012
+ render_btn.click(
1013
+ lambda r, *args: (yield from r.render(*args)),
1014
+ inputs=[
1015
+ renderer,
1016
+ preprocessed,
1017
+ session_hash,
1018
+ seed,
1019
+ chunk_strategy,
1020
+ cfg,
1021
+ preset_traj,
1022
+ num_frames,
1023
+ zoom_factor,
1024
+ camera_scale,
1025
+ ],
1026
+ outputs=[
1027
+ output_video,
1028
+ render_btn,
1029
+ abort_btn,
1030
+ render_progress,
1031
+ ],
1032
+ show_progress_on=[render_progress],
1033
+ concurrency_id="gpu_queue",
1034
+ )
1035
+ render_btn.click(
1036
+ lambda: [
1037
+ gr.update(visible=False),
1038
+ gr.update(visible=True),
1039
+ gr.update(visible=True),
1040
+ ],
1041
+ outputs=[render_btn, abort_btn, render_progress],
1042
+ )
1043
+ abort_btn.click(set_abort_event)
1044
+ with gr.Tab("Advanced"):
1045
+ render_btn = gr.Button("Render video", interactive=False, render=False)
1046
+ viewport = gr.HTML(container=True, render=False)
1047
+ gr.Timer(0.1).tick(
1048
+ lambda renderer: gr.update(
1049
+ interactive=renderer is not None
1050
+ and renderer.gui_state is not None
1051
+ and renderer.gui_state.camera_traj_list is not None
1052
+ ),
1053
+ inputs=[renderer],
1054
+ outputs=[render_btn],
1055
+ )
1056
+ with gr.Row():
1057
+ viewport.render()
1058
+ with gr.Row():
1059
+ with gr.Column():
1060
+ with gr.Group():
1061
+ # Initially disable the Preprocess Images button until images are selected.
1062
+ preprocess_btn = gr.Button("Preprocess images", interactive=False)
1063
+ preprocess_progress = gr.Textbox(
1064
+ label="",
1065
+ visible=False,
1066
+ interactive=False,
1067
+ )
1068
+ with gr.Group():
1069
+ input_imgs = gr.Gallery(
1070
+ interactive=True,
1071
+ label="Input",
1072
+ columns=4,
1073
+ height=200,
1074
+ )
1075
+ # Define example images (gradio doesn't support variable length
1076
+ # examples so we need to hack it).
1077
+ example_imgs = gr.Gallery(
1078
+ [e[0] for e in ADVANCE_EXAMPLE_MAP],
1079
+ allow_preview=False,
1080
+ preview=False,
1081
+ label="Example",
1082
+ columns=20,
1083
+ rows=1,
1084
+ height=115,
1085
+ )
1086
+ example_imgs_expander = gr.Gallery(
1087
+ visible=False,
1088
+ interactive=False,
1089
+ label="Example",
1090
+ preview=True,
1091
+ columns=20,
1092
+ rows=1,
1093
+ )
1094
+ chunk_strategy = gr.Dropdown(
1095
+ ["interp-gt", "interp"],
1096
+ label="Chunk strategy",
1097
+ value="interp-gt",
1098
+ render=False,
1099
+ )
1100
+ with gr.Row():
1101
+ example_imgs_backer = gr.Button(
1102
+ "Go back", visible=False
1103
+ )
1104
+ example_imgs_confirmer = gr.Button(
1105
+ "Confirm", visible=False
1106
+ )
1107
+ example_imgs.select(
1108
+ get_advance_examples,
1109
+ outputs=[
1110
+ example_imgs_expander,
1111
+ example_imgs_confirmer,
1112
+ example_imgs_backer,
1113
+ example_imgs,
1114
+ ],
1115
+ )
1116
+ example_imgs_confirmer.click(
1117
+ lambda x: (
1118
+ x,
1119
+ gr.update(visible=False),
1120
+ gr.update(visible=False),
1121
+ gr.update(visible=False),
1122
+ gr.update(visible=True),
1123
+ gr.update(interactive=bool(x))
1124
+ ),
1125
+ inputs=[example_imgs_expander],
1126
+ outputs=[
1127
+ input_imgs,
1128
+ example_imgs_expander,
1129
+ example_imgs_confirmer,
1130
+ example_imgs_backer,
1131
+ example_imgs,
1132
+ preprocess_btn
1133
+ ],
1134
+ )
1135
+ example_imgs_backer.click(
1136
+ lambda: (
1137
+ gr.update(visible=False),
1138
+ gr.update(visible=False),
1139
+ gr.update(visible=False),
1140
+ gr.update(visible=True),
1141
+ ),
1142
+ outputs=[
1143
+ example_imgs_expander,
1144
+ example_imgs_confirmer,
1145
+ example_imgs_backer,
1146
+ example_imgs,
1147
+ ],
1148
+ )
1149
+ preprocessed = gr.State()
1150
+ preprocess_btn.click(
1151
+ lambda r, *args: r.preprocess(*args),
1152
+ inputs=[renderer, input_imgs],
1153
+ outputs=[
1154
+ preprocessed,
1155
+ preprocess_progress,
1156
+ chunk_strategy,
1157
+ ],
1158
+ show_progress_on=[preprocess_progress],
1159
+ concurrency_id="gpu_queue",
1160
+ )
1161
+ preprocess_btn.click(
1162
+ lambda: gr.update(visible=True),
1163
+ outputs=[preprocess_progress],
1164
+ )
1165
+ preprocessed.change(
1166
+ lambda r, *args: r.visualize_scene(*args),
1167
+ inputs=[renderer, preprocessed],
1168
+ )
1169
+ with gr.Row():
1170
+ seed = gr.Number(value=23, label="Random seed")
1171
+ chunk_strategy.render()
1172
+ cfg = gr.Slider(1.0, 7.0, value=3.0, label="CFG value")
1173
+ with gr.Row():
1174
+ camera_scale = gr.Slider(
1175
+ 0.1,
1176
+ 15.0,
1177
+ value=2.0,
1178
+ label="Camera scale (useful for single-view input)",
1179
+ )
1180
+ with gr.Group():
1181
+ output_data_dir = gr.Textbox(label="Output data directory")
1182
+ output_data_btn = gr.Button("Export output data")
1183
+ output_data_btn.click(
1184
+ lambda r, *args: r.export_output_data(*args),
1185
+ inputs=[renderer, preprocessed, output_data_dir],
1186
+ )
1187
+ with gr.Column():
1188
+ with gr.Group():
1189
+ abort_btn = gr.Button("Abort rendering", visible=False)
1190
+ render_btn.render()
1191
+ render_progress = gr.Textbox(
1192
+ label="", visible=False, interactive=False
1193
+ )
1194
+ output_video = gr.Video(
1195
+ label="Output", interactive=False, autoplay=True, loop=True
1196
+ )
1197
+ render_btn.click(
1198
+ lambda r, *args: (yield from r.render(*args)),
1199
+ inputs=[
1200
+ renderer,
1201
+ preprocessed,
1202
+ session_hash,
1203
+ seed,
1204
+ chunk_strategy,
1205
+ cfg,
1206
+ gr.State(),
1207
+ gr.State(),
1208
+ gr.State(),
1209
+ camera_scale,
1210
+ ],
1211
+ outputs=[
1212
+ output_video,
1213
+ render_btn,
1214
+ abort_btn,
1215
+ render_progress,
1216
+ ],
1217
+ show_progress_on=[render_progress],
1218
+ concurrency_id="gpu_queue",
1219
+ )
1220
+ render_btn.click(
1221
+ lambda: [
1222
+ gr.update(visible=False),
1223
+ gr.update(visible=True),
1224
+ gr.update(visible=True),
1225
+ ],
1226
+ outputs=[render_btn, abort_btn, render_progress],
1227
+ )
1228
+ abort_btn.click(set_abort_event)
1229
+
1230
+ # Register the session initialization and cleanup functions.
1231
+ app.load(
1232
+ start_server_and_abort_event,
1233
+ outputs=[renderer, viewport, session_hash],
1234
+ )
1235
+ app.unload(stop_server_and_abort_event)
1236
+
1237
+ app.queue(max_size=5).launch(
1238
+ share=share,
1239
+ server_port=server_port,
1240
+ show_error=True,
1241
+ allowed_paths=[WORK_DIR],
1242
+ # Badge rendering will be broken otherwise.
1243
+ ssr_mode=False,
1244
+ )
1245
+
1246
+
1247
+ if __name__ == "__main__":
1248
+ tyro.cli(main)
docs/CLI_USAGE.md ADDED
@@ -0,0 +1,169 @@
1
+ # :computer: CLI Demo
2
+
3
+ This CLI demo allows you to pass in more options and control the model in a fine-grained way, which makes it suitable for power users and academic researchers. An example command line is as simple as
4
+
5
+ ```bash
6
+ python demo.py --data_path <data_path> [additional arguments]
7
+ ```
8
+
9
+ We first discuss some key attributes:
10
+
11
+ - `Procedural Two-Pass Sampling`: We recommend enabling procedural sampling by setting `--use_traj_prior True --chunk_strategy <chunk_strategy>` with `<chunk_strategy>` set according to the type of the task.
12
+ - `Resolution and Aspect-Ratio`: Default image preprocessing includes center cropping, so all inputs and outputs are square images of size $576\times 576$. To override this, the code supports passing in `--W <W> --H <H>` directly. We recommend passing in `--L_short 576` so that the aspect ratio of the original image is kept while the shorter side is resized to $576$ (see the sketch below).
13
+
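+ As a concrete illustration of `--L_short`, below is a minimal sketch (not the code's exact implementation) of the output resolution for a given input size; snapping each side to a multiple of 64 mirrors the `size_stride=64` used in preprocessing, but the exact rounding rule here is an assumption.
+
+ ```python
+ # Minimal sketch: resolution produced by `--L_short 576` for an (H, W) input.
+ def target_resolution(h: int, w: int, l_short: int = 576, stride: int = 64) -> tuple[int, int]:
+     scale = l_short / min(h, w)  # keep aspect ratio, shorter side -> l_short
+     new_h, new_w = round(h * scale), round(w * scale)
+     # Snap both sides to the model's stride (assumed rounding-down behavior).
+     new_h = max(stride, (new_h // stride) * stride)
+     new_w = max(stride, (new_w // stride) * stride)
+     return new_h, new_w
+
+ print(target_resolution(1080, 1920))  # (576, 1024)
+ ```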
14
+ ## Task
15
+
16
+ Before diving into the command lines, we introduce `Task` (specified by `--task <task>`) to bucket different use cases depending on the data constraints in the input and output domains (e.g., whether the ordering is available).
17
+
18
+ | Task | Type of NVS | Format of `<data_path>` | Target Views Sorted? | Input and Target Views Sorted? | Recommended Usage |
19
+ | :------------------: | :------------: | :--------------------------------------: | :------------------: | :----------------------------: | :----------------------: |
20
+ | `img2img` | set NVS | folder (parsable by `ReconfusionParser`) | :x: | :x: | evaluation, benchmarking |
21
+ | `img2vid` | trajectory NVS | folder (parsable by `ReconfusionParser`) | :white_check_mark: | :white_check_mark: | evaluation, benchmarking |
22
+ | `img2trajvid_s-prob` | trajectory NVS | single image | :white_check_mark: | :white_check_mark: | general |
23
+ | `img2trajvid` | trajectory NVS | folder (parsable by `ReconfusionParser`) | :white_check_mark: | :x: | general |
24
+
25
+ ### Format of `<data_path>`
26
+
27
+ For the `img2trajvid_s-prob` task, we generate a trajectory video following preset camera motions or effects given only one input image, so the data format is as simple as
28
+
29
+ ```bash
30
+ <data_path>/
31
+ ├── scene_1.png
32
+ ├── scene_2.png
33
+ └── scene_3.png
34
+ ```
35
+
36
+ For all the other tasks, we use a folder for each scene that is parsable by `ReconfusionParser` (see `seva/data_io.py`). It contains (1) a subdirectory containing all views; (2) `transforms.json` defining the intrinsics and extrinsics (OpenGL convention) for each image; and (3) a `train_test_split_*.json` file splitting the input and target views, with `*` indicating the number of input views (a minimal sketch for generating such a split file is shown after the directory listing below).
37
+
38
+ We provide several example scenes <a href="https://github.com/Stability-AI/stable-virtual-camera/releases/tag/assets_demo_cli">in this release</a> (`assets_demo_cli.zip`) for reference. Target views are available when the data come from academic sources; when target views are unavailable, we create dummy black images as placeholders (e.g., the `garden_flythrough` scene). The general data structure follows
39
+
40
+ ```bash
41
+ <data_path>/
42
+ ├── scene_1/
43
+ ├── train_test_split_1.json # for single-view regime
44
+ ├── train_test_split_6.json # for sparse-view regime
45
+ ├── train_test_split_32.json # for semi-dense-view regime
46
+ ├── transforms.json
47
+ └── images/
48
+ ├── image_0.png
49
+ ├── image_1.png
50
+ ├── ...
51
+ └── image_1000.png
52
+ ├── scene_2
53
+ └── scene_3
54
+ ```
55
+
56
+ You can specify which scene to run by passing in `--data_items scene_1,scene_2` to run, for example, `scene_1` and `scene_2`.
57
+
58
+ ### Recommended Usage
59
+
60
+ - `img2img` and `img2vid` are recommended for evaluation and benchmarking. These two tasks are used for the quantitative evaluation in our <a href="http://arxiv.org/abs/2503.14489">paper</a>. The data is converted from academic datasets, so the ground-truth target views are available for metric computation. Check the [`benchmark`](../benchmark/) folder for the detailed splits we organize to benchmark different NVS models.
61
+ - `img2vid` requires both the input and target views to be sorted, which is usually not guaranteed in general usage.
62
+ - `img2trajvid_s-prob` is for general usage but only for single-view regime and fixed preset camera control.
63
+ - `img2trajvid` is the task designed for general usage since it does not require the input views to be ordered. This is the task used in the gradio demo.
64
+
65
+ Next, we go over all tasks and provide an example command line for each.
66
+
67
+ ## `img2img`
68
+
69
+ ```bash
70
+ python demo.py \
71
+ --data_path <data_path> \
72
+ --num_inputs <P> \
73
+ --video_save_fps 10
74
+ ```
75
+
76
+ - `--num_inputs <P>` is only necessary if there are multiple `train_test_split_*.json` files in the scene folder.
77
+ - The above command works for datasets without a trajectory prior (e.g., DL3DV-140). When a trajectory prior is available for a benchmarking dataset, for example, the `orbit` trajectory prior for the CO3D dataset, we use the `nearest-gt` chunking strategy by setting `--use_traj_prior True --traj_prior orbit --chunk_strategy nearest-gt`. We find this leads to more 3D-consistent results.
78
+ - For all single-view conditioning test scenarios, we set `--camera_scale <camera_scale>` with `<camera_scale>` sweeping 20 different camera scales `0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0` (see the sketch after this list).
79
+ - In the single-view regime for the RealEstate10K dataset, we find increasing `cfg` helpful: we additionally set `--cfg 6.0` (`cfg` is `2.0` by default).
80
+ - For the evaluation in the semi-dense-view regime (i.e., the DL3DV-140 and Tanks and Temples datasets) with `32` input views, we zero-shot extend `T` to fit all input and target views in one forward pass. Specifically, we set `--T 90` for the DL3DV-140 dataset and `--T 80` for the Tanks and Temples dataset.
81
+ - For the evaluation on the ViewCrafter split (including the RealEstate10K, CO3D, and Tanks and Temples datasets), we find that zero-shot extending `T` to `25` to fit all input and target views in one forward pass works better. Also, this split uses the original image resolutions: we therefore set `--T 25 --L_short 576`.
82
+
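+ A minimal sketch of the camera-scale sweep mentioned above (invoking `demo.py` once per scale; the data path is a placeholder):
+
+ ```python
+ import subprocess
+
+ for camera_scale in [round(0.1 * i, 1) for i in range(1, 21)]:  # 0.1 ... 2.0
+     subprocess.run(
+         [
+             "python", "demo.py",
+             "--data_path", "<data_path>",
+             "--num_inputs", "1",
+             "--camera_scale", str(camera_scale),
+             "--video_save_fps", "10",
+         ],
+         check=True,
+     )
+ ```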
83
+ For example, you can run the following command on the example `dl3d140-165f5af8bfe32f70595a1c9393a6e442acf7af019998275144f605b89a306557` with 3 input views:
84
+
85
+ ```bash
86
+ python demo.py \
87
+ --data_path /path/to/assets_demo_cli/ \
88
+ --data_items dl3d140-165f5af8bfe32f70595a1c9393a6e442acf7af019998275144f605b89a306557 \
89
+ --num_inputs 3 \
90
+ --video_save_fps 10
91
+ ```
92
+
93
+ ## `img2vid`
94
+
95
+ ```bash
96
+ python demo.py \
97
+ --data_path <data_path> \
98
+ --task img2vid \
99
+ --replace_or_include_input True \
100
+ --num_inputs <P> \
101
+ --use_traj_prior True \
102
+ --chunk_strategy interp \
103
+ ```
104
+
105
+ - `--replace_or_include_input True` is necessary here since input views and target views are mutually exclusive and together form a trajectory in this task, so we need to append the input views back to the generated target views.
106
+ - `--num_inputs <P>` is only necessary if there are multiple `train_test_split_*.json` files in the scene folder.
107
+ - We use `interp` chunking strategy by default.
108
+ - For the evaluation on the ViewCrafter split (including the RealEstate10K, CO3D, and Tanks and Temples datasets), we find that zero-shot extending `T` to `25` to fit all input and target views in one forward pass works better. Also, this split uses the original image resolutions: we therefore set `--T 25 --L_short 576`.
109
+
110
+ ## `img2trajvid_s-prob`
111
+
112
+ ```bash
113
+ python demo.py \
114
+ --data_path <data_path> \
115
+ --task img2trajvid_s-prob \
116
+ --replace_or_include_input True \
117
+ --traj_prior orbit \
118
+ --cfg 4.0,2.0 \
119
+ --guider 1,2 \
120
+ --num_targets 111 \
121
+ --L_short 576 \
122
+ --use_traj_prior True \
123
+ --chunk_strategy interp
124
+ ```
125
+
126
+ - `--replace_or_include_input True` is necessary here since input views and target views are mutually exclusive and together form a trajectory in this task, so we need to append the input views back to the generated target views.
127
+ - Default `cfg` should be adjusted according to `traj_prior`.
128
+ - Default chunking strategy is `interp`.
129
+ - Default guider is `--guider 1,2` (instead of `1`, `1` still works but `1,2` is slightly better).
130
+ - `camera_scale` (default is `2.0`) can be adjusted according to `traj_prior`. The model has scale ambiguity with single-view input, especially for panning motions. We encourage tuning `camera_scale` up to `10.0` for all panning motions (`--traj_prior pan-*/dolly*`) if you expect a larger camera motion.
131
+
132
+ ## `img2trajvid`
133
+
134
+ ### Sparse-view regime ($P\leq 8$)
135
+
136
+ ```bash
137
+ python demo.py \
138
+ --data_path <data_path> \
139
+ --task img2trajvid \
140
+ --num_inputs <P> \
141
+ --cfg 3.0,2.0 \
142
+ --use_traj_prior True \
143
+ --chunk_strategy interp-gt
144
+ ```
145
+
146
+ - `--num_inputs <P>` is only necessary if there are multiple `train_test_split_*.json` files in the scene folder.
147
+ - Default `cfg` should be set to `3,2` (`3` being the `cfg` for the first pass and `2` the `cfg` for the second pass). Try increasing the first-pass `cfg` above `3` if you observe blurry areas (which usually happens for harder scenes with a fair amount of unseen regions).
148
+ - Default chunking strategy should be set to `interp-gt` (instead of `interp`; `interp` can work but is usually a bit worse).
149
+ - `--chunk_strategy_first_pass` is set to `gt-nearest` by default, so it can automatically adapt when $P$ is large (up to a thousand frames).
150
+
151
+ ### Semi-dense-view regime ($P\geq 9$)
152
+
153
+ ```bash
154
+ python demo.py \
155
+ --data_path <data_path> \
156
+ --task img2trajvid \
157
+ --num_inputs <P> \
158
+ --cfg 3.0 \
159
+ --L_short 576 \
160
+ --use_traj_prior True \
161
+ --chunk_strategy interp
162
+ ```
163
+
164
+ - `--num_inputs <P>` is only necessary if there are multiple `train_test_split_*.json` files in the scene folder.
165
+ - Default `cfg` should be set to `3`.
166
+ - Default chunking strategy should be set to `interp` (instead of `interp-gt`, `interp-gt` is also supported but the results do not look good).
167
+ - `T` can be overwritten by `--T <N>,21` (`<N>` being the extended `T` for the first pass, and `21` being the default `T` for the second pass). `<N>` is decided dynamically in the code but can also be set manually. This is useful when you observe two very dissimilar adjacent anchors that make the interpolation in the second pass impossible. There are two ways:
168
+ - `--T 96,21`: this overwrites the `T` in the first pass to be exactly `96`.
169
+ - `--num_prior_frames_ratio 1.2`: this dynamically enlarges the first-pass `T` by a factor of `1.2` (for example, a first-pass `T` that would have been `80` becomes `96`, matching `--T 96,21` above).
docs/GR_USAGE.md ADDED
@@ -0,0 +1,76 @@
1
+ # :rocket: Gradio Demo
2
+
3
+ This gradio demo is the simplest starting point for you to play with our project.
4
+
5
+ You can either visit it at our Hugging Face space [here](https://huggingface.co/spaces/stabilityai/stable-virtual-camera) or run it locally yourself with
6
+
7
+ ```bash
8
+ python demo_gr.py
9
+ ```
10
+
11
+ We provide two ways to use our demo:
12
+
13
+ 1. `Basic` mode, where the user can upload a single image and set a target camera trajectory from our preset options. This is the most straightforward way to use our model, and is suitable for most users.
14
+ 2. `Advanced` mode, where the user can upload one or multiple images and set a target camera trajectory by interacting with a 3D viewport (powered by [viser](https://viser.studio/latest)). This is suitable for power users and academic researchers.
15
+
16
+ ### `Basic`
17
+
18
+ This is the default mode when entering our demo (given its simplicity).
19
+
20
+ The user can upload a single image and set a target camera trajectory from our preset options. This is the most straightforward way to use our model and is suitable for most users.
21
+
22
+ Here is a video walkthrough:
23
+
24
+ https://github.com/user-attachments/assets/4d965fa6-d8eb-452c-b773-6e09c88ca705
25
+
26
+ You can choose from 13 preset trajectories that are common for NVS (`move-forward/backward` are omitted for visualization purposes):
27
+
28
+ https://github.com/user-attachments/assets/b2cf8700-3d85-44b9-8d52-248e82f1fb55
29
+
30
+ More formally:
31
+
32
+ - `orbit/spiral/lemniscate` are good for showing the "3D-ness" of the scene.
33
+ - `zoom-in/out` keep the camera position the same while increasing/decreasing the focal length.
34
+ - `dolly zoom-in/out` move the camera backward/forward while increasing/decreasing the focal length.
35
+ - `move-forward/backward/up/down/left/right` translate the camera in the corresponding directions (see the sketch after this list).
36
+
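+ Below is a minimal sketch (not the demo's exact implementation) contrasting these trajectory families, assuming a pinhole camera with a 4x4 camera-to-world pose whose third column is the viewing direction and a normalized focal length `f`.
+
+ ```python
+ import numpy as np
+
+ def zoom(f: float, zoom_factor: float) -> float:
+     # Zooming changes only the focal length (i.e., the field of view);
+     # the camera position stays fixed. The factor convention is hypothetical.
+     return f * zoom_factor
+
+ def move_forward(c2w: np.ndarray, step: float) -> np.ndarray:
+     # Moving (and the translation part of a dolly zoom) shifts the camera
+     # along its forward axis, leaving the focal length untouched.
+     out = c2w.copy()
+     out[:3, 3] += step * out[:3, 2]  # assumes +z is the forward axis
+     return out
+
+ # A dolly zoom combines both: translate backward/forward while increasing/
+ # decreasing the focal length.
+ ```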
37
+ Notes:
38
+
39
+ - For an 80-frame video at `768x576` resolution, it takes around 20 seconds for the first-pass generation and around 2 minutes for the second-pass generation, tested with a single H100 GPU.
40
+ - Please expect around 2-3x longer on the HF space.
41
+
42
+ ### `Advanced`
43
+
44
+ This is the power mode where you can have very fine-grained control over camera trajectories.
45
+
46
+ The user can upload one or multiple images and set a target camera trajectory by interacting with a 3D viewport. This is suitable for power users and academic researchers.
47
+
48
+ Here is a video walkthrough:
49
+
50
+ https://github.com/user-attachments/assets/dcec1be0-bd10-441e-879c-d1c2b63091ba
51
+
52
+ Notes:
53
+
54
+ - For a 134-frame video at `576x576` resolution, it takes around 16 seconds for the first-pass generation and around 4 minutes for the second-pass generation, tested with a single H100 GPU.
55
+ - Please expect around 2-3x longer on the HF space.
56
+
57
+ ### Pro tips
58
+
59
+ - If the first-pass sampling result is bad, click the "Abort rendering" button in the GUI to avoid getting stuck in the second-pass sampling, so that you can try something else.
60
+
61
+ ### Performance benchmark
62
+
63
+ We have tested our gradio demo in both a local environment and the HF space environment, across different modes and compilation settings. Here are our results:
64
+ | Total time (s) | `Basic` first pass | `Basic` second pass | `Advanced` first pass | `Advanced` second pass |
65
+ |:------------------------:|:-----------------:|:------------------:|:--------------------:|:---------------------:|
66
+ | HF (L40S, w/o comp.) | 68 | 484 | 48 | 780 |
67
+ | HF (L40S, w/ comp.) | 51 | 362 | 36 | 587 |
68
+ | Local (H100, w/o comp.) | 35 | 204 | 20 | 313 |
69
+ | Local (H100, w/ comp.) | 21 | 144 | 16 | 234 |
70
+
71
+ Notes:
72
+
73
+ - The HF space uses an L40S GPU, and our local environment uses an H100 GPU.
74
+ - We opt into compilation via `torch.compile`.
75
+ - `Basic` mode is tested by generating 80 frames at `768x576` resolution.
76
+ - `Advanced` mode is tested by generating 134 frames at `576x576` resolution.
docs/INSTALL.md ADDED
@@ -0,0 +1,39 @@
1
+ # :wrench: Installation
2
+
3
+ ### Model Dependencies
4
+
5
+ ```bash
6
+ # Install seva model dependencies.
7
+ pip install -e .
8
+ ```
9
+
10
+ ### Demo Dependencies
11
+
12
+ To use the cli demo (`demo.py`) or the gradio demo (`demo_gr.py`), do the following:
13
+
14
+ ```bash
15
+ # Initialize and update submodules for demo.
16
+ git submodule update --init --recursive
17
+
18
+ # Install pycolmap dependencies for cli and gradio demo (our model is not dependent on it).
19
+ echo "Installing pycolmap (for both cli and gradio demo)..."
20
+ pip install git+https://github.com/jensenz-sai/pycolmap@543266bc316df2fe407b3a33d454b310b1641042
21
+
22
+ # Install dust3r dependencies for gradio demo (our model is not dependent on it).
23
+ echo "Installing dust3r dependencies (only for gradio demo)..."
24
+ pushd third_party/dust3r
25
+ pip install -r requirements.txt
26
+ popd
27
+ ```
28
+
29
+ ### Dev and Speeding Up (Optional)
30
+
31
+ ```bash
32
+ # [OPTIONAL] Install seva dependencies for development.
33
+ pip install -e ".[dev]"
34
+ pre-commit install
35
+
36
+ # [OPTIONAL] Install the torch nightly version for faster JIT via torch.compile (speeds up sampling by 2x in our testing).
37
+ # Please adjust to your own CUDA version. For example, if you have CUDA 11.8, use the following command.
38
+ pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
39
+ ```
pyproject.toml ADDED
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=65.5.3"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "seva"
7
+ version = "0.0.0"
8
+ requires-python = ">=3.10"
9
+ dependencies = [
10
+ "torch>=2.6.0",
11
+ "roma",
12
+ "viser",
13
+ "tyro",
14
+ "fire",
15
+ "ninja",
16
+ "gradio==5.17.0",
17
+ "einops",
18
+ "colorama",
19
+ "splines",
20
+ "kornia",
21
+ "open-clip-torch",
22
+ "diffusers",
23
+ "numpy==1.24.4",
24
+ "imageio[ffmpeg]",
25
+ "huggingface-hub",
26
+ "opencv-python",
27
+ ]
28
+
29
+ [project.optional-dependencies]
30
+ dev = ["ruff", "ipdb", "pytest", "line_profiler", "pre-commit"]
31
+
32
+ [tool.setuptools.packages.find]
33
+ include = ["seva"]
34
+
35
+ [tool.pyright]
36
+ extraPaths = ["third_party/dust3r"]
37
+
38
+ [tool.ruff]
39
+ lint.ignore = ["E741"]
seva/__init__.py ADDED
File without changes
seva/data_io.py ADDED
@@ -0,0 +1,553 @@
1
+ import json
2
+ import os
3
+ import os.path as osp
4
+ from glob import glob
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ import cv2
8
+ import imageio.v3 as iio
9
+ import numpy as np
10
+ import torch
11
+
12
+ from seva.geometry import (
13
+ align_principle_axes,
14
+ similarity_from_cameras,
15
+ transform_cameras,
16
+ transform_points,
17
+ )
18
+
19
+
20
+ def _get_rel_paths(path_dir: str) -> List[str]:
21
+ """Recursively get relative paths of files in a directory."""
22
+ paths = []
23
+ for dp, _, fn in os.walk(path_dir):
24
+ for f in fn:
25
+ paths.append(os.path.relpath(os.path.join(dp, f), path_dir))
26
+ return paths
27
+
28
+
29
+ class BaseParser(object):
30
+ def __init__(
31
+ self,
32
+ data_dir: str,
33
+ factor: int = 1,
34
+ normalize: bool = False,
35
+ test_every: Optional[int] = 8,
36
+ ):
37
+ self.data_dir = data_dir
38
+ self.factor = factor
39
+ self.normalize = normalize
40
+ self.test_every = test_every
41
+
42
+ self.image_names: List[str] = [] # (num_images,)
43
+ self.image_paths: List[str] = [] # (num_images,)
44
+ self.camtoworlds: np.ndarray = np.zeros((0, 4, 4)) # (num_images, 4, 4)
45
+ self.camera_ids: List[int] = [] # (num_images,)
46
+ self.Ks_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> K
47
+ self.params_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> params
48
+ self.imsize_dict: Dict[
49
+ int, Tuple[int, int]
50
+ ] = {} # Dict of camera_id -> (width, height)
51
+ self.points: np.ndarray = np.zeros((0, 3)) # (num_points, 3)
52
+ self.points_err: np.ndarray = np.zeros((0,)) # (num_points,)
53
+ self.points_rgb: np.ndarray = np.zeros((0, 3)) # (num_points, 3)
54
+ self.point_indices: Dict[str, np.ndarray] = {} # Dict of image_name -> (M,)
55
+ self.transform: np.ndarray = np.zeros((4, 4)) # (4, 4)
56
+
57
+ self.mapx_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> (H, W)
58
+ self.mapy_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> (H, W)
59
+ self.roi_undist_dict: Dict[int, Tuple[int, int, int, int]] = (
60
+ dict()
61
+ ) # Dict of camera_id -> (x, y, w, h)
62
+ self.scene_scale: float = 1.0
63
+
64
+
65
+ class DirectParser(BaseParser):
66
+ def __init__(
67
+ self,
68
+ imgs: List[np.ndarray],
69
+ c2ws: np.ndarray,
70
+ Ks: np.ndarray,
71
+ points: Optional[np.ndarray] = None,
72
+ points_rgb: Optional[np.ndarray] = None, # uint8
73
+ mono_disps: Optional[List[np.ndarray]] = None,
74
+ normalize: bool = False,
75
+ test_every: Optional[int] = None,
76
+ ):
77
+ super().__init__("", 1, normalize, test_every)
78
+
79
+ self.image_names = [f"{i:06d}" for i in range(len(imgs))]
80
+ self.image_paths = ["null" for _ in range(len(imgs))]
81
+ self.camtoworlds = c2ws
82
+ self.camera_ids = [i for i in range(len(imgs))]
83
+ self.Ks_dict = {i: K for i, K in enumerate(Ks)}
84
+ self.imsize_dict = {
85
+ i: (img.shape[1], img.shape[0]) for i, img in enumerate(imgs)
86
+ }
87
+ if points is not None:
88
+ self.points = points
89
+ assert points_rgb is not None
90
+ self.points_rgb = points_rgb
91
+ self.points_err = np.zeros((len(points),))
92
+
93
+ self.imgs = imgs
94
+ self.mono_disps = mono_disps
95
+
96
+ # Normalize the world space.
97
+ if normalize:
98
+ T1 = similarity_from_cameras(self.camtoworlds)
99
+ self.camtoworlds = transform_cameras(T1, self.camtoworlds)
100
+
101
+ if points is not None:
102
+ self.points = transform_points(T1, self.points)
103
+ T2 = align_principle_axes(self.points)
104
+ self.camtoworlds = transform_cameras(T2, self.camtoworlds)
105
+ self.points = transform_points(T2, self.points)
106
+ else:
107
+ T2 = np.eye(4)
108
+
109
+ self.transform = T2 @ T1
110
+ else:
111
+ self.transform = np.eye(4)
112
+
113
+ # size of the scene measured by cameras
114
+ camera_locations = self.camtoworlds[:, :3, 3]
115
+ scene_center = np.mean(camera_locations, axis=0)
116
+ dists = np.linalg.norm(camera_locations - scene_center, axis=1)
117
+ self.scene_scale = np.max(dists)
118
+
119
+
120
+ class COLMAPParser(BaseParser):
121
+ """COLMAP parser."""
122
+
123
+ def __init__(
124
+ self,
125
+ data_dir: str,
126
+ factor: int = 1,
127
+ normalize: bool = False,
128
+ test_every: Optional[int] = 8,
129
+ image_folder: str = "images",
130
+ colmap_folder: str = "sparse/0",
131
+ ):
132
+ super().__init__(data_dir, factor, normalize, test_every)
133
+
134
+ colmap_dir = os.path.join(data_dir, colmap_folder)
135
+ assert os.path.exists(
136
+ colmap_dir
137
+ ), f"COLMAP directory {colmap_dir} does not exist."
138
+
139
+ try:
140
+ from pycolmap import SceneManager
141
+ except ImportError:
142
+ raise ImportError(
143
+ "Please install pycolmap to use the data parsers: "
144
+ " `pip install git+https://github.com/jensenz-sai/pycolmap.git@543266bc316df2fe407b3a33d454b310b1641042`"
145
+ )
146
+
147
+ manager = SceneManager(colmap_dir)
148
+ manager.load_cameras()
149
+ manager.load_images()
150
+ manager.load_points3D()
151
+
152
+ # Extract extrinsic matrices in world-to-camera format.
153
+ imdata = manager.images
154
+ w2c_mats = []
155
+ camera_ids = []
156
+ Ks_dict = dict()
157
+ params_dict = dict()
158
+ imsize_dict = dict() # width, height
159
+ bottom = np.array([0, 0, 0, 1]).reshape(1, 4)
160
+ for k in imdata:
161
+ im = imdata[k]
162
+ rot = im.R()
163
+ trans = im.tvec.reshape(3, 1)
164
+ w2c = np.concatenate([np.concatenate([rot, trans], 1), bottom], axis=0)
165
+ w2c_mats.append(w2c)
166
+
167
+ # support different camera intrinsics
168
+ camera_id = im.camera_id
169
+ camera_ids.append(camera_id)
170
+
171
+ # camera intrinsics
172
+ cam = manager.cameras[camera_id]
173
+ fx, fy, cx, cy = cam.fx, cam.fy, cam.cx, cam.cy
174
+ K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
175
+ K[:2, :] /= factor
176
+ Ks_dict[camera_id] = K
177
+
178
+ # Get distortion parameters.
179
+ type_ = cam.camera_type
180
+ if type_ == 0 or type_ == "SIMPLE_PINHOLE":
181
+ params = np.empty(0, dtype=np.float32)
182
+ camtype = "perspective"
183
+ elif type_ == 1 or type_ == "PINHOLE":
184
+ params = np.empty(0, dtype=np.float32)
185
+ camtype = "perspective"
186
+ elif type_ == 2 or type_ == "SIMPLE_RADIAL":
187
+ params = np.array([cam.k1, 0.0, 0.0, 0.0], dtype=np.float32)
188
+ camtype = "perspective"
189
+ elif type_ == 3 or type_ == "RADIAL":
190
+ params = np.array([cam.k1, cam.k2, 0.0, 0.0], dtype=np.float32)
191
+ camtype = "perspective"
192
+ elif type_ == 4 or type_ == "OPENCV":
193
+ params = np.array([cam.k1, cam.k2, cam.p1, cam.p2], dtype=np.float32)
194
+ camtype = "perspective"
195
+ elif type_ == 5 or type_ == "OPENCV_FISHEYE":
196
+ params = np.array([cam.k1, cam.k2, cam.k3, cam.k4], dtype=np.float32)
197
+ camtype = "fisheye"
198
+ assert (
199
+ camtype == "perspective" # type: ignore
200
+ ), f"Only support perspective camera model, got {type_}"
201
+
202
+ params_dict[camera_id] = params # type: ignore
203
+
204
+ # image size
205
+ imsize_dict[camera_id] = (cam.width // factor, cam.height // factor)
206
+
207
+ print(
208
+ f"[Parser] {len(imdata)} images, taken by {len(set(camera_ids))} cameras."
209
+ )
210
+
211
+ if len(imdata) == 0:
212
+ raise ValueError("No images found in COLMAP.")
213
+ if not (type_ == 0 or type_ == 1): # type: ignore
214
+ print("Warning: COLMAP Camera is not PINHOLE. Images have distortion.")
215
+
216
+ w2c_mats = np.stack(w2c_mats, axis=0)
217
+
218
+ # Convert extrinsics to camera-to-world.
219
+ camtoworlds = np.linalg.inv(w2c_mats)
220
+
221
+ # Image names from COLMAP. No need for permuting the poses according to
222
+ # image names anymore.
223
+ image_names = [imdata[k].name for k in imdata]
224
+
225
+ # Previous NeRF results were generated with images sorted by filename,
226
+ # ensure metrics are reported on the same test set.
227
+ inds = np.argsort(image_names)
228
+ image_names = [image_names[i] for i in inds]
229
+ camtoworlds = camtoworlds[inds]
230
+ camera_ids = [camera_ids[i] for i in inds]
231
+
232
+ # Load images.
233
+ if factor > 1:
234
+ image_dir_suffix = f"_{factor}"
235
+ else:
236
+ image_dir_suffix = ""
237
+ colmap_image_dir = os.path.join(data_dir, image_folder)
238
+ image_dir = os.path.join(data_dir, image_folder + image_dir_suffix)
239
+ for d in [image_dir, colmap_image_dir]:
240
+ if not os.path.exists(d):
241
+ raise ValueError(f"Image folder {d} does not exist.")
242
+
243
+ # Downsampled images may have different names vs images used for COLMAP,
244
+ # so we need to map between the two sorted lists of files.
245
+ colmap_files = sorted(_get_rel_paths(colmap_image_dir))
246
+ image_files = sorted(_get_rel_paths(image_dir))
247
+ colmap_to_image = dict(zip(colmap_files, image_files))
248
+ image_paths = [os.path.join(image_dir, colmap_to_image[f]) for f in image_names]
249
+
250
+ # 3D points and {image_name -> [point_idx]}
251
+ points = manager.points3D.astype(np.float32) # type: ignore
252
+ points_err = manager.point3D_errors.astype(np.float32) # type: ignore
253
+ points_rgb = manager.point3D_colors.astype(np.uint8) # type: ignore
254
+ point_indices = dict()
255
+
256
+ image_id_to_name = {v: k for k, v in manager.name_to_image_id.items()}
257
+ for point_id, data in manager.point3D_id_to_images.items():
258
+ for image_id, _ in data:
259
+ image_name = image_id_to_name[image_id]
260
+ point_idx = manager.point3D_id_to_point3D_idx[point_id]
261
+ point_indices.setdefault(image_name, []).append(point_idx)
262
+ point_indices = {
263
+ k: np.array(v).astype(np.int32) for k, v in point_indices.items()
264
+ }
265
+
266
+ # Normalize the world space.
267
+ if normalize:
268
+ T1 = similarity_from_cameras(camtoworlds)
269
+ camtoworlds = transform_cameras(T1, camtoworlds)
270
+ points = transform_points(T1, points)
271
+
272
+ T2 = align_principle_axes(points)
273
+ camtoworlds = transform_cameras(T2, camtoworlds)
274
+ points = transform_points(T2, points)
275
+
276
+ transform = T2 @ T1
277
+ else:
278
+ transform = np.eye(4)
279
+
280
+ self.image_names = image_names # List[str], (num_images,)
281
+ self.image_paths = image_paths # List[str], (num_images,)
282
+ self.camtoworlds = camtoworlds # np.ndarray, (num_images, 4, 4)
283
+ self.camera_ids = camera_ids # List[int], (num_images,)
284
+ self.Ks_dict = Ks_dict # Dict of camera_id -> K
285
+ self.params_dict = params_dict # Dict of camera_id -> params
286
+ self.imsize_dict = imsize_dict # Dict of camera_id -> (width, height)
287
+ self.points = points # np.ndarray, (num_points, 3)
288
+ self.points_err = points_err # np.ndarray, (num_points,)
289
+ self.points_rgb = points_rgb # np.ndarray, (num_points, 3)
290
+ self.point_indices = point_indices # Dict[str, np.ndarray], image_name -> [M,]
291
+ self.transform = transform # np.ndarray, (4, 4)
292
+
293
+ # undistortion
294
+ self.mapx_dict = dict()
295
+ self.mapy_dict = dict()
296
+ self.roi_undist_dict = dict()
297
+ for camera_id in self.params_dict.keys():
298
+ params = self.params_dict[camera_id]
299
+ if len(params) == 0:
300
+ continue # no distortion
301
+ assert camera_id in self.Ks_dict, f"Missing K for camera {camera_id}"
302
+ assert (
303
+ camera_id in self.params_dict
304
+ ), f"Missing params for camera {camera_id}"
305
+ K = self.Ks_dict[camera_id]
306
+ width, height = self.imsize_dict[camera_id]
307
+ K_undist, roi_undist = cv2.getOptimalNewCameraMatrix(
308
+ K, params, (width, height), 0
309
+ )
310
+ mapx, mapy = cv2.initUndistortRectifyMap(
311
+ K,
312
+ params,
313
+ None,
314
+ K_undist,
315
+ (width, height),
316
+ cv2.CV_32FC1, # type: ignore
317
+ )
318
+ self.Ks_dict[camera_id] = K_undist
319
+ self.mapx_dict[camera_id] = mapx
320
+ self.mapy_dict[camera_id] = mapy
321
+ self.roi_undist_dict[camera_id] = roi_undist # type: ignore
322
+
323
+ # size of the scene measured by cameras
324
+ camera_locations = camtoworlds[:, :3, 3]
325
+ scene_center = np.mean(camera_locations, axis=0)
326
+ dists = np.linalg.norm(camera_locations - scene_center, axis=1)
327
+ self.scene_scale = np.max(dists)
328
+
329
+
330
+ class ReconfusionParser(BaseParser):
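+ """Parser for the ReconFusion-style evaluation layout: reads `transforms.json`
+ for cameras/intrinsics and `train_test_split_*.json` for the per-#-input-view
+ train/test splits."""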
331
+ def __init__(self, data_dir: str, normalize: bool = False):
332
+ super().__init__(data_dir, 1, normalize, test_every=None)
333
+
334
+ def get_num(p):
335
+ return p.split("_")[-1].removesuffix(".json")
336
+
337
+ splits_per_num_input_frames = {}
338
+ num_input_frames = [
339
+ int(get_num(p)) if get_num(p).isdigit() else get_num(p)
340
+ for p in sorted(glob(osp.join(data_dir, "train_test_split_*.json")))
341
+ ]
342
+ for num_input_frames in num_input_frames:
343
+ with open(
344
+ osp.join(
345
+ data_dir,
346
+ f"train_test_split_{num_input_frames}.json",
347
+ )
348
+ ) as f:
349
+ splits_per_num_input_frames[num_input_frames] = json.load(f)
350
+ self.splits_per_num_input_frames = splits_per_num_input_frames
351
+
352
+ with open(osp.join(data_dir, "transforms.json")) as f:
353
+ metadata = json.load(f)
354
+
355
+ image_names, image_paths, camtoworlds = [], [], []
356
+ for frame in metadata["frames"]:
357
+ if frame["file_path"] is None:
358
+ image_path = image_name = None
359
+ else:
360
+ image_path = osp.join(data_dir, frame["file_path"])
361
+ image_name = osp.basename(image_path)
362
+ image_paths.append(image_path)
363
+ image_names.append(image_name)
364
+ camtoworld = np.array(frame["transform_matrix"])
365
+ if "applied_transform" in metadata:
366
+ applied_transform = np.concatenate(
367
+ [metadata["applied_transform"], [[0, 0, 0, 1]]], axis=0
368
+ )
369
+ camtoworld = applied_transform @ camtoworld
370
+ camtoworlds.append(camtoworld)
371
+ camtoworlds = np.array(camtoworlds)
372
+ camtoworlds[:, :, [1, 2]] *= -1
373
+
374
+ # Normalize the world space.
375
+ if normalize:
376
+ T1 = similarity_from_cameras(camtoworlds)
377
+ camtoworlds = transform_cameras(T1, camtoworlds)
378
+ self.transform = T1
379
+ else:
380
+ self.transform = np.eye(4)
381
+
382
+ self.image_names = image_names
383
+ self.image_paths = image_paths
384
+ self.camtoworlds = camtoworlds
385
+ self.camera_ids = list(range(len(image_paths)))
386
+ self.Ks_dict = {
387
+ i: np.array(
388
+ [
389
+ [
390
+ metadata.get("fl_x", frame.get("fl_x", None)),
391
+ 0.0,
392
+ metadata.get("cx", frame.get("cx", None)),
393
+ ],
394
+ [
395
+ 0.0,
396
+ metadata.get("fl_y", frame.get("fl_y", None)),
397
+ metadata.get("cy", frame.get("cy", None)),
398
+ ],
399
+ [0.0, 0.0, 1.0],
400
+ ]
401
+ )
402
+ for i, frame in enumerate(metadata["frames"])
403
+ }
404
+ self.imsize_dict = {
405
+ i: (
406
+ metadata.get("w", frame.get("w", None)),
407
+ metadata.get("h", frame.get("h", None)),
408
+ )
409
+ for i, frame in enumerate(metadata["frames"])
410
+ }
411
+ # When num_input_frames is None, use all frames for both training and
412
+ # testing.
413
+ # self.splits_per_num_input_frames[None] = {
414
+ # "train_ids": list(range(len(image_paths))),
415
+ # "test_ids": list(range(len(image_paths))),
416
+ # }
417
+
418
+ # size of the scene measured by cameras
419
+ camera_locations = camtoworlds[:, :3, 3]
420
+ scene_center = np.mean(camera_locations, axis=0)
421
+ dists = np.linalg.norm(camera_locations - scene_center, axis=1)
422
+ self.scene_scale = np.max(dists)
423
+
424
+ self.bounds = None
425
+ if osp.exists(osp.join(data_dir, "bounds.npy")):
426
+ self.bounds = np.load(osp.join(data_dir, "bounds.npy"))
427
+ scaling = np.linalg.norm(self.transform[0, :3])
428
+ self.bounds = self.bounds / scaling
429
+
430
+
431
+ class Dataset(torch.utils.data.Dataset):
432
+ """A simple dataset class."""
433
+
434
+ def __init__(
435
+ self,
436
+ parser: BaseParser,
437
+ split: str = "train",
438
+ num_input_frames: Optional[int] = None,
439
+ patch_size: Optional[int] = None,
440
+ load_depths: bool = False,
441
+ load_mono_disps: bool = False,
442
+ ):
443
+ self.parser = parser
444
+ self.split = split
445
+ self.num_input_frames = num_input_frames
446
+ self.patch_size = patch_size
447
+ self.load_depths = load_depths
448
+ self.load_mono_disps = load_mono_disps
449
+ if load_mono_disps:
450
+ assert isinstance(parser, DirectParser)
451
+ assert parser.mono_disps is not None
452
+ if isinstance(parser, ReconfusionParser):
453
+ ids_per_split = parser.splits_per_num_input_frames[num_input_frames]
454
+ self.indices = ids_per_split[
455
+ "train_ids" if split == "train" else "test_ids"
456
+ ]
457
+ else:
458
+ indices = np.arange(len(self.parser.image_names))
459
+ if split == "train":
460
+ self.indices = (
461
+ indices[indices % self.parser.test_every != 0]
462
+ if self.parser.test_every is not None
463
+ else indices
464
+ )
465
+ else:
466
+ self.indices = (
467
+ indices[indices % self.parser.test_every == 0]
468
+ if self.parser.test_every is not None
469
+ else indices
470
+ )
471
+
472
+ def __len__(self):
473
+ return len(self.indices)
474
+
475
+ def __getitem__(self, item: int) -> Dict[str, Any]:
476
+ index = self.indices[item]
477
+ if isinstance(self.parser, DirectParser):
478
+ image = self.parser.imgs[index]
479
+ else:
480
+ image = iio.imread(self.parser.image_paths[index])[..., :3]
481
+ camera_id = self.parser.camera_ids[index]
482
+ K = self.parser.Ks_dict[camera_id].copy() # undistorted K
483
+ params = self.parser.params_dict.get(camera_id, None)
484
+ camtoworlds = self.parser.camtoworlds[index]
485
+
486
+ x, y, w, h = 0, 0, image.shape[1], image.shape[0]
487
+ if params is not None and len(params) > 0:
488
+ # Images are distorted. Undistort them.
489
+ mapx, mapy = (
490
+ self.parser.mapx_dict[camera_id],
491
+ self.parser.mapy_dict[camera_id],
492
+ )
493
+ image = cv2.remap(image, mapx, mapy, cv2.INTER_LINEAR)
494
+ x, y, w, h = self.parser.roi_undist_dict[camera_id]
495
+ image = image[y : y + h, x : x + w]
496
+
497
+ if self.patch_size is not None:
498
+ # Random crop.
499
+ h, w = image.shape[:2]
500
+ x = np.random.randint(0, max(w - self.patch_size, 1))
501
+ y = np.random.randint(0, max(h - self.patch_size, 1))
502
+ image = image[y : y + self.patch_size, x : x + self.patch_size]
503
+ K[0, 2] -= x
504
+ K[1, 2] -= y
505
+
506
+ data = {
507
+ "K": torch.from_numpy(K).float(),
508
+ "camtoworld": torch.from_numpy(camtoworlds).float(),
509
+ "image": torch.from_numpy(image).float(),
510
+ "image_id": item, # the index of the image in the dataset
511
+ }
512
+
513
+ if self.load_depths:
514
+ # projected points to image plane to get depths
515
+ worldtocams = np.linalg.inv(camtoworlds)
516
+ image_name = self.parser.image_names[index]
517
+ point_indices = self.parser.point_indices[image_name]
518
+ points_world = self.parser.points[point_indices]
519
+ points_cam = (worldtocams[:3, :3] @ points_world.T + worldtocams[:3, 3:4]).T
520
+ points_proj = (K @ points_cam.T).T
521
+ points = points_proj[:, :2] / points_proj[:, 2:3] # (M, 2)
522
+ depths = points_cam[:, 2] # (M,)
523
+ if self.patch_size is not None:
524
+ points[:, 0] -= x
525
+ points[:, 1] -= y
526
+ # filter out points outside the image
527
+ selector = (
528
+ (points[:, 0] >= 0)
529
+ & (points[:, 0] < image.shape[1])
530
+ & (points[:, 1] >= 0)
531
+ & (points[:, 1] < image.shape[0])
532
+ & (depths > 0)
533
+ )
534
+ points = points[selector]
535
+ depths = depths[selector]
536
+ data["points"] = torch.from_numpy(points).float()
537
+ data["depths"] = torch.from_numpy(depths).float()
538
+ if self.load_mono_disps:
539
+ data["mono_disps"] = torch.from_numpy(self.parser.mono_disps[index]).float() # type: ignore
540
+
541
+ return data
542
+
543
+
544
+ def get_parser(parser_type: str, **kwargs) -> BaseParser:
545
+ if parser_type == "colmap":
546
+ parser = COLMAPParser(**kwargs)
547
+ elif parser_type == "direct":
548
+ parser = DirectParser(**kwargs)
549
+ elif parser_type == "reconfusion":
550
+ parser = ReconfusionParser(**kwargs)
551
+ else:
552
+ raise ValueError(f"Unknown parser type: {parser_type}")
553
+ return parser
seva/eval.py ADDED
@@ -0,0 +1,1990 @@
1
+ import collections
2
+ import json
3
+ import math
4
+ import os
5
+ import re
6
+ import threading
7
+ from typing import List, Literal, Optional, Tuple, Union
8
+
9
+ import gradio as gr
10
+ from colorama import Fore, Style, init
11
+
12
+ init(autoreset=True)
13
+
14
+ import imageio.v3 as iio
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torchvision.transforms.functional as TF
19
+ from einops import repeat
20
+ from PIL import Image
21
+ from tqdm.auto import tqdm
22
+
23
+ from seva.geometry import get_camera_dist, get_plucker_coordinates, to_hom_pose
24
+ from seva.sampling import (
25
+ EulerEDMSampler,
26
+ MultiviewCFG,
27
+ MultiviewTemporalCFG,
28
+ VanillaCFG,
29
+ )
30
+ from seva.utils import seed_everything
31
+
32
+ try:
33
+ # Detect PyTorch nightly builds (version strings containing 'dev')
34
+ version = torch.__version__
35
+ IS_TORCH_NIGHTLY = "dev" in version
36
+ if IS_TORCH_NIGHTLY:
37
+ torch._dynamo.config.cache_size_limit = 128 # type: ignore[assignment]
38
+ torch._dynamo.config.accumulated_cache_size_limit = 1024 # type: ignore[assignment]
39
+ torch._dynamo.config.force_parameter_static_shapes = False # type: ignore[assignment]
40
+ except Exception:
41
+ IS_TORCH_NIGHTLY = False
42
+
43
+
44
+ def pad_indices(
45
+ input_indices: List[int],
46
+ test_indices: List[int],
47
+ T: int,
48
+ padding_mode: Literal["first", "last", "none"] = "last",
49
+ ):
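+ # Pad the input/test index lists so that together they fill a window of length T
+ # (for "last" padding, extra slots repeat the last input or test frame, whichever
+ # comes later). Returns the padded index lists plus two arrays mapping each window
+ # slot to an entry of `input`/`test`, with -1 marking slots owned by the other side.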
50
+ assert padding_mode in ["last", "none"], "`first` padding is not supported yet."
51
+ if padding_mode == "last":
52
+ padded_indices = [
53
+ i for i in range(T) if i not in (input_indices + test_indices)
54
+ ]
55
+ else:
56
+ padded_indices = []
57
+ input_selects = list(range(len(input_indices)))
58
+ test_selects = list(range(len(test_indices)))
59
+ if max(input_indices) > max(test_indices):
60
+ # last elem from input
61
+ input_selects += [input_selects[-1]] * len(padded_indices)
62
+ input_indices = input_indices + padded_indices
63
+ sorted_inds = np.argsort(input_indices)
64
+ input_indices = [input_indices[ind] for ind in sorted_inds]
65
+ input_selects = [input_selects[ind] for ind in sorted_inds]
66
+ else:
67
+ # last elem from test
68
+ test_selects += [test_selects[-1]] * len(padded_indices)
69
+ test_indices = test_indices + padded_indices
70
+ sorted_inds = np.argsort(test_indices)
71
+ test_indices = [test_indices[ind] for ind in sorted_inds]
72
+ test_selects = [test_selects[ind] for ind in sorted_inds]
73
+
74
+ if padding_mode == "last":
75
+ input_maps = np.array([-1] * T)
76
+ test_maps = np.array([-1] * T)
77
+ else:
78
+ input_maps = np.array([-1] * (len(input_indices) + len(test_indices)))
79
+ test_maps = np.array([-1] * (len(input_indices) + len(test_indices)))
80
+ input_maps[input_indices] = input_selects
81
+ test_maps[test_indices] = test_selects
82
+ return input_indices, test_indices, input_maps, test_maps
83
+
84
+
85
+ def assemble(
86
+ input,
87
+ test,
88
+ input_maps,
89
+ test_maps,
90
+ ):
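+ # Re-interleave input and test frames into one length-T sequence using the slot
+ # maps from `pad_indices`; each slot must be claimed by exactly one of the two.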
91
+ T = len(input_maps)
92
+ assembled = torch.zeros_like(test[-1:]).repeat_interleave(T, dim=0)
93
+ assembled[input_maps != -1] = input[input_maps[input_maps != -1]]
94
+ assembled[test_maps != -1] = test[test_maps[test_maps != -1]]
95
+ assert np.logical_xor(input_maps != -1, test_maps != -1).all()
96
+ return assembled
97
+
98
+
99
+ def get_resizing_factor(
100
+ target_shape: Tuple[int, int], # H, W
101
+ current_shape: Tuple[int, int], # H, W
102
+ cover_target: bool = True,
103
+ # If True, the output shape will fully cover the target shape.
104
+ # If False, the target shape will fully cover the output shape.
105
+ ) -> float:
106
+ r_bound = target_shape[1] / target_shape[0]
107
+ aspect_r = current_shape[1] / current_shape[0]
108
+ if r_bound >= 1.0:
109
+ if cover_target:
110
+ if aspect_r >= r_bound:
111
+ factor = min(target_shape) / min(current_shape)
112
+ elif aspect_r < 1.0:
113
+ factor = max(target_shape) / min(current_shape)
114
+ else:
115
+ factor = max(target_shape) / max(current_shape)
116
+ else:
117
+ if aspect_r >= r_bound:
118
+ factor = max(target_shape) / max(current_shape)
119
+ elif aspect_r < 1.0:
120
+ factor = min(target_shape) / max(current_shape)
121
+ else:
122
+ factor = min(target_shape) / min(current_shape)
123
+ else:
124
+ if cover_target:
125
+ if aspect_r <= r_bound:
126
+ factor = min(target_shape) / min(current_shape)
127
+ elif aspect_r > 1.0:
128
+ factor = max(target_shape) / min(current_shape)
129
+ else:
130
+ factor = max(target_shape) / max(current_shape)
131
+ else:
132
+ if aspect_r <= r_bound:
133
+ factor = max(target_shape) / max(current_shape)
134
+ elif aspect_r > 1.0:
135
+ factor = min(target_shape) / max(current_shape)
136
+ else:
137
+ factor = min(target_shape) / min(current_shape)
138
+ return factor
139
+
140
+
141
+ def get_unique_embedder_keys_from_conditioner(conditioner):
142
+ keys = [x.input_key for x in conditioner.embedders if x.input_key is not None]
143
+ keys = [item for sublist in keys for item in sublist] # Flatten list
144
+ return set(keys)
145
+
146
+
147
+ def get_wh_with_fixed_shortest_side(w, h, size):
148
+ # if size is None or non-positive, return the original (w, h)
149
+ if size is None or size <= 0:
150
+ return w, h
151
+ if w < h:
152
+ new_w = size
153
+ new_h = int(size * h / w)
154
+ else:
155
+ new_h = size
156
+ new_w = int(size * w / h)
157
+ return new_w, new_h
158
+
159
+
160
+ def load_img_and_K(
161
+ image_path_or_size: Union[str, torch.Size],
162
+ size: Optional[Union[int, Tuple[int, int]]],
163
+ scale: float = 1.0,
164
+ center: Tuple[float, float] = (0.5, 0.5),
165
+ K: torch.Tensor | None = None,
166
+ size_stride: int = 1,
167
+ center_crop: bool = False,
168
+ image_as_tensor: bool = True,
169
+ context_rgb: np.ndarray | None = None,
170
+ device: str = "cuda",
171
+ ):
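+ # Load an image from disk (or create a blank canvas if a torch.Size is given),
+ # composite its alpha channel over `context_rgb` or white, rescale/crop it to the
+ # requested size, and update the intrinsics K to account for the resize and crop.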
172
+ if isinstance(image_path_or_size, torch.Size):
173
+ image = Image.new("RGBA", image_path_or_size[::-1])
174
+ else:
175
+ image = Image.open(image_path_or_size).convert("RGBA")
176
+
177
+ w, h = image.size
178
+ if size is None:
179
+ size = (w, h)
180
+
181
+ image = np.array(image).astype(np.float32) / 255
182
+ if image.shape[-1] == 4:
183
+ rgb, alpha = image[:, :, :3], image[:, :, 3:]
184
+ if context_rgb is not None:
185
+ image = rgb * alpha + context_rgb * (1 - alpha)
186
+ else:
187
+ image = rgb * alpha + (1 - alpha)
188
+ image = image.transpose(2, 0, 1)
189
+ image = torch.from_numpy(image).to(dtype=torch.float32)
190
+ image = image.unsqueeze(0)
191
+
192
+ if isinstance(size, (tuple, list)):
193
+ # => if size is a tuple or list, we first rescale to fully cover the `size`
194
+ # area and then crop the `size` area from the rescaled image
195
+ W, H = size
196
+ else:
197
+ # => if size is int, we rescale the image to fit the shortest side to size
198
+ # => if size is None, no rescaling is applied
199
+ W, H = get_wh_with_fixed_shortest_side(w, h, size)
200
+ W, H = (
201
+ math.floor(W / size_stride + 0.5) * size_stride,
202
+ math.floor(H / size_stride + 0.5) * size_stride,
203
+ )
204
+
205
+ rfs = get_resizing_factor((math.floor(H * scale), math.floor(W * scale)), (h, w))
206
+ resize_size = rh, rw = [int(np.ceil(rfs * s)) for s in (h, w)]
207
+ image = torch.nn.functional.interpolate(
208
+ image, resize_size, mode="area", antialias=False
209
+ )
210
+ if scale < 1.0:
211
+ pw = math.ceil((W - resize_size[1]) * 0.5)
212
+ ph = math.ceil((H - resize_size[0]) * 0.5)
213
+ image = F.pad(image, (pw, pw, ph, ph), "constant", 1.0)
214
+
215
+ cy_center = int(center[1] * image.shape[-2])
216
+ cx_center = int(center[0] * image.shape[-1])
217
+ if center_crop:
218
+ side = min(H, W)
219
+ ct = max(0, cy_center - side // 2)
220
+ cl = max(0, cx_center - side // 2)
221
+ ct = min(ct, image.shape[-2] - side)
222
+ cl = min(cl, image.shape[-1] - side)
223
+ image = TF.crop(image, top=ct, left=cl, height=side, width=side)
224
+ else:
225
+ ct = max(0, cy_center - H // 2)
226
+ cl = max(0, cx_center - W // 2)
227
+ ct = min(ct, image.shape[-2] - H)
228
+ cl = min(cl, image.shape[-1] - W)
229
+ image = TF.crop(image, top=ct, left=cl, height=H, width=W)
230
+
231
+ if K is not None:
232
+ K = K.clone()
233
+ if torch.all(K[:2, -1] >= 0) and torch.all(K[:2, -1] <= 1):
234
+ K[:2] *= K.new_tensor([rw, rh])[:, None] # normalized K
235
+ else:
236
+ K[:2] *= K.new_tensor([rw / w, rh / h])[:, None] # unnormalized K
237
+ K[:2, 2] -= K.new_tensor([cl, ct])
238
+
239
+ if image_as_tensor:
240
+ # tensor of shape (1, 3, H, W) with values ranging from (-1, 1)
241
+ image = image.to(device) * 2.0 - 1.0
242
+ else:
243
+ # PIL Image with values ranging from (0, 255)
244
+ image = image.permute(0, 2, 3, 1).numpy()[0]
245
+ image = Image.fromarray((image * 255).astype(np.uint8))
246
+ return image, K
247
+
248
+
249
+ def transform_img_and_K(
250
+ image: torch.Tensor,
251
+ size: Union[int, Tuple[int, int]],
252
+ scale: float = 1.0,
253
+ center: Tuple[float, float] = (0.5, 0.5),
254
+ K: torch.Tensor | None = None,
255
+ size_stride: int = 1,
256
+ mode: str = "crop",
257
+ ):
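+ # Same size/intrinsics bookkeeping as `load_img_and_K`, but for an image tensor
+ # that is already in memory; `mode` chooses between cropping, padding, and
+ # stretching to reach the target resolution.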
258
+ assert mode in [
259
+ "crop",
260
+ "pad",
261
+ "stretch",
262
+ ], f"mode should be one of ['crop', 'pad', 'stretch'], got {mode}"
263
+
264
+ h, w = image.shape[-2:]
265
+ if isinstance(size, (tuple, list)):
266
+ # => if size is a tuple or list, we first rescale to fully cover the `size`
267
+ # area and then crop the `size` area from the rescaled image
268
+ W, H = size
269
+ else:
270
+ # => if size is int, we rescale the image to fit the shortest side to size
271
+ # => if size is None, no rescaling is applied
272
+ W, H = get_wh_with_fixed_shortest_side(w, h, size)
273
+ W, H = (
274
+ math.floor(W / size_stride + 0.5) * size_stride,
275
+ math.floor(H / size_stride + 0.5) * size_stride,
276
+ )
277
+
278
+ if mode == "stretch":
279
+ rh, rw = H, W
280
+ else:
281
+ rfs = get_resizing_factor(
282
+ (H, W),
283
+ (h, w),
284
+ cover_target=mode != "pad",
285
+ )
286
+ (rh, rw) = [int(np.ceil(rfs * s)) for s in (h, w)]
287
+
288
+ rh, rw = int(rh / scale), int(rw / scale)
289
+ image = torch.nn.functional.interpolate(
290
+ image, (rh, rw), mode="area", antialias=False
291
+ )
292
+
293
+ cy_center = int(center[1] * image.shape[-2])
294
+ cx_center = int(center[0] * image.shape[-1])
295
+ if mode != "pad":
296
+ ct = max(0, cy_center - H // 2)
297
+ cl = max(0, cx_center - W // 2)
298
+ ct = min(ct, image.shape[-2] - H)
299
+ cl = min(cl, image.shape[-1] - W)
300
+ image = TF.crop(image, top=ct, left=cl, height=H, width=W)
301
+ pl, pt = 0, 0
302
+ else:
303
+ pt = max(0, H // 2 - cy_center)
304
+ pl = max(0, W // 2 - cx_center)
305
+ pb = max(0, H - pt - image.shape[-2])
306
+ pr = max(0, W - pl - image.shape[-1])
307
+ image = TF.pad(
308
+ image,
309
+ [pl, pt, pr, pb],
310
+ )
311
+ cl, ct = 0, 0
312
+
313
+ if K is not None:
314
+ K = K.clone()
315
+ # K[:, :2, 2] += K.new_tensor([pl, pt])
316
+ if torch.all(K[:, :2, -1] >= 0) and torch.all(K[:, :2, -1] <= 1):
317
+ K[:, :2] *= K.new_tensor([rw, rh])[None, :, None] # normalized K
318
+ else:
319
+ K[:, :2] *= K.new_tensor([rw / w, rh / h])[None, :, None] # unnormalized K
320
+ K[:, :2, 2] += K.new_tensor([pl - cl, pt - ct])
321
+
322
+ return image, K
323
+
324
+
325
+ lowvram_mode = False
326
+
327
+
328
+ def set_lowvram_mode(mode):
329
+ global lowvram_mode
330
+ lowvram_mode = mode
331
+
332
+
333
+ def load_model(model, device: str = "cuda"):
334
+ model.to(device)
335
+
336
+
337
+ def unload_model(model):
338
+ global lowvram_mode
339
+ if lowvram_mode:
340
+ model.cpu()
341
+ torch.cuda.empty_cache()
342
+
343
+
344
+ def infer_prior_stats(
345
+ T,
346
+ num_input_frames,
347
+ num_total_frames,
348
+ version_dict,
349
+ ):
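+ # Estimate how many prior (anchor) frames the first pass should produce so the
+ # second pass can cover all target frames given the chunking strategy, updating
+ # version_dict["T"] (the per-pass context lengths) in place when necessary.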
350
+ options = version_dict["options"]
351
+ chunk_strategy = options.get("chunk_strategy", "nearest")
352
+ T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
353
+ T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
354
+ # get traj_prior_c2ws for 2-pass sampling
355
+ if chunk_strategy.startswith("interp"):
356
+ # Start and end have already taken up two slots.
357
+ # The +1 means we need X + 1 prior frames to bound X forward passes covering all test frames.
358
+
359
+ # Tuning up `num_prior_frames_ratio` is helpful when you observe a sudden jump in the
360
+ # generated frames due to insufficient prior frames. This option is effective for
361
+ # complicated trajectories and when the `interp` strategy is used (usually the semi-dense-view
362
+ # regime). Recommended range is [1.0 (default), 1.5].
363
+ if num_input_frames >= options.get("num_input_semi_dense", 9):
364
+ num_prior_frames = (
365
+ math.ceil(
366
+ num_total_frames
367
+ / (T_second_pass - 2)
368
+ * options.get("num_prior_frames_ratio", 1.0)
369
+ )
370
+ + 1
371
+ )
372
+
373
+ if num_prior_frames + num_input_frames < T_first_pass:
374
+ num_prior_frames = T_first_pass - num_input_frames
375
+
376
+ num_prior_frames = max(
377
+ num_prior_frames,
378
+ options.get("num_prior_frames", 0),
379
+ )
380
+
381
+ T_first_pass = num_prior_frames + num_input_frames
382
+
383
+ if "gt" in chunk_strategy:
384
+ T_second_pass = T_second_pass + num_input_frames
385
+
386
+ # Dynamically update context window length.
387
+ version_dict["T"] = [T_first_pass, T_second_pass]
388
+
389
+ else:
390
+ num_prior_frames = (
391
+ math.ceil(
392
+ num_total_frames
393
+ / (
394
+ T_second_pass
395
+ - 2
396
+ - (num_input_frames if "gt" in chunk_strategy else 0)
397
+ )
398
+ * options.get("num_prior_frames_ratio", 1.0)
399
+ )
400
+ + 1
401
+ )
402
+
403
+ if num_prior_frames + num_input_frames < T_first_pass:
404
+ num_prior_frames = T_first_pass - num_input_frames
405
+
406
+ num_prior_frames = max(
407
+ num_prior_frames,
408
+ options.get("num_prior_frames", 0),
409
+ )
410
+ else:
411
+ num_prior_frames = max(
412
+ T_first_pass - num_input_frames,
413
+ options.get("num_prior_frames", 0),
414
+ )
415
+
416
+ if num_input_frames >= options.get("num_input_semi_dense", 9):
417
+ T_first_pass = num_prior_frames + num_input_frames
418
+
419
+ # Dynamically update context window length.
420
+ version_dict["T"] = [T_first_pass, T_second_pass]
421
+
422
+ return num_prior_frames
423
+
424
+
425
+ def infer_prior_inds(
426
+ c2ws,
427
+ num_prior_frames,
428
+ input_frame_indices,
429
+ options,
430
+ ):
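+ # Choose which target-view indices serve as prior anchors: evenly spaced along
+ # the trajectory for `interp*` strategies, otherwise greedily picking the view
+ # farthest from everything selected so far.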
431
+ chunk_strategy = options.get("chunk_strategy", "nearest")
432
+ if chunk_strategy.startswith("interp"):
433
+ prior_frame_indices = np.array(
434
+ [i for i in range(c2ws.shape[0]) if i not in input_frame_indices]
435
+ )
436
+ prior_frame_indices = prior_frame_indices[
437
+ np.ceil(
438
+ np.linspace(
439
+ 0, prior_frame_indices.shape[0] - 1, num_prior_frames, endpoint=True
440
+ )
441
+ ).astype(int)
442
+ ] # having a ceil here is actually safer for corner cases
443
+ else:
444
+ prior_frame_indices = []
445
+ while len(prior_frame_indices) < num_prior_frames:
446
+ closest_distance = np.abs(
447
+ np.arange(c2ws.shape[0])[None]
448
+ - np.concatenate(
449
+ [np.array(input_frame_indices), np.array(prior_frame_indices)]
450
+ )[:, None]
451
+ ).min(0)
452
+ prior_frame_indices.append(np.argsort(closest_distance)[-1])
453
+ return np.sort(prior_frame_indices)
454
+
455
+
456
+ def compute_relative_inds(
457
+ source_inds,
458
+ target_inds,
459
+ ):
460
+ assert len(source_inds) > 2
461
+ # compute relative indices of target_inds within source_inds
462
+ relative_inds = []
463
+ for ind in target_inds:
464
+ if ind in source_inds:
465
+ relative_ind = int(np.where(source_inds == ind)[0][0])
466
+ elif ind < source_inds[0]:
467
+ # extrapolate
468
+ relative_ind = -((source_inds[0] - ind) / (source_inds[1] - source_inds[0]))
469
+ elif ind > source_inds[-1]:
470
+ # extrapolate
471
+ relative_ind = len(source_inds) + (
472
+ (ind - source_inds[-1]) / (source_inds[-1] - source_inds[-2])
473
+ )
474
+ else:
475
+ # interpolate
476
+ lower_inds = source_inds[source_inds < ind]
477
+ upper_inds = source_inds[source_inds > ind]
478
+ if len(lower_inds) > 0 and len(upper_inds) > 0:
479
+ lower_ind = lower_inds[-1]
480
+ upper_ind = upper_inds[0]
481
+ relative_lower_ind = int(np.where(source_inds == lower_ind)[0][0])
482
+ relative_upper_ind = int(np.where(source_inds == upper_ind)[0][0])
483
+ relative_ind = relative_lower_ind + (ind - lower_ind) / (
484
+ upper_ind - lower_ind
485
+ ) * (relative_upper_ind - relative_lower_ind)
486
+ else:
487
+ # Out of range
488
+ relative_ind = float("nan")  # out of range; record NaN as a placeholder
489
+ relative_inds.append(relative_ind)
490
+ return relative_inds
491
+
492
+
493
+ def find_nearest_source_inds(
494
+ source_c2ws,
495
+ target_c2ws,
496
+ nearest_num=1,
497
+ mode="translation",
498
+ ):
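+ # For each target camera, return the indices of the `nearest_num` closest source
+ # cameras under the chosen distance mode ("rotation" or "translation").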
499
+ dists = get_camera_dist(source_c2ws, target_c2ws, mode=mode).cpu().numpy()
500
+ sorted_inds = np.argsort(dists, axis=0).T
501
+ return sorted_inds[:, :nearest_num]
502
+
503
+
504
+ def chunk_input_and_test(
505
+ T,
506
+ input_c2ws,
507
+ test_c2ws,
508
+ input_ords, # orders
509
+ test_ords, # orders
510
+ options,
511
+ task: str = "img2img",
512
+ chunk_strategy: str = "gt",
513
+ gt_input_inds: list = [],
514
+ ):
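+ # Partition input and test views into chunks of length T according to
+ # `chunk_strategy`. Each chunk entry is "!xxx" (input/conditioning view),
+ # ">xxx" (test view), or "NULL" (padding), e.g.
+ # ["!000", "!001", ">000", ">001", ..., "NULL"]; the returned per-chunk lists
+ # record each view's index in the raw sequence and its slot inside the chunk.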
515
+ M, N = input_c2ws.shape[0], test_c2ws.shape[0]
516
+
517
+ chunks = []
518
+ if chunk_strategy.startswith("gt"):
519
+ assert len(gt_input_inds) < T, (
520
+ f"Number of gt input frames {len(gt_input_inds)} should be "
521
+ f"less than {T} when `gt` chunking strategy is used."
522
+ )
523
+ assert (
524
+ list(range(M)) == gt_input_inds
525
+ ), "All input_c2ws should be gt when `gt` chunking strategy is used."
526
+
527
+ # LEGACY CHUNKING STRATEGY
528
+ # num_test_per_chunk = T - len(gt_input_inds)
529
+ # test_inds_per_chunk = [i for i in range(T) if i not in gt_input_inds]
530
+ # for i in range(0, test_c2ws.shape[0], num_test_per_chunk):
531
+ # chunk = ["NULL"] * T
532
+ # for j, k in enumerate(gt_input_inds):
533
+ # chunk[k] = f"!{j:03d}"
534
+ # for j, k in enumerate(
535
+ # test_inds_per_chunk[: test_c2ws[i : i + num_test_per_chunk].shape[0]]
536
+ # ):
537
+ # chunk[k] = f">{i + j:03d}"
538
+ # chunks.append(chunk)
539
+
540
+ num_test_seen = 0
541
+ while num_test_seen < N:
542
+ chunk = [f"!{i:03d}" for i in gt_input_inds]
543
+ if chunk_strategy != "gt" and num_test_seen > 0:
544
+ pseudo_num_ratio = options.get("pseudo_num_ratio", 0.33)
545
+ if (N - num_test_seen) >= math.floor(
546
+ (T - len(gt_input_inds)) * pseudo_num_ratio
547
+ ):
548
+ pseudo_num = math.ceil((T - len(gt_input_inds)) * pseudo_num_ratio)
549
+ else:
550
+ pseudo_num = (T - len(gt_input_inds)) - (N - num_test_seen)
551
+ pseudo_num = min(pseudo_num, options.get("pseudo_num_max", 10000))
552
+
553
+ if "ltr" in chunk_strategy:
554
+ chunk.extend(
555
+ [
556
+ f"!{i + len(gt_input_inds):03d}"
557
+ for i in range(num_test_seen - pseudo_num, num_test_seen)
558
+ ]
559
+ )
560
+ elif "nearest" in chunk_strategy:
561
+ source_inds = np.concatenate(
562
+ [
563
+ find_nearest_source_inds(
564
+ test_c2ws[:num_test_seen],
565
+ test_c2ws[num_test_seen:],
566
+ nearest_num=1, # pseudo_num,
567
+ mode="rotation",
568
+ ),
569
+ find_nearest_source_inds(
570
+ test_c2ws[:num_test_seen],
571
+ test_c2ws[num_test_seen:],
572
+ nearest_num=1, # pseudo_num,
573
+ mode="translation",
574
+ ),
575
+ ],
576
+ axis=1,
577
+ )
578
+ ####### [HACK ALERT] keep running until pseudo num is stablized ########
579
+ temp_pseudo_num = pseudo_num
580
+ while True:
581
+ nearest_source_inds = np.concatenate(
582
+ [
583
+ np.sort(
584
+ [
585
+ ind
586
+ for (ind, _) in collections.Counter(
587
+ [
588
+ item
589
+ for item in source_inds[
590
+ : T
591
+ - len(gt_input_inds)
592
+ - temp_pseudo_num
593
+ ]
594
+ .flatten()
595
+ .tolist()
596
+ if item
597
+ != (
598
+ num_test_seen - 1
599
+ ) # exclude the last one here
600
+ ]
601
+ ).most_common(pseudo_num - 1)
602
+ ],
603
+ ).astype(int),
604
+ [num_test_seen - 1], # always keep the last one
605
+ ]
606
+ )
607
+ if len(nearest_source_inds) >= temp_pseudo_num:
608
+ break # stabilized
609
+ else:
610
+ temp_pseudo_num = len(nearest_source_inds)
611
+ pseudo_num = len(nearest_source_inds)
612
+ ########################################################################
613
+ chunk.extend(
614
+ [f"!{i + len(gt_input_inds):03d}" for i in nearest_source_inds]
615
+ )
616
+ else:
617
+ raise NotImplementedError(
618
+ f"Chunking strategy {chunk_strategy} for the first pass is not implemented."
619
+ )
620
+
621
+ chunk.extend(
622
+ [
623
+ f">{i:03d}"
624
+ for i in range(
625
+ num_test_seen,
626
+ min(num_test_seen + T - len(gt_input_inds) - pseudo_num, N),
627
+ )
628
+ ]
629
+ )
630
+ else:
631
+ chunk.extend(
632
+ [
633
+ f">{i:03d}"
634
+ for i in range(
635
+ num_test_seen,
636
+ min(num_test_seen + T - len(gt_input_inds), N),
637
+ )
638
+ ]
639
+ )
640
+
641
+ num_test_seen += sum([1 for c in chunk if c.startswith(">")])
642
+ if len(chunk) < T:
643
+ chunk.extend(["NULL"] * (T - len(chunk)))
644
+ chunks.append(chunk)
645
+
646
+ elif chunk_strategy.startswith("nearest"):
647
+ input_imgs = np.array([f"!{i:03d}" for i in range(M)])
648
+ test_imgs = np.array([f">{i:03d}" for i in range(N)])
649
+
650
+ match = re.match(r"^nearest-(\d+)$", chunk_strategy)
651
+ if match:
652
+ nearest_num = int(match.group(1))
653
+ assert (
654
+ nearest_num < T
655
+ ), f"Nearest number of {nearest_num} should be less than {T}."
656
+ source_inds = find_nearest_source_inds(
657
+ input_c2ws,
658
+ test_c2ws,
659
+ nearest_num=nearest_num,
660
+ mode="translation", # during the second pass, consider translation only is enough
661
+ )
662
+
663
+ for i in range(0, N, T - nearest_num):
664
+ nearest_source_inds = np.sort(
665
+ [
666
+ ind
667
+ for (ind, _) in collections.Counter(
668
+ source_inds[i : i + T - nearest_num].flatten().tolist()
669
+ ).most_common(nearest_num)
670
+ ]
671
+ )
672
+ chunk = (
673
+ input_imgs[nearest_source_inds].tolist()
674
+ + test_imgs[i : i + T - nearest_num].tolist()
675
+ )
676
+ chunks.append(chunk + ["NULL"] * (T - len(chunk)))
677
+
678
+ else:
679
+ # do not always condition on gt cond frames
680
+ if "gt" not in chunk_strategy:
681
+ gt_input_inds = []
682
+
683
+ source_inds = find_nearest_source_inds(
684
+ input_c2ws,
685
+ test_c2ws,
686
+ nearest_num=1,
687
+ mode="translation", # during the second pass, consider translation only is enough
688
+ )[:, 0]
689
+
690
+ test_inds_per_input = {}
691
+ for test_idx, input_idx in enumerate(source_inds):
692
+ if input_idx not in test_inds_per_input:
693
+ test_inds_per_input[input_idx] = []
694
+ test_inds_per_input[input_idx].append(test_idx)
695
+
696
+ num_test_seen = 0
697
+ chunk = input_imgs[gt_input_inds].tolist()
698
+ candidate_input_inds = sorted(list(test_inds_per_input.keys()))
699
+
700
+ while num_test_seen < N:
701
+ input_idx = candidate_input_inds[0]
702
+ test_inds = test_inds_per_input[input_idx]
703
+ input_is_cond = input_idx in gt_input_inds
704
+ prefix_inds = [] if input_is_cond else [input_idx]
705
+
706
+ if len(chunk) == T - len(prefix_inds) or not candidate_input_inds:
707
+ if chunk:
708
+ chunk += ["NULL"] * (T - len(chunk))
709
+ chunks.append(chunk)
710
+ chunk = input_imgs[gt_input_inds].tolist()
711
+ if num_test_seen >= N:
712
+ break
713
+ continue
714
+
715
+ candidate_chunk = (
716
+ input_imgs[prefix_inds].tolist() + test_imgs[test_inds].tolist()
717
+ )
718
+
719
+ space_left = T - len(chunk)
720
+ if len(candidate_chunk) <= space_left:
721
+ chunk.extend(candidate_chunk)
722
+ num_test_seen += len(test_inds)
723
+ candidate_input_inds.pop(0)
724
+ else:
725
+ chunk.extend(candidate_chunk[:space_left])
726
+ num_input_idx = 0 if input_is_cond else 1
727
+ num_test_seen += space_left - num_input_idx
728
+ test_inds_per_input[input_idx] = test_inds[
729
+ space_left - num_input_idx :
730
+ ]
731
+
732
+ if len(chunk) == T:
733
+ chunks.append(chunk)
734
+ chunk = input_imgs[gt_input_inds].tolist()
735
+
736
+ if chunk and chunk != input_imgs[gt_input_inds].tolist():
737
+ chunks.append(chunk + ["NULL"] * (T - len(chunk)))
738
+
739
+ elif chunk_strategy.startswith("interp"):
740
+ # `interp` chunk requires ordering info
741
+ assert input_ords is not None and test_ords is not None, (
742
+ "When using `interp` chunking strategy, ordering of input "
743
+ "and test frames should be provided."
744
+ )
745
+
746
+ # if chunk_strategy is `interp*` and task is `img2trajvid*`, we will not
747
+ # use input views since their order info within target views is unknown
748
+ if "img2trajvid" in task:
749
+ assert (
750
+ list(range(len(gt_input_inds))) == gt_input_inds
751
+ ), "`img2trajvid` task should put `gt_input_inds` in start."
752
+ input_c2ws = input_c2ws[
753
+ [ind for ind in range(M) if ind not in gt_input_inds]
754
+ ]
755
+ input_ords = [
756
+ input_ords[ind] for ind in range(M) if ind not in gt_input_inds
757
+ ]
758
+ M = input_c2ws.shape[0]
759
+
760
+ input_ords = [0] + input_ords # this is a hack accounting for test views
761
+ # before the first input view
762
+ input_ords[-1] += 0.01 # this is a hack ensuring last test stop is included
763
+ # in the last forward when input_ords[-1] == test_ords[-1]
764
+ input_ords = np.array(input_ords)[:, None]
765
+ input_ords_ = np.concatenate([input_ords[1:], np.full((1, 1), np.inf)])
766
+ test_ords = np.array(test_ords)[None]
767
+
768
+ in_stop_ranges = np.logical_and(
769
+ np.repeat(input_ords, N, axis=1) <= np.repeat(test_ords, M + 1, axis=0),
770
+ np.repeat(input_ords_, N, axis=1) > np.repeat(test_ords, M + 1, axis=0),
771
+ ) # (M, N)
772
+ assert (in_stop_ranges.sum(1) <= T - 2).all(), (
773
+ "More input frames need to be sampled during the first pass to ensure "
774
+ f"#test frames during each forard in the second pass will not exceed {T - 2}."
775
+ )
776
+ if input_ords[1, 0] <= test_ords[0, 0]:
777
+ assert not in_stop_ranges[0].any()
778
+ if input_ords[-1, 0] >= test_ords[0, -1]:
779
+ assert not in_stop_ranges[-1].any()
780
+
781
+ gt_chunk = (
782
+ [f"!{i:03d}" for i in gt_input_inds] if "gt" in chunk_strategy else []
783
+ )
784
+ chunk = gt_chunk + []
785
+ # any test views before the first input views
786
+ if in_stop_ranges[0].any():
787
+ for j, in_range in enumerate(in_stop_ranges[0]):
788
+ if in_range:
789
+ chunk.append(f">{j:03d}")
790
+ in_stop_ranges = in_stop_ranges[1:]
791
+
792
+ i = 0
793
+ base_i = len(gt_input_inds) if "img2trajvid" in task else 0
794
+ chunk.append(f"!{i + base_i:03d}")
795
+ while i < len(in_stop_ranges):
796
+ in_stop_range = in_stop_ranges[i]
797
+ if not in_stop_range.any():
798
+ i += 1
799
+ continue
800
+
801
+ input_left = i + 1 < M
802
+ space_left = T - len(chunk)
803
+ if sum(in_stop_range) + input_left <= space_left:
804
+ for j, in_range in enumerate(in_stop_range):
805
+ if in_range:
806
+ chunk.append(f">{j:03d}")
807
+ i += 1
808
+ if input_left:
809
+ chunk.append(f"!{i + base_i:03d}")
810
+
811
+ else:
812
+ chunk += ["NULL"] * space_left
813
+ chunks.append(chunk)
814
+ chunk = gt_chunk + [f"!{i + base_i:03d}"]
815
+
816
+ if len(chunk) > 1:
817
+ chunk += ["NULL"] * (T - len(chunk))
818
+ chunks.append(chunk)
819
+
820
+ else:
821
+ raise NotImplementedError
822
+
823
+ (
824
+ input_inds_per_chunk,
825
+ input_sels_per_chunk,
826
+ test_inds_per_chunk,
827
+ test_sels_per_chunk,
828
+ ) = (
829
+ [],
830
+ [],
831
+ [],
832
+ [],
833
+ )
834
+ for chunk in chunks:
835
+ input_inds = [
836
+ int(img.removeprefix("!")) for img in chunk if img.startswith("!")
837
+ ]
838
+ input_sels = [chunk.index(img) for img in chunk if img.startswith("!")]
839
+ test_inds = [int(img.removeprefix(">")) for img in chunk if img.startswith(">")]
840
+ test_sels = [chunk.index(img) for img in chunk if img.startswith(">")]
841
+ input_inds_per_chunk.append(input_inds)
842
+ input_sels_per_chunk.append(input_sels)
843
+ test_inds_per_chunk.append(test_inds)
844
+ test_sels_per_chunk.append(test_sels)
845
+
846
+ if options.get("sampler_verbose", True):
847
+
848
+ def colorize(item):
849
+ if item.startswith("!"):
850
+ return f"{Fore.RED}{item}{Style.RESET_ALL}" # Red for items starting with '!'
851
+ elif item.startswith(">"):
852
+ return f"{Fore.GREEN}{item}{Style.RESET_ALL}" # Green for items starting with '>'
853
+ return item # Default color if neither '!' nor '>'
854
+
855
+ print("\nchunks:")
856
+ for chunk in chunks:
857
+ print(", ".join(colorize(item) for item in chunk))
858
+
859
+ return (
860
+ chunks,
861
+ input_inds_per_chunk, # ordering of input in raw sequence
862
+ input_sels_per_chunk, # ordering of input in one-forward sequence of length T
863
+ test_inds_per_chunk, # ordering of test in raw sequence
864
+ test_sels_per_chunk, # ordering of test in one-forward sequence of length T
865
+ )
866
+
867
+
868
+ def is_k_in_dict(d, k):
869
+ return any(map(lambda x: x.startswith(k), d.keys()))
870
+
871
+
872
+ def get_k_from_dict(d, k):
873
+ media_d = {}
874
+ for key, value in d.items():
875
+ if key == k:
876
+ return value
877
+ if key.startswith(k):
878
+ media = key.split("/")[-1]
879
+ if media == "raw":
880
+ return value
881
+ media_d[media] = value
882
+ if len(media_d) == 0:
883
+ return torch.tensor([])
884
+ assert (
885
+ len(media_d) == 1
886
+ ), f"multiple media found in {d} for key {k}: {media_d.keys()}"
887
+ return media_d[media]
888
+
889
+
890
+ def update_kv_for_dict(d, k, v):
891
+ for key in d.keys():
892
+ if key.startswith(k):
893
+ d[key] = v
894
+ return d
895
+
896
+
897
+ def extend_dict(ds, d):
898
+ for key in d.keys():
899
+ if key in ds:
900
+ ds[key] = torch.cat([ds[key], d[key]], 0)
901
+ else:
902
+ ds[key] = d[key]
903
+ return ds
904
+
905
+
906
+ def replace_or_include_input_for_dict(
907
+ samples,
908
+ test_indices,
909
+ imgs,
910
+ c2w,
911
+ K,
912
+ ):
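+ # Write the sampled rgb/c2w/intrinsics values back into the full-length buffers
+ # at the test indices, so the returned dict always covers the whole sequence.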
913
+ samples_new = {}
914
+ for sample, value in samples.items():
915
+ if "rgb" in sample:
916
+ imgs[test_indices] = (
917
+ value[test_indices] if value.shape[0] == imgs.shape[0] else value
918
+ ).to(device=imgs.device, dtype=imgs.dtype)
919
+ samples_new[sample] = imgs
920
+ elif "c2w" in sample:
921
+ c2w[test_indices] = (
922
+ value[test_indices] if value.shape[0] == c2w.shape[0] else value
923
+ ).to(device=c2w.device, dtype=c2w.dtype)
924
+ samples_new[sample] = c2w
925
+ elif "intrinsics" in sample:
926
+ K[test_indices] = (
927
+ value[test_indices] if value.shape[0] == K.shape[0] else value
928
+ ).to(device=K.device, dtype=K.dtype)
929
+ samples_new[sample] = K
930
+ else:
931
+ samples_new[sample] = value
932
+ return samples_new
933
+
934
+
935
+ def decode_output(
936
+ samples,
937
+ T,
938
+ indices=None,
939
+ ):
940
+ # decode model output into dict if it is not
941
+ if isinstance(samples, dict):
942
+ # model with postprocessor and outputs dict
943
+ for sample, value in samples.items():
944
+ if isinstance(value, torch.Tensor):
945
+ value = value.detach().cpu()
946
+ elif isinstance(value, np.ndarray):
947
+ value = torch.from_numpy(value)
948
+ else:
949
+ value = torch.tensor(value)
950
+
951
+ if indices is not None and value.shape[0] == T:
952
+ value = value[indices]
953
+ samples[sample] = value
954
+ else:
955
+ # model without postprocessor and outputs tensor (rgb)
956
+ samples = samples.detach().cpu()
957
+
958
+ if indices is not None and samples.shape[0] == T:
959
+ samples = samples[indices]
960
+ samples = {"samples-rgb/image": samples}
961
+
962
+ return samples
963
+
964
+
965
+ def save_output(
966
+ samples,
967
+ save_path,
968
+ video_save_fps=2,
969
+ ):
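+ # Save each entry of `samples` according to its media-type suffix: "image" writes
+ # per-frame PNGs plus an mp4, "video" writes an mp4, and "raw" saves a .pt tensor.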
970
+ os.makedirs(save_path, exist_ok=True)
971
+ for sample in samples:
972
+ media_type = "video"
973
+ if "/" in sample:
974
+ sample_, media_type = sample.split("/")
975
+ else:
976
+ sample_ = sample
977
+
978
+ value = samples[sample]
979
+ if isinstance(value, torch.Tensor):
980
+ value = value.detach().cpu()
981
+ elif isinstance(value, np.ndarray):
982
+ value = torch.from_numpy(value)
983
+ else:
984
+ value = torch.tensor(value)
985
+
986
+ if media_type == "image":
987
+ value = (value.permute(0, 2, 3, 1) + 1) / 2.0
988
+ value = (value * 255).clamp(0, 255).to(torch.uint8)
989
+ iio.imwrite(
990
+ os.path.join(save_path, f"{sample_}.mp4")
991
+ if sample_
992
+ else f"{save_path}.mp4",
993
+ value,
994
+ fps=video_save_fps,
995
+ macro_block_size=1,
996
+ ffmpeg_log_level="error",
997
+ )
998
+ os.makedirs(os.path.join(save_path, sample_), exist_ok=True)
999
+ for i, s in enumerate(value):
1000
+ iio.imwrite(
1001
+ os.path.join(save_path, sample_, f"{i:03d}.png"),
1002
+ s,
1003
+ )
1004
+ elif media_type == "video":
1005
+ value = (value.permute(0, 2, 3, 1) + 1) / 2.0
1006
+ value = (value * 255).clamp(0, 255).to(torch.uint8)
1007
+ iio.imwrite(
1008
+ os.path.join(save_path, f"{sample_}.mp4"),
1009
+ value,
1010
+ fps=video_save_fps,
1011
+ macro_block_size=1,
1012
+ ffmpeg_log_level="error",
1013
+ )
1014
+ elif media_type == "raw":
1015
+ torch.save(
1016
+ value,
1017
+ os.path.join(save_path, f"{sample_}.pt"),
1018
+ )
1019
+ else:
1020
+ pass
1021
+
1022
+
1023
+ def create_transforms_simple(save_path, img_paths, img_whs, c2ws, Ks):
1024
+ import os.path as osp
1025
+
1026
+ out_frames = []
1027
+ for img_path, img_wh, c2w, K in zip(img_paths, img_whs, c2ws, Ks):
1028
+ out_frame = {
1029
+ "fl_x": K[0][0].item(),
1030
+ "fl_y": K[1][1].item(),
1031
+ "cx": K[0][2].item(),
1032
+ "cy": K[1][2].item(),
1033
+ "w": img_wh[0].item(),
1034
+ "h": img_wh[1].item(),
1035
+ "file_path": f"./{osp.relpath(img_path, start=save_path)}"
1036
+ if img_path is not None
1037
+ else None,
1038
+ "transform_matrix": c2w.tolist(),
1039
+ }
1040
+ out_frames.append(out_frame)
1041
+ out = {
1042
+ # "camera_model": "PINHOLE",
1043
+ "orientation_override": "none",
1044
+ "frames": out_frames,
1045
+ }
1046
+ with open(osp.join(save_path, "transforms.json"), "w") as of:
1047
+ json.dump(out, of, indent=5)
1048
+
1049
+
1050
+ class GradioTrackedSampler(EulerEDMSampler):
1051
+ """
1052
+ A thin wrapper around the EulerEDMSampler that allows tracking progress and
1053
+ aborting sampling for the gradio demo.
1054
+ """
1055
+
1056
+ def __init__(self, abort_event: threading.Event, *args, **kwargs):
1057
+ super().__init__(*args, **kwargs)
1058
+ self.abort_event = abort_event
1059
+
1060
+ def __call__( # type: ignore
1061
+ self,
1062
+ denoiser,
1063
+ x: torch.Tensor,
1064
+ scale: float | torch.Tensor,
1065
+ cond: dict,
1066
+ uc: dict | None = None,
1067
+ num_steps: int | None = None,
1068
+ verbose: bool = True,
1069
+ global_pbar: gr.Progress | None = None,
1070
+ **guider_kwargs,
1071
+ ) -> torch.Tensor | None:
1072
+ uc = cond if uc is None else uc
1073
+ x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
1074
+ x,
1075
+ cond,
1076
+ uc,
1077
+ num_steps,
1078
+ )
1079
+ for i in self.get_sigma_gen(num_sigmas, verbose=verbose):
1080
+ gamma = (
1081
+ min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
1082
+ if self.s_tmin <= sigmas[i] <= self.s_tmax
1083
+ else 0.0
1084
+ )
1085
+ x = self.sampler_step(
1086
+ s_in * sigmas[i],
1087
+ s_in * sigmas[i + 1],
1088
+ denoiser,
1089
+ x,
1090
+ scale,
1091
+ cond,
1092
+ uc,
1093
+ gamma,
1094
+ **guider_kwargs,
1095
+ )
1096
+ # Allow tracking progress in gradio demo.
1097
+ if global_pbar is not None:
1098
+ global_pbar.update()
1099
+ # Allow aborting sampling in gradio demo.
1100
+ if self.abort_event.is_set():
1101
+ return None
1102
+ return x
1103
+
1104
+
1105
+ def create_samplers(
1106
+ guider_types: int | list[int],
1107
+ discretization,
1108
+ num_frames: list[int] | None,
1109
+ num_steps: int,
1110
+ cfg_min: float = 1.0,
1111
+ device: str | torch.device = "cuda",
1112
+ abort_event: threading.Event | None = None,
1113
+ ):
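+ # Build one sampler per requested guider type (0: vanilla CFG, 1: multiview CFG,
+ # 2: multiview temporal CFG); a GradioTrackedSampler is used instead of the plain
+ # EulerEDMSampler when an abort_event is supplied so the demo can cancel sampling.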
1114
+ guider_mapping = {
1115
+ 0: VanillaCFG,
1116
+ 1: MultiviewCFG,
1117
+ 2: MultiviewTemporalCFG,
1118
+ }
1119
+ samplers = []
1120
+ if not isinstance(guider_types, (list, tuple)):
1121
+ guider_types = [guider_types]
1122
+ for i, guider_type in enumerate(guider_types):
1123
+ if guider_type not in guider_mapping:
1124
+ raise ValueError(
1125
+ f"Invalid guider type {guider_type}. Must be one of {list(guider_mapping.keys())}"
1126
+ )
1127
+ guider_cls = guider_mapping[guider_type]
1128
+ guider_args = ()
1129
+ if guider_type > 0:
1130
+ guider_args += (cfg_min,)
1131
+ if guider_type == 2:
1132
+ assert num_frames is not None
1133
+ guider_args = (num_frames[i], cfg_min)
1134
+ guider = guider_cls(*guider_args)
1135
+
1136
+ if abort_event is not None:
1137
+ sampler = GradioTrackedSampler(
1138
+ abort_event,
1139
+ discretization=discretization,
1140
+ guider=guider,
1141
+ num_steps=num_steps,
1142
+ s_churn=0.0,
1143
+ s_tmin=0.0,
1144
+ s_tmax=999.0,
1145
+ s_noise=1.0,
1146
+ verbose=True,
1147
+ device=device,
1148
+ )
1149
+ else:
1150
+ sampler = EulerEDMSampler(
1151
+ discretization=discretization,
1152
+ guider=guider,
1153
+ num_steps=num_steps,
1154
+ s_churn=0.0,
1155
+ s_tmin=0.0,
1156
+ s_tmax=999.0,
1157
+ s_noise=1.0,
1158
+ verbose=True,
1159
+ device=device,
1160
+ )
1161
+ samplers.append(sampler)
1162
+ return samplers
1163
+
1164
+
1165
+ def get_value_dict(
1166
+ curr_imgs,
1167
+ curr_imgs_clip,
1168
+ curr_input_frame_indices,
1169
+ curr_c2ws,
1170
+ curr_Ks,
1171
+ curr_input_camera_indices,
1172
+ all_c2ws,
1173
+ camera_scale,
1174
+ ):
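+ # Assemble the conditioning dict for one chunk: conditioning frames and masks,
+ # recentered and rescaled camera poses, intrinsics, and per-pixel plucker coordinates.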
1175
+ assert sorted(curr_input_camera_indices) == sorted(
1176
+ range(len(curr_input_camera_indices))
1177
+ )
1178
+ H, W, T, F = curr_imgs.shape[-2], curr_imgs.shape[-1], len(curr_imgs), 8
1179
+
1180
+ value_dict = {}
1181
+ value_dict["cond_frames_without_noise"] = curr_imgs_clip[curr_input_frame_indices]
1182
+ value_dict["cond_frames"] = curr_imgs + 0.0 * torch.randn_like(curr_imgs)
1183
+ value_dict["cond_frames_mask"] = torch.zeros(T, dtype=torch.bool)
1184
+ value_dict["cond_frames_mask"][curr_input_frame_indices] = True
1185
+ value_dict["cond_aug"] = 0.0
1186
+
1187
+ c2w = to_hom_pose(curr_c2ws.float())
1188
+ w2c = torch.linalg.inv(c2w)
1189
+
1190
+ # camera centering
1191
+ ref_c2ws = all_c2ws
1192
+ camera_dist_2med = torch.norm(
1193
+ ref_c2ws[:, :3, 3] - ref_c2ws[:, :3, 3].median(0, keepdim=True).values,
1194
+ dim=-1,
1195
+ )
1196
+ valid_mask = camera_dist_2med <= torch.clamp(
1197
+ torch.quantile(camera_dist_2med, 0.97) * 10,
1198
+ max=1e6,
1199
+ )
1200
+ c2w[:, :3, 3] -= ref_c2ws[valid_mask, :3, 3].mean(0, keepdim=True)
1201
+ w2c = torch.linalg.inv(c2w)
1202
+
1203
+ # camera normalization
1204
+ camera_dists = c2w[:, :3, 3].clone()
1205
+ translation_scaling_factor = (
1206
+ camera_scale
1207
+ if torch.isclose(
1208
+ torch.norm(camera_dists[0]),
1209
+ torch.zeros(1),
1210
+ atol=1e-5,
1211
+ ).any()
1212
+ else (camera_scale / torch.norm(camera_dists[0]))
1213
+ )
1214
+ w2c[:, :3, 3] *= translation_scaling_factor
1215
+ c2w[:, :3, 3] *= translation_scaling_factor
1216
+ value_dict["plucker_coordinate"], _ = get_plucker_coordinates(
1217
+ extrinsics_src=w2c[0],
1218
+ extrinsics=w2c,
1219
+ intrinsics=curr_Ks.float().clone(),
1220
+ mode="plucker",
1221
+ rel_zero_translation=True,
1222
+ target_size=(H // F, W // F),
1223
+ return_grid_cam=True,
1224
+ )
1225
+
1226
+ value_dict["c2w"] = c2w
1227
+ value_dict["K"] = curr_Ks
1228
+ value_dict["camera_mask"] = torch.zeros(T, dtype=torch.bool)
1229
+ value_dict["camera_mask"][curr_input_camera_indices] = True
1230
+
1231
+ return value_dict
1232
+
1233
+
1234
+ def do_sample(
1235
+ model,
1236
+ ae,
1237
+ conditioner,
1238
+ denoiser,
1239
+ sampler,
1240
+ value_dict,
1241
+ H,
1242
+ W,
1243
+ C,
1244
+ F,
1245
+ T,
1246
+ cfg,
1247
+ encoding_t=1,
1248
+ decoding_t=1,
1249
+ verbose=True,
1250
+ global_pbar=None,
1251
+ **_,
1252
+ ):
1253
+ imgs = value_dict["cond_frames"].to("cuda")
1254
+ input_masks = value_dict["cond_frames_mask"].to("cuda")
1255
+ pluckers = value_dict["plucker_coordinate"].to("cuda")
1256
+
1257
+ num_samples = [1, T]
1258
+ with torch.inference_mode(), torch.autocast("cuda"):
1259
+ load_model(ae)
1260
+ load_model(conditioner)
1261
+ latents = torch.nn.functional.pad(
1262
+ ae.encode(imgs[input_masks], encoding_t), (0, 0, 0, 0, 0, 1), value=1.0
1263
+ )
1264
+ c_crossattn = repeat(conditioner(imgs[input_masks]).mean(0), "d -> n 1 d", n=T)
1265
+ uc_crossattn = torch.zeros_like(c_crossattn)
1266
+ c_replace = latents.new_zeros(T, *latents.shape[1:])
1267
+ c_replace[input_masks] = latents
1268
+ uc_replace = torch.zeros_like(c_replace)
1269
+ c_concat = torch.cat(
1270
+ [
1271
+ repeat(
1272
+ input_masks,
1273
+ "n -> n 1 h w",
1274
+ h=pluckers.shape[2],
1275
+ w=pluckers.shape[3],
1276
+ ),
1277
+ pluckers,
1278
+ ],
1279
+ 1,
1280
+ )
1281
+ uc_concat = torch.cat(
1282
+ [pluckers.new_zeros(T, 1, *pluckers.shape[-2:]), pluckers], 1
1283
+ )
1284
+ c_dense_vector = pluckers
1285
+ uc_dense_vector = c_dense_vector
1286
+ c = {
1287
+ "crossattn": c_crossattn,
1288
+ "replace": c_replace,
1289
+ "concat": c_concat,
1290
+ "dense_vector": c_dense_vector,
1291
+ }
1292
+ uc = {
1293
+ "crossattn": uc_crossattn,
1294
+ "replace": uc_replace,
1295
+ "concat": uc_concat,
1296
+ "dense_vector": uc_dense_vector,
1297
+ }
1298
+ unload_model(ae)
1299
+ unload_model(conditioner)
1300
+
1301
+ additional_model_inputs = {"num_frames": T}
1302
+ additional_sampler_inputs = {
1303
+ "c2w": value_dict["c2w"].to("cuda"),
1304
+ "K": value_dict["K"].to("cuda"),
1305
+ "input_frame_mask": value_dict["cond_frames_mask"].to("cuda"),
1306
+ }
1307
+ if global_pbar is not None:
1308
+ additional_sampler_inputs["global_pbar"] = global_pbar
1309
+
1310
+ shape = (math.prod(num_samples), C, H // F, W // F)
1311
+ randn = torch.randn(shape).to("cuda")
1312
+
1313
+ load_model(model)
1314
+ samples_z = sampler(
1315
+ lambda input, sigma, c: denoiser(
1316
+ model,
1317
+ input,
1318
+ sigma,
1319
+ c,
1320
+ **additional_model_inputs,
1321
+ ),
1322
+ randn,
1323
+ scale=cfg,
1324
+ cond=c,
1325
+ uc=uc,
1326
+ verbose=verbose,
1327
+ **additional_sampler_inputs,
1328
+ )
1329
+ if samples_z is None:
1330
+ return
1331
+ unload_model(model)
1332
+
1333
+ load_model(ae)
1334
+ samples = ae.decode(samples_z, decoding_t)
1335
+ unload_model(ae)
1336
+
1337
+ return samples
1338
+
1339
+
1340
+ def run_one_scene(
1341
+ task,
1342
+ version_dict,
1343
+ model,
1344
+ ae,
1345
+ conditioner,
1346
+ denoiser,
1347
+ image_cond,
1348
+ camera_cond,
1349
+ save_path,
1350
+ use_traj_prior,
1351
+ traj_prior_Ks,
1352
+ traj_prior_c2ws,
1353
+ seed=23,
1354
+ gradio=False,
1355
+ abort_event=None,
1356
+ first_pass_pbar=None,
1357
+ second_pass_pbar=None,
1358
+ ):
1359
+ H, W, T, C, F, options = (
1360
+ version_dict["H"],
1361
+ version_dict["W"],
1362
+ version_dict["T"],
1363
+ version_dict["C"],
1364
+ version_dict["f"],
1365
+ version_dict["options"],
1366
+ )
1367
+
1368
+ if isinstance(image_cond, str):
1369
+ image_cond = {"img": [image_cond]}
1370
+ imgs_clip, imgs, img_size = [], [], None
1371
+ for i, (img, K) in enumerate(zip(image_cond["img"], camera_cond["K"])):
1372
+ if isinstance(img, str) or img is None:
1373
+ img, K = load_img_and_K(img or img_size, None, K=K, device="cpu") # type: ignore
1374
+ img_size = img.shape[-2:]
1375
+ if options.get("L_short", -1) == -1:
1376
+ img, K = transform_img_and_K(
1377
+ img,
1378
+ (W, H),
1379
+ K=K[None],
1380
+ mode=(
1381
+ options.get("transform_input", "crop")
1382
+ if i in image_cond["input_indices"]
1383
+ else options.get("transform_target", "crop")
1384
+ ),
1385
+ scale=(
1386
+ 1.0
1387
+ if i in image_cond["input_indices"]
1388
+ else options.get("transform_scale", 1.0)
1389
+ ),
1390
+ )
1391
+ else:
1392
+ downsample = 3
1393
+ assert options["L_short"] % F * 2**downsample == 0, (
1394
+ "Short side of the image should be divisible by "
1395
+ f"F*2**{downsample}={F * 2**downsample}."
1396
+ )
1397
+ img, K = transform_img_and_K(
1398
+ img,
1399
+ options["L_short"],
1400
+ K=K[None],
1401
+ size_stride=F * 2**downsample,
1402
+ mode=(
1403
+ options.get("transform_input", "crop")
1404
+ if i in image_cond["input_indices"]
1405
+ else options.get("transform_target", "crop")
1406
+ ),
1407
+ scale=(
1408
+ 1.0
1409
+ if i in image_cond["input_indices"]
1410
+ else options.get("transform_scale", 1.0)
1411
+ ),
1412
+ )
1413
+ version_dict["W"] = W = img.shape[-1]
1414
+ version_dict["H"] = H = img.shape[-2]
1415
+ K = K[0]
1416
+ K[0] /= W
1417
+ K[1] /= H
1418
+ camera_cond["K"][i] = K
1419
+ img_clip = img
1420
+ elif isinstance(img, np.ndarray):
1421
+ img_size = torch.Size(img.shape[:2])
1422
+ img = torch.as_tensor(img).permute(2, 0, 1)
1423
+ img = img.unsqueeze(0)
1424
+ img = img / 255.0 * 2.0 - 1.0
1425
+ if not gradio:
1426
+ img, K = transform_img_and_K(img, (W, H), K=K[None])
1427
+ assert K is not None
1428
+ K = K[0]
1429
+ K[0] /= W
1430
+ K[1] /= H
1431
+ camera_cond["K"][i] = K
1432
+ img_clip = img
1433
+ else:
1434
+ assert (
1435
+ False
1436
+ ), f"Variable `img` got {type(img)} type which is not supported!!!"
1437
+ imgs_clip.append(img_clip)
1438
+ imgs.append(img)
1439
+ imgs_clip = torch.cat(imgs_clip, dim=0)
1440
+ imgs = torch.cat(imgs, dim=0)
1441
+
1442
+ if traj_prior_Ks is not None:
1443
+ assert img_size is not None
1444
+ for i, prior_k in enumerate(traj_prior_Ks):
1445
+ img, prior_k = load_img_and_K(img_size, None, K=prior_k, device="cpu") # type: ignore
1446
+ img, prior_k = transform_img_and_K(
1447
+ img,
1448
+ (W, H),
1449
+ K=prior_k[None],
1450
+ mode=options.get(
1451
+ "transform_target", "crop"
1452
+ ), # mode for prior is always same as target
1453
+ scale=options.get(
1454
+ "transform_scale", 1.0
1455
+ ), # scale for prior is always same as target
1456
+ )
1457
+ prior_k = prior_k[0]
1458
+ prior_k[0] /= W
1459
+ prior_k[1] /= H
1460
+ traj_prior_Ks[i] = prior_k
1461
+
1462
+ options["num_frames"] = T
1463
+ discretization = denoiser.discretization
1464
+ torch.cuda.empty_cache()
1465
+
1466
+ seed_everything(seed)
1467
+
1468
+ # Get Data
1469
+ input_indices = image_cond["input_indices"]
1470
+ input_imgs = imgs[input_indices]
1471
+ input_imgs_clip = imgs_clip[input_indices]
1472
+ input_c2ws = camera_cond["c2w"][input_indices]
1473
+ input_Ks = camera_cond["K"][input_indices]
1474
+
1475
+ test_indices = [i for i in range(len(imgs)) if i not in input_indices]
1476
+ test_imgs = imgs[test_indices]
1477
+ test_imgs_clip = imgs_clip[test_indices]
1478
+ test_c2ws = camera_cond["c2w"][test_indices]
1479
+ test_Ks = camera_cond["K"][test_indices]
1480
+
1481
+ if options.get("save_input", True):
1482
+ save_output(
1483
+ {"/image": input_imgs},
1484
+ save_path=os.path.join(save_path, "input"),
1485
+ video_save_fps=2,
1486
+ )
1487
+
1488
+ if not use_traj_prior:
1489
+ chunk_strategy = options.get("chunk_strategy", "gt")
1490
+
1491
+ (
1492
+ _,
1493
+ input_inds_per_chunk,
1494
+ input_sels_per_chunk,
1495
+ test_inds_per_chunk,
1496
+ test_sels_per_chunk,
1497
+ ) = chunk_input_and_test(
1498
+ T,
1499
+ input_c2ws,
1500
+ test_c2ws,
1501
+ input_indices,
1502
+ test_indices,
1503
+ options=options,
1504
+ task=task,
1505
+ chunk_strategy=chunk_strategy,
1506
+ gt_input_inds=list(range(input_c2ws.shape[0])),
1507
+ )
1508
+ print(
1509
+ f"One pass - chunking with `{chunk_strategy}` strategy: total "
1510
+ f"{len(input_inds_per_chunk)} forward(s) ..."
1511
+ )
1512
+
1513
+ all_samples = {}
1514
+ all_test_inds = []
1515
+ for i, (
1516
+ chunk_input_inds,
1517
+ chunk_input_sels,
1518
+ chunk_test_inds,
1519
+ chunk_test_sels,
1520
+ ) in tqdm(
1521
+ enumerate(
1522
+ zip(
1523
+ input_inds_per_chunk,
1524
+ input_sels_per_chunk,
1525
+ test_inds_per_chunk,
1526
+ test_sels_per_chunk,
1527
+ )
1528
+ ),
1529
+ total=len(input_inds_per_chunk),
1530
+ leave=False,
1531
+ ):
1532
+ (
1533
+ curr_input_sels,
1534
+ curr_test_sels,
1535
+ curr_input_maps,
1536
+ curr_test_maps,
1537
+ ) = pad_indices(
1538
+ chunk_input_sels,
1539
+ chunk_test_sels,
1540
+ T=T,
1541
+ padding_mode=options.get("t_padding_mode", "last"),
1542
+ )
1543
+ curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
1544
+ assemble(
1545
+ input=x[chunk_input_inds],
1546
+ test=y[chunk_test_inds],
1547
+ input_maps=curr_input_maps,
1548
+ test_maps=curr_test_maps,
1549
+ )
1550
+ for x, y in zip(
1551
+ [
1552
+ torch.cat(
1553
+ [
1554
+ input_imgs,
1555
+ get_k_from_dict(all_samples, "samples-rgb").to(
1556
+ input_imgs.device
1557
+ ),
1558
+ ],
1559
+ dim=0,
1560
+ ),
1561
+ torch.cat(
1562
+ [
1563
+ input_imgs_clip,
1564
+ get_k_from_dict(all_samples, "samples-rgb").to(
1565
+ input_imgs.device
1566
+ ),
1567
+ ],
1568
+ dim=0,
1569
+ ),
1570
+ torch.cat([input_c2ws, test_c2ws[all_test_inds]], dim=0),
1571
+ torch.cat([input_Ks, test_Ks[all_test_inds]], dim=0),
1572
+ ], # procedurally append generated views to the input views
1573
+ [test_imgs, test_imgs_clip, test_c2ws, test_Ks],
1574
+ )
1575
+ ]
1576
+ value_dict = get_value_dict(
1577
+ curr_imgs.to("cuda"),
1578
+ curr_imgs_clip.to("cuda"),
1579
+ curr_input_sels
1580
+ + [
1581
+ sel
1582
+ for (ind, sel) in zip(
1583
+ np.array(chunk_test_inds)[curr_test_maps[curr_test_maps != -1]],
1584
+ curr_test_sels,
1585
+ )
1586
+ if test_indices[ind] in image_cond["input_indices"]
1587
+ ],
1588
+ curr_c2ws,
1589
+ curr_Ks,
1590
+ curr_input_sels
1591
+ + [
1592
+ sel
1593
+ for (ind, sel) in zip(
1594
+ np.array(chunk_test_inds)[curr_test_maps[curr_test_maps != -1]],
1595
+ curr_test_sels,
1596
+ )
1597
+ if test_indices[ind] in camera_cond["input_indices"]
1598
+ ],
1599
+ all_c2ws=camera_cond["c2w"],
1600
+ camera_scale=options.get("camera_scale", 2.0),
1601
+ )
1602
+ samplers = create_samplers(
1603
+ options["guider_types"],
1604
+ discretization,
1605
+ [len(curr_imgs)],
1606
+ options["num_steps"],
1607
+ options["cfg_min"],
1608
+ abort_event=abort_event,
1609
+ )
1610
+ assert len(samplers) == 1
1611
+ samples = do_sample(
1612
+ model,
1613
+ ae,
1614
+ conditioner,
1615
+ denoiser,
1616
+ samplers[0],
1617
+ value_dict,
1618
+ H,
1619
+ W,
1620
+ C,
1621
+ F,
1622
+ T=len(curr_imgs),
1623
+ cfg=(
1624
+ options["cfg"][0]
1625
+ if isinstance(options["cfg"], (list, tuple))
1626
+ else options["cfg"]
1627
+ ),
1628
+ **{k: options[k] for k in options if k not in ["cfg", "T"]},
1629
+ )
1630
+ samples = decode_output(
1631
+ samples, len(curr_imgs), chunk_test_sels
1632
+ ) # decode into dict
1633
+ if options.get("save_first_pass", False):
1634
+ save_output(
1635
+ replace_or_include_input_for_dict(
1636
+ samples,
1637
+ chunk_test_sels,
1638
+ curr_imgs,
1639
+ curr_c2ws,
1640
+ curr_Ks,
1641
+ ),
1642
+ save_path=os.path.join(save_path, "first-pass", f"forward_{i}"),
1643
+ video_save_fps=2,
1644
+ )
1645
+ extend_dict(all_samples, samples)
1646
+ all_test_inds.extend(chunk_test_inds)
1647
+ else:
1648
+ assert traj_prior_c2ws is not None, (
1649
+ "`traj_prior_c2ws` should be set when using 2-pass sampling. One "
1650
+ "potential reason is that the amount of input frames is larger than "
1651
+ "T. Set `num_prior_frames` manually to overwrite the infered stats."
1652
+ )
1653
+ traj_prior_c2ws = torch.as_tensor(
1654
+ traj_prior_c2ws,
1655
+ device=input_c2ws.device,
1656
+ dtype=input_c2ws.dtype,
1657
+ )
1658
+
1659
+ if traj_prior_Ks is None:
1660
+ traj_prior_Ks = test_Ks[:1].repeat_interleave(
1661
+ traj_prior_c2ws.shape[0], dim=0
1662
+ )
1663
+
1664
+ traj_prior_imgs = imgs.new_zeros(traj_prior_c2ws.shape[0], *imgs.shape[1:])
1665
+ traj_prior_imgs_clip = imgs_clip.new_zeros(
1666
+ traj_prior_c2ws.shape[0], *imgs_clip.shape[1:]
1667
+ )
1668
+
1669
+ # ---------------------------------- first pass ----------------------------------
1670
+ T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
1671
+ T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
1672
+ chunk_strategy_first_pass = options.get(
1673
+ "chunk_strategy_first_pass", "gt-nearest"
1674
+ )
1675
+ (
1676
+ _,
1677
+ input_inds_per_chunk,
1678
+ input_sels_per_chunk,
1679
+ prior_inds_per_chunk,
1680
+ prior_sels_per_chunk,
1681
+ ) = chunk_input_and_test(
1682
+ T_first_pass,
1683
+ input_c2ws,
1684
+ traj_prior_c2ws,
1685
+ input_indices,
1686
+ image_cond["prior_indices"],
1687
+ options=options,
1688
+ task=task,
1689
+ chunk_strategy=chunk_strategy_first_pass,
1690
+ gt_input_inds=list(range(input_c2ws.shape[0])),
1691
+ )
1692
+ print(
1693
+ f"Two passes (first) - chunking with `{chunk_strategy_first_pass}` strategy: total "
1694
+ f"{len(input_inds_per_chunk)} forward(s) ..."
1695
+ )
1696
+
1697
+ all_samples = {}
1698
+ all_prior_inds = []
1699
+ for i, (
1700
+ chunk_input_inds,
1701
+ chunk_input_sels,
1702
+ chunk_prior_inds,
1703
+ chunk_prior_sels,
1704
+ ) in tqdm(
1705
+ enumerate(
1706
+ zip(
1707
+ input_inds_per_chunk,
1708
+ input_sels_per_chunk,
1709
+ prior_inds_per_chunk,
1710
+ prior_sels_per_chunk,
1711
+ )
1712
+ ),
1713
+ total=len(input_inds_per_chunk),
1714
+ leave=False,
1715
+ ):
1716
+ (
1717
+ curr_input_sels,
1718
+ curr_prior_sels,
1719
+ curr_input_maps,
1720
+ curr_prior_maps,
1721
+ ) = pad_indices(
1722
+ chunk_input_sels,
1723
+ chunk_prior_sels,
1724
+ T=T_first_pass,
1725
+ padding_mode=options.get("t_padding_mode", "last"),
1726
+ )
1727
+ curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
1728
+ assemble(
1729
+ input=x[chunk_input_inds],
1730
+ test=y[chunk_prior_inds],
1731
+ input_maps=curr_input_maps,
1732
+ test_maps=curr_prior_maps,
1733
+ )
1734
+ for x, y in zip(
1735
+ [
1736
+ torch.cat(
1737
+ [
1738
+ input_imgs,
1739
+ get_k_from_dict(all_samples, "samples-rgb").to(
1740
+ input_imgs.device
1741
+ ),
1742
+ ],
1743
+ dim=0,
1744
+ ),
1745
+ torch.cat(
1746
+ [
1747
+ input_imgs_clip,
1748
+ get_k_from_dict(all_samples, "samples-rgb").to(
1749
+ input_imgs.device
1750
+ ),
1751
+ ],
1752
+ dim=0,
1753
+ ),
1754
+ torch.cat([input_c2ws, traj_prior_c2ws[all_prior_inds]], dim=0),
1755
+ torch.cat([input_Ks, traj_prior_Ks[all_prior_inds]], dim=0),
1756
+ ], # procedurally append generated prior views to the input views
1757
+ [
1758
+ traj_prior_imgs,
1759
+ traj_prior_imgs_clip,
1760
+ traj_prior_c2ws,
1761
+ traj_prior_Ks,
1762
+ ],
1763
+ )
1764
+ ]
1765
+ value_dict = get_value_dict(
1766
+ curr_imgs.to("cuda"),
1767
+ curr_imgs_clip.to("cuda"),
1768
+ curr_input_sels,
1769
+ curr_c2ws,
1770
+ curr_Ks,
1771
+ list(range(T_first_pass)),
1772
+ all_c2ws=camera_cond["c2w"],
1773
+ camera_scale=options.get("camera_scale", 2.0),
1774
+ )
1775
+ samplers = create_samplers(
1776
+ options["guider_types"],
1777
+ discretization,
1778
+ [T_first_pass, T_second_pass],
1779
+ options["num_steps"],
1780
+ options["cfg_min"],
1781
+ abort_event=abort_event,
1782
+ )
1783
+ samples = do_sample(
1784
+ model,
1785
+ ae,
1786
+ conditioner,
1787
+ denoiser,
1788
+ (
1789
+ samplers[1]
1790
+ if len(samplers) > 1
1791
+ and options.get("ltr_first_pass", False)
1792
+ and chunk_strategy_first_pass != "gt"
1793
+ and i > 0
1794
+ else samplers[0]
1795
+ ),
1796
+ value_dict,
1797
+ H,
1798
+ W,
1799
+ C,
1800
+ F,
1801
+ cfg=(
1802
+ options["cfg"][0]
1803
+ if isinstance(options["cfg"], (list, tuple))
1804
+ else options["cfg"]
1805
+ ),
1806
+ T=T_first_pass,
1807
+ global_pbar=first_pass_pbar,
1808
+ **{k: options[k] for k in options if k not in ["cfg", "T", "sampler"]},
1809
+ )
1810
+ if samples is None:
1811
+ return
1812
+ samples = decode_output(
1813
+ samples, T_first_pass, chunk_prior_sels
1814
+ ) # decode into dict
1815
+ extend_dict(all_samples, samples)
1816
+ all_prior_inds.extend(chunk_prior_inds)
1817
+
1818
+ if options.get("save_first_pass", True):
1819
+ save_output(
1820
+ all_samples,
1821
+ save_path=os.path.join(save_path, "first-pass"),
1822
+ video_save_fps=5,
1823
+ )
1824
+ video_path_0 = os.path.join(save_path, "first-pass", "samples-rgb.mp4")
1825
+ yield video_path_0
1826
+
1827
+ # ---------------------------------- second pass ----------------------------------
1828
+ prior_indices = image_cond["prior_indices"]
1829
+ assert (
1830
+ prior_indices is not None
1831
+ ), "`prior_frame_indices` needs to be set if using 2-pass sampling."
1832
+ prior_argsort = np.argsort(input_indices + prior_indices).tolist()
1833
+ prior_indices = np.array(input_indices + prior_indices)[prior_argsort].tolist()
1834
+ gt_input_inds = [prior_argsort.index(i) for i in range(input_c2ws.shape[0])]
1835
+
1836
+ traj_prior_imgs = torch.cat(
1837
+ [input_imgs, get_k_from_dict(all_samples, "samples-rgb")], dim=0
1838
+ )[prior_argsort]
1839
+ traj_prior_imgs_clip = torch.cat(
1840
+ [
1841
+ input_imgs_clip,
1842
+ get_k_from_dict(all_samples, "samples-rgb"),
1843
+ ],
1844
+ dim=0,
1845
+ )[prior_argsort]
1846
+ traj_prior_c2ws = torch.cat([input_c2ws, traj_prior_c2ws], dim=0)[prior_argsort]
1847
+ traj_prior_Ks = torch.cat([input_Ks, traj_prior_Ks], dim=0)[prior_argsort]
1848
+
1849
+ update_kv_for_dict(all_samples, "samples-rgb", traj_prior_imgs)
1850
+ update_kv_for_dict(all_samples, "samples-c2ws", traj_prior_c2ws)
1851
+ update_kv_for_dict(all_samples, "samples-intrinsics", traj_prior_Ks)
1852
+
1853
+ chunk_strategy = options.get("chunk_strategy", "nearest")
1854
+ (
1855
+ _,
1856
+ prior_inds_per_chunk,
1857
+ prior_sels_per_chunk,
1858
+ test_inds_per_chunk,
1859
+ test_sels_per_chunk,
1860
+ ) = chunk_input_and_test(
1861
+ T_second_pass,
1862
+ traj_prior_c2ws,
1863
+ test_c2ws,
1864
+ prior_indices,
1865
+ test_indices,
1866
+ options=options,
1867
+ task=task,
1868
+ chunk_strategy=chunk_strategy,
1869
+ gt_input_inds=gt_input_inds,
1870
+ )
1871
+ print(
1872
+ f"Two passes (second) - chunking with `{chunk_strategy}` strategy: total "
1873
+ f"{len(prior_inds_per_chunk)} forward(s) ..."
1874
+ )
1875
+
1876
+ all_samples = {}
1877
+ all_test_inds = []
1878
+ for i, (
1879
+ chunk_prior_inds,
1880
+ chunk_prior_sels,
1881
+ chunk_test_inds,
1882
+ chunk_test_sels,
1883
+ ) in tqdm(
1884
+ enumerate(
1885
+ zip(
1886
+ prior_inds_per_chunk,
1887
+ prior_sels_per_chunk,
1888
+ test_inds_per_chunk,
1889
+ test_sels_per_chunk,
1890
+ )
1891
+ ),
1892
+ total=len(prior_inds_per_chunk),
1893
+ leave=False,
1894
+ ):
1895
+ (
1896
+ curr_prior_sels,
1897
+ curr_test_sels,
1898
+ curr_prior_maps,
1899
+ curr_test_maps,
1900
+ ) = pad_indices(
1901
+ chunk_prior_sels,
1902
+ chunk_test_sels,
1903
+ T=T_second_pass,
1904
+ padding_mode="last",
1905
+ )
1906
+ curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
1907
+ assemble(
1908
+ input=x[chunk_prior_inds],
1909
+ test=y[chunk_test_inds],
1910
+ input_maps=curr_prior_maps,
1911
+ test_maps=curr_test_maps,
1912
+ )
1913
+ for x, y in zip(
1914
+ [
1915
+ traj_prior_imgs,
1916
+ traj_prior_imgs_clip,
1917
+ traj_prior_c2ws,
1918
+ traj_prior_Ks,
1919
+ ],
1920
+ [test_imgs, test_imgs_clip, test_c2ws, test_Ks],
1921
+ )
1922
+ ]
1923
+ value_dict = get_value_dict(
1924
+ curr_imgs.to("cuda"),
1925
+ curr_imgs_clip.to("cuda"),
1926
+ curr_prior_sels,
1927
+ curr_c2ws,
1928
+ curr_Ks,
1929
+ list(range(T_second_pass)),
1930
+ all_c2ws=camera_cond["c2w"],
1931
+ camera_scale=options.get("camera_scale", 2.0),
1932
+ )
1933
+ samples = do_sample(
1934
+ model,
1935
+ ae,
1936
+ conditioner,
1937
+ denoiser,
1938
+ samplers[1] if len(samplers) > 1 else samplers[0],
1939
+ value_dict,
1940
+ H,
1941
+ W,
1942
+ C,
1943
+ F,
1944
+ T=T_second_pass,
1945
+ cfg=(
1946
+ options["cfg"][1]
1947
+ if isinstance(options["cfg"], (list, tuple))
1948
+ and len(options["cfg"]) > 1
1949
+ else options["cfg"]
1950
+ ),
1951
+ global_pbar=second_pass_pbar,
1952
+ **{k: options[k] for k in options if k not in ["cfg", "T", "sampler"]},
1953
+ )
1954
+ if samples is None:
1955
+ return
1956
+ samples = decode_output(
1957
+ samples, T_second_pass, chunk_test_sels
1958
+ ) # decode into dict
1959
+ if options.get("save_second_pass", False):
1960
+ save_output(
1961
+ replace_or_include_input_for_dict(
1962
+ samples,
1963
+ chunk_test_sels,
1964
+ curr_imgs,
1965
+ curr_c2ws,
1966
+ curr_Ks,
1967
+ ),
1968
+ save_path=os.path.join(save_path, "second-pass", f"forward_{i}"),
1969
+ video_save_fps=2,
1970
+ )
1971
+ extend_dict(all_samples, samples)
1972
+ all_test_inds.extend(chunk_test_inds)
1973
+ all_samples = {
1974
+ key: value[np.argsort(all_test_inds)] for key, value in all_samples.items()
1975
+ }
1976
+ save_output(
1977
+ replace_or_include_input_for_dict(
1978
+ all_samples,
1979
+ test_indices,
1980
+ imgs.clone(),
1981
+ camera_cond["c2w"].clone(),
1982
+ camera_cond["K"].clone(),
1983
+ )
1984
+ if options.get("replace_or_include_input", False)
1985
+ else all_samples,
1986
+ save_path=save_path,
1987
+ video_save_fps=options.get("video_save_fps", 2),
1988
+ )
1989
+ video_path_1 = os.path.join(save_path, "samples-rgb.mp4")
1990
+ yield video_path_1
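Since `run_one_scene` is a generator that yields the first-pass and final `samples-rgb.mp4` paths as they are written, a caller is expected to iterate over it. A hypothetical driver loop (the actual entry points live in demo.py / demo_gr.py, which are not shown here; `prior_c2ws` is a placeholder for precomputed prior-trajectory poses):

    # Hypothetical usage sketch -- argument values depend on the demo scripts.
    for video_path in run_one_scene(
        task, version_dict, model, ae, conditioner, denoiser,
        image_cond, camera_cond, save_path,
        use_traj_prior=True, traj_prior_Ks=None, traj_prior_c2ws=prior_c2ws,
    ):
        print("rendered:", video_path)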
seva/geometry.py ADDED
@@ -0,0 +1,811 @@
1
+ from typing import Literal
2
+
3
+ import numpy as np
4
+ import roma
5
+ import scipy.interpolate
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ DEFAULT_FOV_RAD = 0.9424777960769379 # 54 degrees by default
10
+
11
+
12
+ def get_camera_dist(
13
+ source_c2ws: torch.Tensor, # N x 3 x 4
14
+ target_c2ws: torch.Tensor, # M x 3 x 4
15
+ mode: str = "translation",
16
+ ):
17
+ if mode == "rotation":
18
+ dists = torch.acos(
19
+ (
20
+ (
21
+ torch.matmul(
22
+ source_c2ws[:, None, :3, :3],
23
+ target_c2ws[None, :, :3, :3].transpose(-1, -2),
24
+ )
25
+ .diagonal(offset=0, dim1=-2, dim2=-1)
26
+ .sum(-1)
27
+ - 1
28
+ )
29
+ / 2
30
+ ).clamp(-1, 1)
31
+ ) * (180 / torch.pi)
32
+ elif mode == "translation":
33
+ dists = torch.norm(
34
+ source_c2ws[:, None, :3, 3] - target_c2ws[None, :, :3, 3], dim=-1
35
+ )
36
+ else:
37
+ raise NotImplementedError(
38
+ f"Mode {mode} is not implemented for finding nearest source indices."
39
+ )
40
+ return dists
41
+
42
+
43
+ def to_hom(X):
44
+ # get homogeneous coordinates of the input
45
+ X_hom = torch.cat([X, torch.ones_like(X[..., :1])], dim=-1)
46
+ return X_hom
47
+
48
+
49
+ def to_hom_pose(pose):
50
+ # get homogeneous coordinates of the input pose
51
+ if pose.shape[-2:] == (3, 4):
52
+ pose_hom = torch.eye(4, device=pose.device)[None].repeat(pose.shape[0], 1, 1)
53
+ pose_hom[:, :3, :] = pose
54
+ return pose_hom
55
+ return pose
56
+
57
+
58
+ def get_default_intrinsics(
59
+ fov_rad=DEFAULT_FOV_RAD,
60
+ aspect_ratio=1.0,
61
+ ):
62
+ if not isinstance(fov_rad, torch.Tensor):
63
+ fov_rad = torch.tensor(
64
+ [fov_rad] if isinstance(fov_rad, (int, float)) else fov_rad
65
+ )
66
+ if aspect_ratio >= 1.0: # W >= H
67
+ focal_x = 0.5 / torch.tan(0.5 * fov_rad)
68
+ focal_y = focal_x * aspect_ratio
69
+ else: # W < H
70
+ focal_y = 0.5 / torch.tan(0.5 * fov_rad)
71
+ focal_x = focal_y / aspect_ratio
72
+ intrinsics = focal_x.new_zeros((focal_x.shape[0], 3, 3))
73
+ intrinsics[:, torch.eye(3, device=focal_x.device, dtype=bool)] = torch.stack(
74
+ [focal_x, focal_y, torch.ones_like(focal_x)], dim=-1
75
+ )
76
+ intrinsics[:, :, -1] = torch.tensor(
77
+ [0.5, 0.5, 1.0], device=focal_x.device, dtype=focal_x.dtype
78
+ )
79
+ return intrinsics
80
+
81
+
82
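The intrinsics returned above are expressed in normalized image coordinates (focal lengths divided by the image size, principal point at 0.5). A small illustrative check:

    # Illustrative: normalized pinhole K for the default 54-degree FOV.
    K = get_default_intrinsics()   # shape (1, 3, 3)
    # K[0] is approximately
    # [[0.98, 0.00, 0.50],
    #  [0.00, 0.98, 0.50],
    #  [0.00, 0.00, 1.00]]   since 0.5 / tan(27 deg) ~= 0.98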
+ def get_image_grid(img_h, img_w):
83
+ # Adding 0.5 is VERY important, especially when img_h and img_w
84
+ # are not very large (e.g., 72)!
85
+ y_range = torch.arange(img_h, dtype=torch.float32).add_(0.5)
86
+ x_range = torch.arange(img_w, dtype=torch.float32).add_(0.5)
87
+ Y, X = torch.meshgrid(y_range, x_range, indexing="ij") # [H,W]
88
+ xy_grid = torch.stack([X, Y], dim=-1).view(-1, 2) # [HW,2]
89
+ return to_hom(xy_grid) # [HW,3]
90
+
91
+
92
+ def img2cam(X, cam_intr):
93
+ return X @ cam_intr.inverse().transpose(-1, -2)
94
+
95
+
96
+ def cam2world(X, pose):
97
+ X_hom = to_hom(X)
98
+ pose_inv = torch.linalg.inv(to_hom_pose(pose))[..., :3, :4]
99
+ return X_hom @ pose_inv.transpose(-1, -2)
100
+
101
+
102
+ def get_center_and_ray(
103
+ img_h, img_w, pose, intr, zero_center_for_debugging=False
104
+ ): # [HW,2]
105
+ # given the intrinsic/extrinsic matrices, get the camera center and ray directions
106
+ # assert(opt.camera.model=="perspective")
107
+
108
+ # compute center and ray
109
+ grid_img = get_image_grid(img_h, img_w) # [HW,3]
110
+ grid_3D_cam = img2cam(grid_img.to(intr.device), intr.float()) # [B,HW,3]
111
+ center_3D_cam = torch.zeros_like(grid_3D_cam) # [B,HW,3]
112
+
113
+ # transform from camera to world coordinates
114
+ grid_3D = cam2world(grid_3D_cam, pose) # [B,HW,3]
115
+ center_3D = cam2world(center_3D_cam, pose) # [B,HW,3]
116
+ ray = grid_3D - center_3D # [B,HW,3]
117
+
118
+ return center_3D_cam if zero_center_for_debugging else center_3D, ray, grid_3D_cam
119
+
120
+
121
+ def get_plucker_coordinates(
122
+ extrinsics_src,
123
+ extrinsics,
124
+ intrinsics=None,
125
+ fov_rad=DEFAULT_FOV_RAD,
126
+ mode="plucker",
127
+ rel_zero_translation=True,
128
+ zero_center_for_debugging=False,
129
+ target_size=[72, 72], # 576-size image
130
+ return_grid_cam=False, # save for later use if want restore
131
+ ):
132
+ if intrinsics is None:
133
+ intrinsics = get_default_intrinsics(fov_rad).to(extrinsics.device)
134
+ else:
135
+ # for some data preprocessed in the early stage (e.g., MVI and CO3D),
136
+ # intrinsics are expressed in raw pixel space (e.g., 576x576) instead
137
+ # of normalized image coordinates
138
+ if not (
139
+ torch.all(intrinsics[:, :2, -1] >= 0)
140
+ and torch.all(intrinsics[:, :2, -1] <= 1)
141
+ ):
142
+ intrinsics[:, :2] /= intrinsics.new_tensor(target_size).view(1, -1, 1) * 8
143
+ # You should ensure the intrinsics are expressed in
144
+ # resolution-independent normalized image coordinates; we only perform a
145
+ # very simple verification here, checking that the principal points are
146
+ # between 0 and 1.
147
+ assert (
148
+ torch.all(intrinsics[:, :2, -1] >= 0)
149
+ and torch.all(intrinsics[:, :2, -1] <= 1)
150
+ ), "Intrinsics should be expressed in resolution-independent normalized image coordinates."
151
+
152
+ c2w_src = torch.linalg.inv(extrinsics_src)
153
+ if not rel_zero_translation:
154
+ c2w_src[:3, 3] = c2w_src[3, :3] = 0.0
155
+ # transform coordinates from the source camera's coordinate system to the coordinate system of the respective camera
156
+ extrinsics_rel = torch.einsum(
157
+ "vnm,vmp->vnp", extrinsics, c2w_src[None].repeat(extrinsics.shape[0], 1, 1)
158
+ )
159
+
160
+ intrinsics[:, :2] *= extrinsics.new_tensor(
161
+ [
162
+ target_size[1], # w
163
+ target_size[0], # h
164
+ ]
165
+ ).view(1, -1, 1)
166
+ centers, rays, grid_cam = get_center_and_ray(
167
+ img_h=target_size[0],
168
+ img_w=target_size[1],
169
+ pose=extrinsics_rel[:, :3, :],
170
+ intr=intrinsics,
171
+ zero_center_for_debugging=zero_center_for_debugging,
172
+ )
173
+
174
+ if mode == "plucker" or "v1" in mode:
175
+ rays = torch.nn.functional.normalize(rays, dim=-1)
176
+ plucker = torch.cat((rays, torch.cross(centers, rays, dim=-1)), dim=-1)
177
+ else:
178
+ raise ValueError(f"Unknown Plucker coordinate mode: {mode}")
179
+
180
+ plucker = plucker.permute(0, 2, 1).reshape(plucker.shape[0], -1, *target_size)
181
+ if return_grid_cam:
182
+ return plucker, grid_cam.reshape(-1, *target_size, 3)
183
+ return plucker
184
+
185
+
186
+ def rt_to_mat4(
187
+ R: torch.Tensor, t: torch.Tensor, s: torch.Tensor | None = None
188
+ ) -> torch.Tensor:
189
+ """
190
+ Args:
191
+ R (torch.Tensor): (..., 3, 3).
192
+ t (torch.Tensor): (..., 3).
193
+ s (torch.Tensor): (...,).
194
+
195
+ Returns:
196
+ torch.Tensor: (..., 4, 4)
197
+ """
198
+ mat34 = torch.cat([R, t[..., None]], dim=-1)
199
+ if s is None:
200
+ bottom = (
201
+ mat34.new_tensor([[0.0, 0.0, 0.0, 1.0]])
202
+ .reshape((1,) * (mat34.dim() - 2) + (1, 4))
203
+ .expand(mat34.shape[:-2] + (1, 4))
204
+ )
205
+ else:
206
+ bottom = F.pad(1.0 / s[..., None, None], (3, 0), value=0.0)
207
+ mat4 = torch.cat([mat34, bottom], dim=-2)
208
+ return mat4
209
+
210
+
211
+ def get_preset_pose_fov(
212
+ option: Literal[
213
+ "orbit",
214
+ "spiral",
215
+ "lemniscate",
216
+ "zoom-in",
217
+ "zoom-out",
218
+ "dolly zoom-in",
219
+ "dolly zoom-out",
220
+ "move-forward",
221
+ "move-backward",
222
+ "move-up",
223
+ "move-down",
224
+ "move-left",
225
+ "move-right",
226
+ "roll",
227
+ ],
228
+ num_frames: int,
229
+ start_w2c: torch.Tensor,
230
+ look_at: torch.Tensor,
231
+ up_direction: torch.Tensor | None = None,
232
+ fov: float = DEFAULT_FOV_RAD,
233
+ spiral_radii: list[float] = [0.5, 0.5, 0.2],
234
+ zoom_factor: float | None = None,
235
+ ):
236
+ poses = fovs = None
237
+ if option == "orbit":
238
+ poses = torch.linalg.inv(
239
+ get_arc_horizontal_w2cs(
240
+ start_w2c,
241
+ look_at,
242
+ up_direction,
243
+ num_frames=num_frames,
244
+ endpoint=False,
245
+ )
246
+ ).numpy()
247
+ fovs = np.full((num_frames,), fov)
248
+ elif option == "spiral":
249
+ poses = generate_spiral_path(
250
+ torch.linalg.inv(start_w2c)[None].numpy() @ np.diagflat([1, -1, -1, 1]),
251
+ np.array([1, 5]),
252
+ n_frames=num_frames,
253
+ n_rots=2,
254
+ zrate=0.5,
255
+ radii=spiral_radii,
256
+ endpoint=False,
257
+ ) @ np.diagflat([1, -1, -1, 1])
258
+ poses = np.concatenate(
259
+ [
260
+ poses,
261
+ np.array([0.0, 0.0, 0.0, 1.0])[None, None].repeat(len(poses), 0),
262
+ ],
263
+ 1,
264
+ )
265
+ # We want the spiral trajectory to always start from start_w2c. Thus we
266
+ # apply the relative pose to get the final trajectory.
267
+ poses = (
268
+ np.linalg.inv(start_w2c.numpy())[None] @ np.linalg.inv(poses[:1]) @ poses
269
+ )
270
+ fovs = np.full((num_frames,), fov)
271
+ elif option == "lemniscate":
272
+ poses = torch.linalg.inv(
273
+ get_lemniscate_w2cs(
274
+ start_w2c,
275
+ look_at,
276
+ up_direction,
277
+ num_frames,
278
+ degree=60.0,
279
+ endpoint=False,
280
+ )
281
+ ).numpy()
282
+ fovs = np.full((num_frames,), fov)
283
+ elif option == "roll":
284
+ poses = torch.linalg.inv(
285
+ get_roll_w2cs(
286
+ start_w2c,
287
+ look_at,
288
+ None,
289
+ num_frames,
290
+ degree=360.0,
291
+ endpoint=False,
292
+ )
293
+ ).numpy()
294
+ fovs = np.full((num_frames,), fov)
295
+ elif option in [
296
+ "dolly zoom-in",
297
+ "dolly zoom-out",
298
+ "zoom-in",
299
+ "zoom-out",
300
+ ]:
301
+ if option.startswith("dolly"):
302
+ direction = "backward" if option == "dolly zoom-in" else "forward"
303
+ poses = torch.linalg.inv(
304
+ get_moving_w2cs(
305
+ start_w2c,
306
+ look_at,
307
+ up_direction,
308
+ num_frames,
309
+ endpoint=True,
310
+ direction=direction,
311
+ )
312
+ ).numpy()
313
+ else:
314
+ poses = torch.linalg.inv(start_w2c)[None].repeat(num_frames, 1, 1).numpy()
315
+ fov_rad_start = fov
316
+ if zoom_factor is None:
317
+ zoom_factor = 0.28 if option.endswith("zoom-in") else 1.5
318
+ fov_rad_end = zoom_factor * fov
319
+ fovs = (
320
+ np.linspace(0, 1, num_frames) * (fov_rad_end - fov_rad_start)
321
+ + fov_rad_start
322
+ )
323
+ elif option in [
324
+ "move-forward",
325
+ "move-backward",
326
+ "move-up",
327
+ "move-down",
328
+ "move-left",
329
+ "move-right",
330
+ ]:
331
+ poses = torch.linalg.inv(
332
+ get_moving_w2cs(
333
+ start_w2c,
334
+ look_at,
335
+ up_direction,
336
+ num_frames,
337
+ endpoint=True,
338
+ direction=option.removeprefix("move-"),
339
+ )
340
+ ).numpy()
341
+ fovs = np.full((num_frames,), fov)
342
+ else:
343
+ raise ValueError(f"Unknown preset option {option}.")
344
+
345
+ return poses, fovs
346
+
347
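A minimal sketch of how the presets above can be queried (values are illustrative; `up_direction` defaults to the camera's own up axis):

    # Illustrative: 80 orbit poses around a look-at point, keeping the default FOV.
    poses, fovs = get_preset_pose_fov(
        "orbit",
        num_frames=80,
        start_w2c=torch.eye(4),
        look_at=torch.tensor([0.0, 0.0, 1.0]),
    )
    # poses: (80, 4, 4) numpy array of c2w matrices; fovs: (80,) numpy array of FOVs in radians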
+
348
+ def get_lookat(origins: torch.Tensor, viewdirs: torch.Tensor) -> torch.Tensor:
349
+ """Triangulate a set of rays to find a single lookat point.
350
+
351
+ Args:
352
+ origins (torch.Tensor): A (N, 3) array of ray origins.
353
+ viewdirs (torch.Tensor): A (N, 3) array of ray view directions.
354
+
355
+ Returns:
356
+ torch.Tensor: A (3,) lookat point.
357
+ """
358
+
359
+ viewdirs = torch.nn.functional.normalize(viewdirs, dim=-1)
360
+ eye = torch.eye(3, device=origins.device, dtype=origins.dtype)[None]
361
+ # Calculate projection matrix I - rr^T
362
+ I_min_cov = eye - (viewdirs[..., None] * viewdirs[..., None, :])
363
+ # Compute sum of projections
364
+ sum_proj = I_min_cov.matmul(origins[..., None]).sum(dim=-3)
365
+ # Solve for the intersection point using least squares
366
+ lookat = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]
367
+ # Check NaNs.
368
+ assert not torch.any(torch.isnan(lookat))
369
+ return lookat
370
+
371
+
372
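A quick worked example of the least-squares triangulation above (illustrative only): two rays that intersect at the origin recover that point.

    origins = torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
    viewdirs = torch.tensor([[-1.0, 0.0, 0.0], [0.0, -1.0, 0.0]])
    get_lookat(origins, viewdirs)   # ~= tensor([0., 0., 0.])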
+ def get_lookat_w2cs(
373
+ positions: torch.Tensor,
374
+ lookat: torch.Tensor,
375
+ up: torch.Tensor,
376
+ face_off: bool = False,
377
+ ):
378
+ """
379
+ Args:
380
+ positions: (N, 3) tensor of camera positions
381
+ lookat: (3,) tensor of lookat point
382
+ up: (3,) or (N, 3) tensor of up vector
383
+
384
+ Returns:
385
+ w2cs: (N, 4, 4) tensor of world-to-camera matrices
386
+ """
387
+ forward_vectors = F.normalize(lookat - positions, dim=-1)
388
+ if face_off:
389
+ forward_vectors = -forward_vectors
390
+ if up.dim() == 1:
391
+ up = up[None]
392
+ right_vectors = F.normalize(torch.cross(forward_vectors, up, dim=-1), dim=-1)
393
+ down_vectors = F.normalize(
394
+ torch.cross(forward_vectors, right_vectors, dim=-1), dim=-1
395
+ )
396
+ Rs = torch.stack([right_vectors, down_vectors, forward_vectors], dim=-1)
397
+ w2cs = torch.linalg.inv(rt_to_mat4(Rs, positions))
398
+ return w2cs
399
+
400
+
401
+ def get_arc_horizontal_w2cs(
402
+ ref_w2c: torch.Tensor,
403
+ lookat: torch.Tensor,
404
+ up: torch.Tensor | None,
405
+ num_frames: int,
406
+ clockwise: bool = True,
407
+ face_off: bool = False,
408
+ endpoint: bool = False,
409
+ degree: float = 360.0,
410
+ ref_up_shift: float = 0.0,
411
+ ref_radius_scale: float = 1.0,
412
+ **_,
413
+ ) -> torch.Tensor:
414
+ ref_c2w = torch.linalg.inv(ref_w2c)
415
+ ref_position = ref_c2w[:3, 3]
416
+ if up is None:
417
+ up = -ref_c2w[:3, 1]
418
+ assert up is not None
419
+ ref_position += up * ref_up_shift
420
+ ref_position *= ref_radius_scale
421
+ thetas = (
422
+ torch.linspace(0.0, torch.pi * degree / 180, num_frames, device=ref_w2c.device)
423
+ if endpoint
424
+ else torch.linspace(
425
+ 0.0, torch.pi * degree / 180, num_frames + 1, device=ref_w2c.device
426
+ )[:-1]
427
+ )
428
+ if not clockwise:
429
+ thetas = -thetas
430
+ positions = (
431
+ torch.einsum(
432
+ "nij,j->ni",
433
+ roma.rotvec_to_rotmat(thetas[:, None] * up[None]),
434
+ ref_position - lookat,
435
+ )
436
+ + lookat
437
+ )
438
+ return get_lookat_w2cs(positions, lookat, up, face_off=face_off)
439
+
440
+
441
+ def get_lemniscate_w2cs(
442
+ ref_w2c: torch.Tensor,
443
+ lookat: torch.Tensor,
444
+ up: torch.Tensor | None,
445
+ num_frames: int,
446
+ degree: float,
447
+ endpoint: bool = False,
448
+ **_,
449
+ ) -> torch.Tensor:
450
+ ref_c2w = torch.linalg.inv(ref_w2c)
451
+ a = torch.linalg.norm(ref_c2w[:3, 3] - lookat) * np.tan(degree / 360 * np.pi)
452
+ # Lemniscate curve in camera space. Starting at the origin.
453
+ thetas = (
454
+ torch.linspace(0, 2 * torch.pi, num_frames, device=ref_w2c.device)
455
+ if endpoint
456
+ else torch.linspace(0, 2 * torch.pi, num_frames + 1, device=ref_w2c.device)[:-1]
457
+ ) + torch.pi / 2
458
+ positions = torch.stack(
459
+ [
460
+ a * torch.cos(thetas) / (1 + torch.sin(thetas) ** 2),
461
+ a * torch.cos(thetas) * torch.sin(thetas) / (1 + torch.sin(thetas) ** 2),
462
+ torch.zeros(num_frames, device=ref_w2c.device),
463
+ ],
464
+ dim=-1,
465
+ )
466
+ # Transform to world space.
467
+ positions = torch.einsum(
468
+ "ij,nj->ni", ref_c2w[:3], F.pad(positions, (0, 1), value=1.0)
469
+ )
470
+ if up is None:
471
+ up = -ref_c2w[:3, 1]
472
+ assert up is not None
473
+ return get_lookat_w2cs(positions, lookat, up)
474
+
475
+
476
+ def get_moving_w2cs(
477
+ ref_w2c: torch.Tensor,
478
+ lookat: torch.Tensor,
479
+ up: torch.Tensor | None,
480
+ num_frames: int,
481
+ endpoint: bool = False,
482
+ direction: str = "forward",
483
+ tilt_xy: torch.Tensor = None,
484
+ ):
485
+ """
486
+ Args:
487
+ ref_w2c: (4, 4) tensor of the reference world-to-camera matrix
488
+ lookat: (3,) tensor of lookat point
489
+ up: (3,) tensor of up vector
490
+
491
+ Returns:
492
+ w2cs: (N, 4, 4) tensor of world-to-camera matrices
493
+ """
494
+ ref_c2w = torch.linalg.inv(ref_w2c)
495
+ ref_position = ref_c2w[:3, -1]
496
+ if up is None:
497
+ up = -ref_c2w[:3, 1]
498
+
499
+ direction_vectors = {
500
+ "forward": (lookat - ref_position).clone(),
501
+ "backward": -(lookat - ref_position).clone(),
502
+ "up": up.clone(),
503
+ "down": -up.clone(),
504
+ "right": torch.cross((lookat - ref_position), up, dim=0),
505
+ "left": -torch.cross((lookat - ref_position), up, dim=0),
506
+ }
507
+ if direction not in direction_vectors:
508
+ raise ValueError(
509
+ f"Invalid direction: {direction}. Must be one of {list(direction_vectors.keys())}"
510
+ )
511
+
512
+ positions = ref_position + (
513
+ F.normalize(direction_vectors[direction], dim=0)
514
+ * (
515
+ torch.linspace(0, 0.99, num_frames, device=ref_w2c.device)
516
+ if endpoint
517
+ else torch.linspace(0, 1, num_frames + 1, device=ref_w2c.device)[:-1]
518
+ )[:, None]
519
+ )
520
+
521
+ if tilt_xy is not None:
522
+ positions[:, :2] += tilt_xy
523
+
524
+ return get_lookat_w2cs(positions, lookat, up)
525
+
526
+
527
+ def get_roll_w2cs(
528
+ ref_w2c: torch.Tensor,
529
+ lookat: torch.Tensor,
530
+ up: torch.Tensor | None,
531
+ num_frames: int,
532
+ endpoint: bool = False,
533
+ degree: float = 360.0,
534
+ **_,
535
+ ) -> torch.Tensor:
536
+ ref_c2w = torch.linalg.inv(ref_w2c)
537
+ ref_position = ref_c2w[:3, 3]
538
+ if up is None:
539
+ up = -ref_c2w[:3, 1] # Infer the up vector from the reference.
540
+
541
+ # Create vertical angles
542
+ thetas = (
543
+ torch.linspace(0.0, torch.pi * degree / 180, num_frames, device=ref_w2c.device)
544
+ if endpoint
545
+ else torch.linspace(
546
+ 0.0, torch.pi * degree / 180, num_frames + 1, device=ref_w2c.device
547
+ )[:-1]
548
+ )[:, None]
549
+
550
+ lookat_vector = F.normalize(lookat[None].float(), dim=-1)
551
+ up = up[None]
552
+ up = (
553
+ up * torch.cos(thetas)
554
+ + torch.cross(lookat_vector, up) * torch.sin(thetas)
555
+ + lookat_vector
556
+ * torch.einsum("ij,ij->i", lookat_vector, up)[:, None]
557
+ * (1 - torch.cos(thetas))
558
+ )
559
+
560
+ # Normalize the camera orientation
561
+ return get_lookat_w2cs(ref_position[None].repeat(num_frames, 1), lookat, up)
562
+
563
+
564
+ def normalize(x):
565
+ """Normalization helper function."""
566
+ return x / np.linalg.norm(x)
567
+
568
+
569
+ def viewmatrix(lookdir, up, position, subtract_position=False):
570
+ """Construct lookat view matrix."""
571
+ vec2 = normalize((lookdir - position) if subtract_position else lookdir)
572
+ vec0 = normalize(np.cross(up, vec2))
573
+ vec1 = normalize(np.cross(vec2, vec0))
574
+ m = np.stack([vec0, vec1, vec2, position], axis=1)
575
+ return m
576
+
577
+
578
+ def poses_avg(poses):
579
+ """New pose using average position, z-axis, and up vector of input poses."""
580
+ position = poses[:, :3, 3].mean(0)
581
+ z_axis = poses[:, :3, 2].mean(0)
582
+ up = poses[:, :3, 1].mean(0)
583
+ cam2world = viewmatrix(z_axis, up, position)
584
+ return cam2world
585
+
586
+
587
+ def generate_spiral_path(
588
+ poses, bounds, n_frames=120, n_rots=2, zrate=0.5, endpoint=False, radii=None
589
+ ):
590
+ """Calculates a forward facing spiral path for rendering."""
591
+ # Find a reasonable 'focus depth' for this dataset as a weighted average
592
+ # of near and far bounds in disparity space.
593
+ close_depth, inf_depth = bounds.min() * 0.9, bounds.max() * 5.0
594
+ dt = 0.75
595
+ focal = 1 / ((1 - dt) / close_depth + dt / inf_depth)
596
+
597
+ # Get radii for spiral path using 90th percentile of camera positions.
598
+ positions = poses[:, :3, 3]
599
+ if radii is None:
600
+ radii = np.percentile(np.abs(positions), 90, 0)
601
+ radii = np.concatenate([radii, [1.0]])
602
+
603
+ # Generate poses for spiral path.
604
+ render_poses = []
605
+ cam2world = poses_avg(poses)
606
+ up = poses[:, :3, 1].mean(0)
607
+ for theta in np.linspace(0.0, 2.0 * np.pi * n_rots, n_frames, endpoint=endpoint):
608
+ t = radii * [np.cos(theta), -np.sin(theta), -np.sin(theta * zrate), 1.0]
609
+ position = cam2world @ t
610
+ lookat = cam2world @ [0, 0, -focal, 1.0]
611
+ z_axis = position - lookat
612
+ render_poses.append(viewmatrix(z_axis, up, position))
613
+ render_poses = np.stack(render_poses, axis=0)
614
+ return render_poses
615
+
616
+
617
+ def generate_interpolated_path(
618
+ poses: np.ndarray,
619
+ n_interp: int,
620
+ spline_degree: int = 5,
621
+ smoothness: float = 0.03,
622
+ rot_weight: float = 0.1,
623
+ endpoint: bool = False,
624
+ ):
625
+ """Creates a smooth spline path between input keyframe camera poses.
626
+
627
+ Spline is calculated with poses in format (position, lookat-point, up-point).
628
+
629
+ Args:
630
+ poses: (n, 3, 4) array of input pose keyframes.
631
+ n_interp: returned path will have n_interp * (n - 1) total poses.
632
+ spline_degree: polynomial degree of B-spline.
633
+ smoothness: parameter for spline smoothing, 0 forces exact interpolation.
634
+ rot_weight: relative weighting of rotation/translation in spline solve.
635
+
636
+ Returns:
637
+ Array of new camera poses with shape (n_interp * (n - 1), 3, 4).
638
+ """
639
+
640
+ def poses_to_points(poses, dist):
641
+ """Converts from pose matrices to (position, lookat, up) format."""
642
+ pos = poses[:, :3, -1]
643
+ lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
644
+ up = poses[:, :3, -1] + dist * poses[:, :3, 1]
645
+ return np.stack([pos, lookat, up], 1)
646
+
647
+ def points_to_poses(points):
648
+ """Converts from (position, lookat, up) format to pose matrices."""
649
+ return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])
650
+
651
+ def interp(points, n, k, s):
652
+ """Runs multidimensional B-spline interpolation on the input points."""
653
+ sh = points.shape
654
+ pts = np.reshape(points, (sh[0], -1))
655
+ k = min(k, sh[0] - 1)
656
+ tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
657
+ u = np.linspace(0, 1, n, endpoint=endpoint)
658
+ new_points = np.array(scipy.interpolate.splev(u, tck))
659
+ new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
660
+ return new_points
661
+
662
+ points = poses_to_points(poses, dist=rot_weight)
663
+ new_points = interp(
664
+ points, n_interp * (points.shape[0] - 1), k=spline_degree, s=smoothness
665
+ )
666
+ return points_to_poses(new_points)
667
+
668
+
669
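A short usage sketch for the spline interpolation above (the keyframes here are random placeholders, not real camera poses):

    import numpy as np
    rng = np.random.default_rng(0)
    keyframes = rng.normal(size=(4, 3, 4))              # hypothetical (n, 3, 4) c2w keyframes
    path = generate_interpolated_path(keyframes, n_interp=30)
    assert path.shape == (30 * (4 - 1), 3, 4)           # n_interp * (n - 1) poses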
+ def similarity_from_cameras(c2w, strict_scaling=False, center_method="focus"):
670
+ """
671
+ reference: nerf-factory
672
+ Get a similarity transform to normalize dataset
673
+ from c2w (OpenCV convention) cameras
674
+ :param c2w: (N, 4)
675
+ :return T (4,4) , scale (float)
676
+ """
677
+ t = c2w[:, :3, 3]
678
+ R = c2w[:, :3, :3]
679
+
680
+ # (1) Rotate the world so that z+ is the up axis
681
+ # we estimate the up axis by averaging the camera up axes
682
+ ups = np.sum(R * np.array([0, -1.0, 0]), axis=-1)
683
+ world_up = np.mean(ups, axis=0)
684
+ world_up /= np.linalg.norm(world_up)
685
+
686
+ up_camspace = np.array([0.0, -1.0, 0.0])
687
+ c = (up_camspace * world_up).sum()
688
+ cross = np.cross(world_up, up_camspace)
689
+ skew = np.array(
690
+ [
691
+ [0.0, -cross[2], cross[1]],
692
+ [cross[2], 0.0, -cross[0]],
693
+ [-cross[1], cross[0], 0.0],
694
+ ]
695
+ )
696
+ if c > -1:
697
+ R_align = np.eye(3) + skew + (skew @ skew) * 1 / (1 + c)
698
+ else:
699
+ # In the unlikely case the original data has y+ up axis,
700
+ # rotate 180-deg about x axis
701
+ R_align = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
702
+
703
+ # R_align = np.eye(3) # DEBUG
704
+ R = R_align @ R
705
+ fwds = np.sum(R * np.array([0, 0.0, 1.0]), axis=-1)
706
+ t = (R_align @ t[..., None])[..., 0]
707
+
708
+ # (2) Recenter the scene.
709
+ if center_method == "focus":
710
+ # find the closest point to the origin for each camera's center ray
711
+ nearest = t + (fwds * -t).sum(-1)[:, None] * fwds
712
+ translate = -np.median(nearest, axis=0)
713
+ elif center_method == "poses":
714
+ # use center of the camera positions
715
+ translate = -np.median(t, axis=0)
716
+ else:
717
+ raise ValueError(f"Unknown center_method {center_method}")
718
+
719
+ transform = np.eye(4)
720
+ transform[:3, 3] = translate
721
+ transform[:3, :3] = R_align
722
+
723
+ # (3) Rescale the scene using camera distances
724
+ scale_fn = np.max if strict_scaling else np.median
725
+ inv_scale = scale_fn(np.linalg.norm(t + translate, axis=-1))
726
+ if inv_scale == 0:
727
+ inv_scale = 1.0
728
+ scale = 1.0 / inv_scale
729
+ transform[:3, :] *= scale
730
+
731
+ return transform
732
+
733
+
734
+ def align_principle_axes(point_cloud):
735
+ # Compute centroid
736
+ centroid = np.median(point_cloud, axis=0)
737
+
738
+ # Translate point cloud to centroid
739
+ translated_point_cloud = point_cloud - centroid
740
+
741
+ # Compute covariance matrix
742
+ covariance_matrix = np.cov(translated_point_cloud, rowvar=False)
743
+
744
+ # Compute eigenvectors and eigenvalues
745
+ eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
746
+
747
+ # Sort eigenvectors by eigenvalues (descending order) so that the z-axis
748
+ # is the principal axis with the smallest eigenvalue.
749
+ sort_indices = eigenvalues.argsort()[::-1]
750
+ eigenvectors = eigenvectors[:, sort_indices]
751
+
752
+ # Check orientation of eigenvectors. If the determinant of the eigenvectors is
753
+ # negative, then we need to flip the sign of one of the eigenvectors.
754
+ if np.linalg.det(eigenvectors) < 0:
755
+ eigenvectors[:, 0] *= -1
756
+
757
+ # Create rotation matrix
758
+ rotation_matrix = eigenvectors.T
759
+
760
+ # Create SE(3) matrix (4x4 transformation matrix)
761
+ transform = np.eye(4)
762
+ transform[:3, :3] = rotation_matrix
763
+ transform[:3, 3] = -rotation_matrix @ centroid
764
+
765
+ return transform
766
+
767
+
768
+ def transform_points(matrix, points):
769
+ """Transform points using a SE(4) matrix.
770
+
771
+ Args:
772
+ matrix: 4x4 SE(3) matrix
773
+ points: Nx3 array of points
774
+
775
+ Returns:
776
+ Nx3 array of transformed points
777
+ """
778
+ assert matrix.shape == (4, 4)
779
+ assert len(points.shape) == 2 and points.shape[1] == 3
780
+ return points @ matrix[:3, :3].T + matrix[:3, 3]
781
+
782
+
783
+ def transform_cameras(matrix, camtoworlds):
784
+ """Transform cameras using a SE(4) matrix.
785
+
786
+ Args:
787
+ matrix: 4x4 SE(3) matrix
788
+ camtoworlds: Nx4x4 array of camera-to-world matrices
789
+
790
+ Returns:
791
+ Nx4x4 array of transformed camera-to-world matrices
792
+ """
793
+ assert matrix.shape == (4, 4)
794
+ assert len(camtoworlds.shape) == 3 and camtoworlds.shape[1:] == (4, 4)
795
+ camtoworlds = np.einsum("nij, ki -> nkj", camtoworlds, matrix)
796
+ scaling = np.linalg.norm(camtoworlds[:, 0, :3], axis=1)
797
+ camtoworlds[:, :3, :3] = camtoworlds[:, :3, :3] / scaling[:, None, None]
798
+ return camtoworlds
799
+
800
+
801
+ def normalize_scene(camtoworlds, points=None, camera_center_method="focus"):
802
+ T1 = similarity_from_cameras(camtoworlds, center_method=camera_center_method)
803
+ camtoworlds = transform_cameras(T1, camtoworlds)
804
+ if points is not None:
805
+ points = transform_points(T1, points)
806
+ T2 = align_principle_axes(points)
807
+ camtoworlds = transform_cameras(T2, camtoworlds)
808
+ points = transform_points(T2, points)
809
+ return camtoworlds, points, T2 @ T1
810
+ else:
811
+ return camtoworlds, T1
seva/gui.py ADDED
@@ -0,0 +1,975 @@
1
+ import colorsys
2
+ import dataclasses
3
+ import threading
4
+ import time
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import scipy
9
+ import splines
10
+ import splines.quaternion
11
+ import torch
12
+ import viser
13
+ import viser.transforms as vt
14
+
15
+ from seva.geometry import get_preset_pose_fov
16
+
17
+
18
+ @dataclasses.dataclass
19
+ class Keyframe(object):
20
+ position: np.ndarray
21
+ wxyz: np.ndarray
22
+ override_fov_enabled: bool
23
+ override_fov_rad: float
24
+ aspect: float
25
+ override_transition_enabled: bool
26
+ override_transition_sec: float | None
27
+
28
+ @staticmethod
29
+ def from_camera(camera: viser.CameraHandle, aspect: float) -> "Keyframe":
30
+ return Keyframe(
31
+ camera.position,
32
+ camera.wxyz,
33
+ override_fov_enabled=False,
34
+ override_fov_rad=camera.fov,
35
+ aspect=aspect,
36
+ override_transition_enabled=False,
37
+ override_transition_sec=None,
38
+ )
39
+
40
+ @staticmethod
41
+ def from_se3(se3: vt.SE3, fov: float, aspect: float) -> "Keyframe":
42
+ return Keyframe(
43
+ se3.translation(),
44
+ se3.rotation().wxyz,
45
+ override_fov_enabled=False,
46
+ override_fov_rad=fov,
47
+ aspect=aspect,
48
+ override_transition_enabled=False,
49
+ override_transition_sec=None,
50
+ )
51
+
52
+
53
+ class CameraTrajectory(object):
54
+ def __init__(
55
+ self,
56
+ server: viser.ViserServer,
57
+ duration_element: viser.GuiInputHandle[float],
58
+ scene_scale: float,
59
+ scene_node_prefix: str = "/",
60
+ ):
61
+ self._server = server
62
+ self._keyframes: dict[int, tuple[Keyframe, viser.CameraFrustumHandle]] = {}
63
+ self._keyframe_counter: int = 0
64
+ self._spline_nodes: list[viser.SceneNodeHandle] = []
65
+ self._camera_edit_panel: viser.Gui3dContainerHandle | None = None
66
+
67
+ self._orientation_spline: splines.quaternion.KochanekBartels | None = None
68
+ self._position_spline: splines.KochanekBartels | None = None
69
+ self._fov_spline: splines.KochanekBartels | None = None
70
+
71
+ self._keyframes_visible: bool = True
72
+
73
+ self._duration_element = duration_element
74
+ self._scene_node_prefix = scene_node_prefix
75
+
76
+ self.scene_scale = scene_scale
77
+ # These parameters should be overridden externally.
78
+ self.loop: bool = False
79
+ self.framerate: float = 30.0
80
+ self.tension: float = 0.0 # Tension / alpha term.
81
+ self.default_fov: float = 0.0
82
+ self.default_transition_sec: float = 0.0
83
+ self.show_spline: bool = True
84
+
85
+ def set_keyframes_visible(self, visible: bool) -> None:
86
+ self._keyframes_visible = visible
87
+ for keyframe in self._keyframes.values():
88
+ keyframe[1].visible = visible
89
+
90
+ def add_camera(self, keyframe: Keyframe, keyframe_index: int | None = None) -> None:
91
+ """Add a new camera, or replace an old one if `keyframe_index` is passed in."""
92
+ server = self._server
93
+
94
+ # Add a keyframe if we aren't replacing an existing one.
95
+ if keyframe_index is None:
96
+ keyframe_index = self._keyframe_counter
97
+ self._keyframe_counter += 1
98
+
99
+ print(
100
+ f"{keyframe.wxyz=} {keyframe.position=} {keyframe_index=} {keyframe.aspect=}"
101
+ )
102
+ frustum_handle = server.scene.add_camera_frustum(
103
+ str(Path(self._scene_node_prefix) / f"cameras/{keyframe_index}"),
104
+ fov=(
105
+ keyframe.override_fov_rad
106
+ if keyframe.override_fov_enabled
107
+ else self.default_fov
108
+ ),
109
+ aspect=keyframe.aspect,
110
+ scale=0.1 * self.scene_scale,
111
+ color=(200, 10, 30),
112
+ wxyz=keyframe.wxyz,
113
+ position=keyframe.position,
114
+ visible=self._keyframes_visible,
115
+ )
116
+ self._server.scene.add_icosphere(
117
+ str(Path(self._scene_node_prefix) / f"cameras/{keyframe_index}/sphere"),
118
+ radius=0.03,
119
+ color=(200, 10, 30),
120
+ )
121
+
122
+ @frustum_handle.on_click
123
+ def _(_) -> None:
124
+ if self._camera_edit_panel is not None:
125
+ self._camera_edit_panel.remove()
126
+ self._camera_edit_panel = None
127
+
128
+ with server.scene.add_3d_gui_container(
129
+ "/camera_edit_panel",
130
+ position=keyframe.position,
131
+ ) as camera_edit_panel:
132
+ self._camera_edit_panel = camera_edit_panel
133
+ override_fov = server.gui.add_checkbox(
134
+ "Override FOV", initial_value=keyframe.override_fov_enabled
135
+ )
136
+ override_fov_degrees = server.gui.add_slider(
137
+ "Override FOV (degrees)",
138
+ 5.0,
139
+ 175.0,
140
+ step=0.1,
141
+ initial_value=keyframe.override_fov_rad * 180.0 / np.pi,
142
+ disabled=not keyframe.override_fov_enabled,
143
+ )
144
+ delete_button = server.gui.add_button(
145
+ "Delete", color="red", icon=viser.Icon.TRASH
146
+ )
147
+ go_to_button = server.gui.add_button("Go to")
148
+ close_button = server.gui.add_button("Close")
149
+
150
+ @override_fov.on_update
151
+ def _(_) -> None:
152
+ keyframe.override_fov_enabled = override_fov.value
153
+ override_fov_degrees.disabled = not override_fov.value
154
+ self.add_camera(keyframe, keyframe_index)
155
+
156
+ @override_fov_degrees.on_update
157
+ def _(_) -> None:
158
+ keyframe.override_fov_rad = override_fov_degrees.value / 180.0 * np.pi
159
+ self.add_camera(keyframe, keyframe_index)
160
+
161
+ @delete_button.on_click
162
+ def _(event: viser.GuiEvent) -> None:
163
+ assert event.client is not None
164
+ with event.client.gui.add_modal("Confirm") as modal:
165
+ event.client.gui.add_markdown("Delete keyframe?")
166
+ confirm_button = event.client.gui.add_button(
167
+ "Yes", color="red", icon=viser.Icon.TRASH
168
+ )
169
+ exit_button = event.client.gui.add_button("Cancel")
170
+
171
+ @confirm_button.on_click
172
+ def _(_) -> None:
173
+ assert camera_edit_panel is not None
174
+
175
+ keyframe_id = None
176
+ for i, keyframe_tuple in self._keyframes.items():
177
+ if keyframe_tuple[1] is frustum_handle:
178
+ keyframe_id = i
179
+ break
180
+ assert keyframe_id is not None
181
+
182
+ self._keyframes.pop(keyframe_id)
183
+ frustum_handle.remove()
184
+ camera_edit_panel.remove()
185
+ self._camera_edit_panel = None
186
+ modal.close()
187
+ self.update_spline()
188
+
189
+ @exit_button.on_click
190
+ def _(_) -> None:
191
+ modal.close()
192
+
193
+ @go_to_button.on_click
194
+ def _(event: viser.GuiEvent) -> None:
195
+ assert event.client is not None
196
+ client = event.client
197
+ T_world_current = vt.SE3.from_rotation_and_translation(
198
+ vt.SO3(client.camera.wxyz), client.camera.position
199
+ )
200
+ T_world_target = vt.SE3.from_rotation_and_translation(
201
+ vt.SO3(keyframe.wxyz), keyframe.position
202
+ ) @ vt.SE3.from_translation(np.array([0.0, 0.0, -0.5]))
203
+
204
+ T_current_target = T_world_current.inverse() @ T_world_target
205
+
206
+ for j in range(10):
207
+ T_world_set = T_world_current @ vt.SE3.exp(
208
+ T_current_target.log() * j / 9.0
209
+ )
210
+
211
+ # Important bit: we atomically set both the orientation and
212
+ # the position of the camera.
213
+ with client.atomic():
214
+ client.camera.wxyz = T_world_set.rotation().wxyz
215
+ client.camera.position = T_world_set.translation()
216
+ time.sleep(1.0 / 30.0)
217
+
218
+ @close_button.on_click
219
+ def _(_) -> None:
220
+ assert camera_edit_panel is not None
221
+ camera_edit_panel.remove()
222
+ self._camera_edit_panel = None
223
+
224
+ self._keyframes[keyframe_index] = (keyframe, frustum_handle)
225
+
226
+ def update_aspect(self, aspect: float) -> None:
227
+ for keyframe_index, frame in self._keyframes.items():
228
+ frame = dataclasses.replace(frame[0], aspect=aspect)
229
+ self.add_camera(frame, keyframe_index=keyframe_index)
230
+
231
+ def get_aspect(self) -> float:
232
+ """Get W/H aspect ratio, which is shared across all keyframes."""
233
+ assert len(self._keyframes) > 0
234
+ return next(iter(self._keyframes.values()))[0].aspect
235
+
236
+ def reset(self) -> None:
237
+ for frame in self._keyframes.values():
238
+ print(f"removing {frame[1]}")
239
+ frame[1].remove()
240
+ self._keyframes.clear()
241
+ self.update_spline()
242
+ print("camera traj reset")
243
+
244
+ def spline_t_from_t_sec(self, time: np.ndarray) -> np.ndarray:
245
+ """From a time value in seconds, compute a t value for our geometric
246
+ spline interpolation. An increment of 1 for the latter will move the
247
+ camera forward by one keyframe.
248
+
249
+ We use a PCHIP spline here to guarantee monotonicity.
250
+ """
251
+ transition_times_cumsum = self.compute_transition_times_cumsum()
252
+ spline_indices = np.arange(transition_times_cumsum.shape[0])
253
+
254
+ if self.loop:
255
+ # In the case of a loop, we pad the spline to match the start/end
256
+ # slopes.
257
+ interpolator = scipy.interpolate.PchipInterpolator(
258
+ x=np.concatenate(
259
+ [
260
+ [-(transition_times_cumsum[-1] - transition_times_cumsum[-2])],
261
+ transition_times_cumsum,
262
+ transition_times_cumsum[-1:] + transition_times_cumsum[1:2],
263
+ ],
264
+ axis=0,
265
+ ),
266
+ y=np.concatenate(
267
+ [[-1], spline_indices, [spline_indices[-1] + 1]], # type: ignore
268
+ axis=0,
269
+ ),
270
+ )
271
+ else:
272
+ interpolator = scipy.interpolate.PchipInterpolator(
273
+ x=transition_times_cumsum, y=spline_indices
274
+ )
275
+
276
+ # Clip to account for floating point error.
277
+ return np.clip(interpolator(time), 0, spline_indices[-1])
278
+
279
+ def interpolate_pose_and_fov_rad(
280
+ self, normalized_t: float
281
+ ) -> tuple[vt.SE3, float] | None:
282
+ if len(self._keyframes) < 2:
283
+ return None
284
+
285
+ self._fov_spline = splines.KochanekBartels(
286
+ [
287
+ (
288
+ keyframe[0].override_fov_rad
289
+ if keyframe[0].override_fov_enabled
290
+ else self.default_fov
291
+ )
292
+ for keyframe in self._keyframes.values()
293
+ ],
294
+ tcb=(self.tension, 0.0, 0.0),
295
+ endconditions="closed" if self.loop else "natural",
296
+ )
297
+
298
+ assert self._orientation_spline is not None
299
+ assert self._position_spline is not None
300
+ assert self._fov_spline is not None
301
+
302
+ max_t = self.compute_duration()
303
+ t = max_t * normalized_t
304
+ spline_t = float(self.spline_t_from_t_sec(np.array(t)))
305
+
306
+ quat = self._orientation_spline.evaluate(spline_t)
307
+ assert isinstance(quat, splines.quaternion.UnitQuaternion)
308
+ return (
309
+ vt.SE3.from_rotation_and_translation(
310
+ vt.SO3(np.array([quat.scalar, *quat.vector])),
311
+ self._position_spline.evaluate(spline_t),
312
+ ),
313
+ float(self._fov_spline.evaluate(spline_t)),
314
+ )
315
+
316
+ def update_spline(self) -> None:
317
+ num_frames = int(self.compute_duration() * self.framerate)
318
+ keyframes = list(self._keyframes.values())
319
+
320
+ if num_frames <= 0 or not self.show_spline or len(keyframes) < 2:
321
+ for node in self._spline_nodes:
322
+ node.remove()
323
+ self._spline_nodes.clear()
324
+ return
325
+
326
+ transition_times_cumsum = self.compute_transition_times_cumsum()
327
+
328
+ self._orientation_spline = splines.quaternion.KochanekBartels(
329
+ [
330
+ splines.quaternion.UnitQuaternion.from_unit_xyzw(
331
+ np.roll(keyframe[0].wxyz, shift=-1)
332
+ )
333
+ for keyframe in keyframes
334
+ ],
335
+ tcb=(self.tension, 0.0, 0.0),
336
+ endconditions="closed" if self.loop else "natural",
337
+ )
338
+ self._position_spline = splines.KochanekBartels(
339
+ [keyframe[0].position for keyframe in keyframes],
340
+ tcb=(self.tension, 0.0, 0.0),
341
+ endconditions="closed" if self.loop else "natural",
342
+ )
343
+
344
+ # Update visualized spline.
345
+ points_array = self._position_spline.evaluate(
346
+ self.spline_t_from_t_sec(
347
+ np.linspace(0, transition_times_cumsum[-1], num_frames)
348
+ )
349
+ )
350
+ colors_array = np.array(
351
+ [
352
+ colorsys.hls_to_rgb(h, 0.5, 1.0)
353
+ for h in np.linspace(0.0, 1.0, len(points_array))
354
+ ]
355
+ )
356
+
357
+ # Clear prior spline nodes.
358
+ for node in self._spline_nodes:
359
+ node.remove()
360
+ self._spline_nodes.clear()
361
+
362
+ self._spline_nodes.append(
363
+ self._server.scene.add_spline_catmull_rom(
364
+ str(Path(self._scene_node_prefix) / "camera_spline"),
365
+ positions=points_array,
366
+ color=(220, 220, 220),
367
+ closed=self.loop,
368
+ line_width=1.0,
369
+ segments=points_array.shape[0] + 1,
370
+ )
371
+ )
372
+ self._spline_nodes.append(
373
+ self._server.scene.add_point_cloud(
374
+ str(Path(self._scene_node_prefix) / "camera_spline/points"),
375
+ points=points_array,
376
+ colors=colors_array,
377
+ point_size=0.04,
378
+ )
379
+ )
380
+
381
+ def make_transition_handle(i: int) -> None:
382
+ assert self._position_spline is not None
383
+ transition_pos = self._position_spline.evaluate(
384
+ float(
385
+ self.spline_t_from_t_sec(
386
+ (transition_times_cumsum[i] + transition_times_cumsum[i + 1])
387
+ / 2.0,
388
+ )
389
+ )
390
+ )
391
+ transition_sphere = self._server.scene.add_icosphere(
392
+ str(Path(self._scene_node_prefix) / f"camera_spline/transition_{i}"),
393
+ radius=0.04,
394
+ color=(255, 0, 0),
395
+ position=transition_pos,
396
+ )
397
+ self._spline_nodes.append(transition_sphere)
398
+
399
+ @transition_sphere.on_click
400
+ def _(_) -> None:
401
+ server = self._server
402
+
403
+ if self._camera_edit_panel is not None:
404
+ self._camera_edit_panel.remove()
405
+ self._camera_edit_panel = None
406
+
407
+ keyframe_index = (i + 1) % len(self._keyframes)
408
+ keyframe = keyframes[keyframe_index][0]
409
+
410
+ with server.scene.add_3d_gui_container(
411
+ "/camera_edit_panel",
412
+ position=transition_pos,
413
+ ) as camera_edit_panel:
414
+ self._camera_edit_panel = camera_edit_panel
415
+ override_transition_enabled = server.gui.add_checkbox(
416
+ "Override transition",
417
+ initial_value=keyframe.override_transition_enabled,
418
+ )
419
+ override_transition_sec = server.gui.add_number(
420
+ "Override transition (sec)",
421
+ initial_value=(
422
+ keyframe.override_transition_sec
423
+ if keyframe.override_transition_sec is not None
424
+ else self.default_transition_sec
425
+ ),
426
+ min=0.001,
427
+ max=30.0,
428
+ step=0.001,
429
+ disabled=not override_transition_enabled.value,
430
+ )
431
+ close_button = server.gui.add_button("Close")
432
+
433
+ @override_transition_enabled.on_update
434
+ def _(_) -> None:
435
+ keyframe.override_transition_enabled = (
436
+ override_transition_enabled.value
437
+ )
438
+ override_transition_sec.disabled = (
439
+ not override_transition_enabled.value
440
+ )
441
+ self._duration_element.value = self.compute_duration()
442
+
443
+ @override_transition_sec.on_update
444
+ def _(_) -> None:
445
+ keyframe.override_transition_sec = override_transition_sec.value
446
+ self._duration_element.value = self.compute_duration()
447
+
448
+ @close_button.on_click
449
+ def _(_) -> None:
450
+ assert camera_edit_panel is not None
451
+ camera_edit_panel.remove()
452
+ self._camera_edit_panel = None
453
+
454
+ (num_transitions_plus_1,) = transition_times_cumsum.shape
455
+ for i in range(num_transitions_plus_1 - 1):
456
+ make_transition_handle(i)
457
+
458
+ def compute_duration(self) -> float:
459
+ """Compute the total duration of the trajectory."""
460
+ total = 0.0
461
+ for i, (keyframe, frustum) in enumerate(self._keyframes.values()):
462
+ if i == 0 and not self.loop:
463
+ continue
464
+ del frustum
465
+ total += (
466
+ keyframe.override_transition_sec
467
+ if keyframe.override_transition_enabled
468
+ and keyframe.override_transition_sec is not None
469
+ else self.default_transition_sec
470
+ )
471
+ return total
472
+
473
+ def compute_transition_times_cumsum(self) -> np.ndarray:
474
+ """Compute the total duration of the trajectory."""
475
+ total = 0.0
476
+ out = [0.0]
477
+ for i, (keyframe, frustum) in enumerate(self._keyframes.values()):
478
+ if i == 0:
479
+ continue
480
+ del frustum
481
+ total += (
482
+ keyframe.override_transition_sec
483
+ if keyframe.override_transition_enabled
484
+ and keyframe.override_transition_sec is not None
485
+ else self.default_transition_sec
486
+ )
487
+ out.append(total)
488
+
489
+ if self.loop:
490
+ keyframe = next(iter(self._keyframes.values()))[0]
491
+ total += (
492
+ keyframe.override_transition_sec
493
+ if keyframe.override_transition_enabled
494
+ and keyframe.override_transition_sec is not None
495
+ else self.default_transition_sec
496
+ )
497
+ out.append(total)
498
+
499
+ return np.array(out)
500
+
501
+
502
+ @dataclasses.dataclass
503
+ class GuiState:
504
+ preview_render: bool
505
+ preview_fov: float
506
+ preview_aspect: float
507
+ camera_traj_list: list | None
508
+ active_input_index: int
509
+
510
+
511
+ def define_gui(
512
+ server: viser.ViserServer,
513
+ init_fov: float = 75.0,
514
+ img_wh: tuple[int, int] = (576, 576),
515
+ **kwargs,
516
+ ) -> GuiState:
517
+ gui_state = GuiState(
518
+ preview_render=False,
519
+ preview_fov=0.0,
520
+ preview_aspect=1.0,
521
+ camera_traj_list=None,
522
+ active_input_index=0,
523
+ )
524
+
525
+ with server.gui.add_folder(
526
+ "Preset camera trajectories", order=99, expand_by_default=False
527
+ ):
528
+ preset_traj_dropdown = server.gui.add_dropdown(
529
+ "Options",
530
+ [
531
+ "orbit",
532
+ "spiral",
533
+ "lemniscate",
534
+ "zoom-out",
535
+ "dolly zoom-out",
536
+ ],
537
+ initial_value="orbit",
538
+ hint="Select a preset camera trajectory.",
539
+ )
540
+ preset_duration_num = server.gui.add_number(
541
+ "Duration (sec)",
542
+ min=1.0,
543
+ max=60.0,
544
+ step=0.5,
545
+ initial_value=2.0,
546
+ )
547
+ preset_submit_button = server.gui.add_button(
548
+ "Submit",
549
+ icon=viser.Icon.PICK,
550
+ hint="Add a new keyframe at the current pose.",
551
+ )
552
+
553
+ @preset_submit_button.on_click
554
+ def _(event: viser.GuiEvent) -> None:
555
+ camera_traj.reset()
556
+ gui_state.camera_traj_list = None
557
+
558
+ duration = preset_duration_num.value
559
+ fps = framerate_number.value
560
+ num_frames = int(duration * fps)
561
+ transition_sec = duration / num_frames
562
+ transition_sec_number.value = transition_sec
563
+ assert event.client_id is not None
564
+ transition_sec_number.disabled = True
565
+ loop_checkbox.disabled = True
566
+ add_keyframe_button.disabled = True
567
+
568
+ camera = server.get_clients()[event.client_id].camera
569
+ start_w2c = torch.linalg.inv(
570
+ torch.as_tensor(
571
+ vt.SE3.from_rotation_and_translation(
572
+ vt.SO3(camera.wxyz), camera.position
573
+ ).as_matrix(),
574
+ dtype=torch.float32,
575
+ )
576
+ )
577
+ look_at = torch.as_tensor(camera.look_at, dtype=torch.float32)
578
+ up_direction = torch.as_tensor(camera.up_direction, dtype=torch.float32)
579
+ poses, fovs = get_preset_pose_fov(
580
+ option=preset_traj_dropdown.value, # type: ignore
581
+ num_frames=num_frames,
582
+ start_w2c=start_w2c,
583
+ look_at=look_at,
584
+ up_direction=up_direction,
585
+ fov=camera.fov,
586
+ )
587
+ assert poses is not None and fovs is not None
588
+ for pose, fov in zip(poses, fovs):
589
+ camera_traj.add_camera(
590
+ Keyframe.from_se3(
591
+ vt.SE3.from_matrix(pose),
592
+ fov=fov,
593
+ aspect=img_wh[0] / img_wh[1],
594
+ )
595
+ )
596
+
597
+ duration_number.value = camera_traj.compute_duration()
598
+ camera_traj.update_spline()
599
+
600
+ with server.gui.add_folder("Advanced", expand_by_default=False, order=100):
601
+ transition_sec_number = server.gui.add_number(
602
+ "Transition (sec)",
603
+ min=0.001,
604
+ max=30.0,
605
+ step=0.001,
606
+ initial_value=1.5,
607
+ hint="Time in seconds between each keyframe, which can also be overridden on a per-transition basis.",
608
+ )
609
+ framerate_number = server.gui.add_number(
610
+ "FPS", min=0.1, max=240.0, step=1e-2, initial_value=30.0
611
+ )
612
+ framerate_buttons = server.gui.add_button_group("", ("24", "30", "60"))
613
+ duration_number = server.gui.add_number(
614
+ "Duration (sec)",
615
+ min=0.0,
616
+ max=1e8,
617
+ step=0.001,
618
+ initial_value=0.0,
619
+ disabled=True,
620
+ )
621
+
622
+ @framerate_buttons.on_click
623
+ def _(_) -> None:
624
+ framerate_number.value = float(framerate_buttons.value)
625
+
626
+ fov_degree_slider = server.gui.add_slider(
627
+ "FOV",
628
+ initial_value=init_fov,
629
+ min=0.1,
630
+ max=175.0,
631
+ step=0.01,
632
+ hint="Field-of-view for rendering, which can also be overridden on a per-keyframe basis.",
633
+ )
634
+
635
+ @fov_degree_slider.on_update
636
+ def _(_) -> None:
637
+ fov_radians = fov_degree_slider.value / 180.0 * np.pi
638
+ for client in server.get_clients().values():
639
+ client.camera.fov = fov_radians
640
+ camera_traj.default_fov = fov_radians
641
+
642
+ # Updating the aspect ratio will also re-render the camera frustums.
643
+ # Could rethink this.
644
+ camera_traj.update_aspect(img_wh[0] / img_wh[1])
645
+ compute_and_update_preview_camera_state()
646
+
647
+ scene_node_prefix = "/render_assets"
648
+ base_scene_node = server.scene.add_frame(scene_node_prefix, show_axes=False)
649
+ add_keyframe_button = server.gui.add_button(
650
+ "Add keyframe",
651
+ icon=viser.Icon.PLUS,
652
+ hint="Add a new keyframe at the current pose.",
653
+ )
654
+
655
+ @add_keyframe_button.on_click
656
+ def _(event: viser.GuiEvent) -> None:
657
+ assert event.client_id is not None
658
+ camera = server.get_clients()[event.client_id].camera
659
+ pose = vt.SE3.from_rotation_and_translation(
660
+ vt.SO3(camera.wxyz), camera.position
661
+ )
662
+ print(f"client {event.client_id} at {camera.position} {camera.wxyz}")
663
+ print(f"camera pose {pose.as_matrix()}")
664
+
665
+ # Add this camera to the trajectory.
666
+ camera_traj.add_camera(
667
+ Keyframe.from_camera(
668
+ camera,
669
+ aspect=img_wh[0] / img_wh[1],
670
+ ),
671
+ )
672
+ duration_number.value = camera_traj.compute_duration()
673
+ camera_traj.update_spline()
674
+
675
+ clear_keyframes_button = server.gui.add_button(
676
+ "Clear keyframes",
677
+ icon=viser.Icon.TRASH,
678
+ hint="Remove all keyframes from the render trajectory.",
679
+ )
680
+
681
+ @clear_keyframes_button.on_click
682
+ def _(event: viser.GuiEvent) -> None:
683
+ assert event.client_id is not None
684
+ client = server.get_clients()[event.client_id]
685
+ with client.atomic(), client.gui.add_modal("Confirm") as modal:
686
+ client.gui.add_markdown("Clear all keyframes?")
687
+ confirm_button = client.gui.add_button(
688
+ "Yes", color="red", icon=viser.Icon.TRASH
689
+ )
690
+ exit_button = client.gui.add_button("Cancel")
691
+
692
+ @confirm_button.on_click
693
+ def _(_) -> None:
694
+ camera_traj.reset()
695
+ modal.close()
696
+
697
+ duration_number.value = camera_traj.compute_duration()
698
+ add_keyframe_button.disabled = False
699
+ transition_sec_number.disabled = False
700
+ transition_sec_number.value = 1.5
701
+ loop_checkbox.disabled = False
702
+
703
+ nonlocal gui_state
704
+ gui_state.camera_traj_list = None
705
+
706
+ @exit_button.on_click
707
+ def _(_) -> None:
708
+ modal.close()
709
+
710
+ play_button = server.gui.add_button("Play", icon=viser.Icon.PLAYER_PLAY)
711
+ pause_button = server.gui.add_button(
712
+ "Pause", icon=viser.Icon.PLAYER_PAUSE, visible=False
713
+ )
714
+
715
+ # Poll the play button to see if we should be playing endlessly.
716
+ def play() -> None:
717
+ while True:
718
+ while not play_button.visible:
719
+ max_frame = int(framerate_number.value * duration_number.value)
720
+ if max_frame > 0:
721
+ assert preview_frame_slider is not None
722
+ preview_frame_slider.value = (
723
+ preview_frame_slider.value + 1
724
+ ) % max_frame
725
+ time.sleep(1.0 / framerate_number.value)
726
+ time.sleep(0.1)
727
+
728
+ threading.Thread(target=play).start()
729
+
730
+ # Play the camera trajectory when the play button is pressed.
731
+ @play_button.on_click
732
+ def _(_) -> None:
733
+ play_button.visible = False
734
+ pause_button.visible = True
735
+
736
+ # Pause trajectory playback when the pause button is pressed.
737
+ @pause_button.on_click
738
+ def _(_) -> None:
739
+ play_button.visible = True
740
+ pause_button.visible = False
741
+
742
+ preview_render_button = server.gui.add_button(
743
+ "Preview render",
744
+ hint="Show a preview of the render in the viewport.",
745
+ icon=viser.Icon.CAMERA_CHECK,
746
+ )
747
+ preview_render_stop_button = server.gui.add_button(
748
+ "Exit render preview",
749
+ color="red",
750
+ icon=viser.Icon.CAMERA_CANCEL,
751
+ visible=False,
752
+ )
753
+
754
+ @preview_render_button.on_click
755
+ def _(_) -> None:
756
+ gui_state.preview_render = True
757
+ preview_render_button.visible = False
758
+ preview_render_stop_button.visible = True
759
+ play_button.visible = False
760
+ pause_button.visible = True
761
+ preset_submit_button.disabled = True
762
+
763
+ maybe_pose_and_fov_rad = compute_and_update_preview_camera_state()
764
+ if maybe_pose_and_fov_rad is None:
765
+ remove_preview_camera()
766
+ return
767
+ pose, fov = maybe_pose_and_fov_rad
768
+ del fov
769
+
770
+ # Hide all render assets when we're previewing the render.
771
+ nonlocal base_scene_node
772
+ base_scene_node.visible = False
773
+
774
+ # Back up and then set camera poses.
775
+ for client in server.get_clients().values():
776
+ camera_pose_backup_from_id[client.client_id] = (
777
+ client.camera.position,
778
+ client.camera.look_at,
779
+ client.camera.up_direction,
780
+ )
781
+ with client.atomic():
782
+ client.camera.wxyz = pose.rotation().wxyz
783
+ client.camera.position = pose.translation()
784
+
785
+ def stop_preview_render() -> None:
786
+ gui_state.preview_render = False
787
+ preview_render_button.visible = True
788
+ preview_render_stop_button.visible = False
789
+ play_button.visible = True
790
+ pause_button.visible = False
791
+ preset_submit_button.disabled = False
792
+
793
+ # Revert camera poses.
794
+ for client in server.get_clients().values():
795
+ if client.client_id not in camera_pose_backup_from_id:
796
+ continue
797
+ cam_position, cam_look_at, cam_up = camera_pose_backup_from_id.pop(
798
+ client.client_id
799
+ )
800
+ with client.atomic():
801
+ client.camera.position = cam_position
802
+ client.camera.look_at = cam_look_at
803
+ client.camera.up_direction = cam_up
804
+ client.flush()
805
+
806
+ # Un-hide render assets.
807
+ nonlocal base_scene_node
808
+ base_scene_node.visible = True
809
+ remove_preview_camera()
810
+
811
+ @preview_render_stop_button.on_click
812
+ def _(_) -> None:
813
+ stop_preview_render()
814
+
815
+ def get_max_frame_index() -> int:
816
+ return max(1, int(framerate_number.value * duration_number.value) - 1)
817
+
818
+ def add_preview_frame_slider() -> viser.GuiInputHandle[int] | None:
819
+ """Helper for creating the current frame # slider. This is removed and
820
+ re-added anytime the `max` value changes."""
821
+
822
+ preview_frame_slider = server.gui.add_slider(
823
+ "Preview frame",
824
+ min=0,
825
+ max=get_max_frame_index(),
826
+ step=1,
827
+ initial_value=0,
828
+ order=set_traj_button.order + 0.01,
829
+ disabled=get_max_frame_index() == 1,
830
+ )
831
+ play_button.disabled = preview_frame_slider.disabled
832
+ preview_render_button.disabled = preview_frame_slider.disabled
833
+ set_traj_button.disabled = preview_frame_slider.disabled
834
+
835
+ @preview_frame_slider.on_update
836
+ def _(_) -> None:
837
+ nonlocal preview_camera_handle
838
+ maybe_pose_and_fov_rad = compute_and_update_preview_camera_state()
839
+ if maybe_pose_and_fov_rad is None:
840
+ return
841
+ pose, fov_rad = maybe_pose_and_fov_rad
842
+
843
+ preview_camera_handle = server.scene.add_camera_frustum(
844
+ str(Path(scene_node_prefix) / "preview_camera"),
845
+ fov=fov_rad,
846
+ aspect=img_wh[0] / img_wh[1],
847
+ scale=0.35,
848
+ wxyz=pose.rotation().wxyz,
849
+ position=pose.translation(),
850
+ color=(10, 200, 30),
851
+ )
852
+ if gui_state.preview_render:
853
+ for client in server.get_clients().values():
854
+ with client.atomic():
855
+ client.camera.wxyz = pose.rotation().wxyz
856
+ client.camera.position = pose.translation()
857
+
858
+ return preview_frame_slider
859
+
860
+ set_traj_button = server.gui.add_button(
861
+ "Set camera trajectory",
862
+ color="green",
863
+ icon=viser.Icon.CHECK,
864
+ hint="Save the camera trajectory for rendering.",
865
+ )
866
+
867
+ @set_traj_button.on_click
868
+ def _(event: viser.GuiEvent) -> None:
869
+ assert event.client is not None
870
+ num_frames = int(framerate_number.value * duration_number.value)
871
+
872
+ def get_intrinsics(W, H, fov_rad):
873
+ focal = 0.5 * H / np.tan(0.5 * fov_rad)
874
+ return np.array(
875
+ [[focal, 0.0, 0.5 * W], [0.0, focal, 0.5 * H], [0.0, 0.0, 1.0]]
876
+ )
877
+
878
+ camera_traj_list = []
879
+ for i in range(num_frames):
880
+ maybe_pose_and_fov_rad = camera_traj.interpolate_pose_and_fov_rad(
881
+ i / num_frames
882
+ )
883
+ if maybe_pose_and_fov_rad is None:
884
+ return
885
+ pose, fov_rad = maybe_pose_and_fov_rad
886
+ H = img_wh[1]
887
+ W = img_wh[0]
888
+ K = get_intrinsics(W, H, fov_rad)
889
+ w2c = pose.inverse().as_matrix()
890
+ camera_traj_list.append(
891
+ {
892
+ "w2c": w2c.flatten().tolist(),
893
+ "K": K.flatten().tolist(),
894
+ "img_wh": (W, H),
895
+ }
896
+ )
897
+ nonlocal gui_state
898
+ gui_state.camera_traj_list = camera_traj_list
899
+ print(f"Get camera_traj_list: {gui_state.camera_traj_list}")
900
+
901
+ stop_preview_render()
902
+
903
+ preview_frame_slider = add_preview_frame_slider()
904
+
905
+ loop_checkbox = server.gui.add_checkbox(
906
+ "Loop", False, hint="Add a segment between the first and last keyframes."
907
+ )
908
+
909
+ @loop_checkbox.on_update
910
+ def _(_) -> None:
911
+ camera_traj.loop = loop_checkbox.value
912
+ duration_number.value = camera_traj.compute_duration()
913
+
914
+ @transition_sec_number.on_update
915
+ def _(_) -> None:
916
+ camera_traj.default_transition_sec = transition_sec_number.value
917
+ duration_number.value = camera_traj.compute_duration()
918
+
919
+ preview_camera_handle: viser.SceneNodeHandle | None = None
920
+
921
+ def remove_preview_camera() -> None:
922
+ nonlocal preview_camera_handle
923
+ if preview_camera_handle is not None:
924
+ preview_camera_handle.remove()
925
+ preview_camera_handle = None
926
+
927
+ def compute_and_update_preview_camera_state() -> tuple[vt.SE3, float] | None:
928
+ """Update the render tab state with the current preview camera pose.
929
+ Returns current camera pose + FOV if available."""
930
+
931
+ if preview_frame_slider is None:
932
+ return None
933
+ maybe_pose_and_fov_rad = camera_traj.interpolate_pose_and_fov_rad(
934
+ preview_frame_slider.value / get_max_frame_index()
935
+ )
936
+ if maybe_pose_and_fov_rad is None:
937
+ remove_preview_camera()
938
+ return None
939
+ pose, fov_rad = maybe_pose_and_fov_rad
940
+ gui_state.preview_fov = fov_rad
941
+ gui_state.preview_aspect = camera_traj.get_aspect()
942
+ return pose, fov_rad
943
+
944
+ # We back up the camera poses before and after we start previewing renders.
945
+ camera_pose_backup_from_id: dict[int, tuple] = {}
946
+
947
+ # Update the # of frames.
948
+ @duration_number.on_update
949
+ @framerate_number.on_update
950
+ def _(_) -> None:
951
+ remove_preview_camera() # Will be re-added when slider is updated.
952
+
953
+ nonlocal preview_frame_slider
954
+ old = preview_frame_slider
955
+ assert old is not None
956
+
957
+ preview_frame_slider = add_preview_frame_slider()
958
+ if preview_frame_slider is not None:
959
+ old.remove()
960
+ else:
961
+ preview_frame_slider = old
962
+
963
+ camera_traj.framerate = framerate_number.value
964
+ camera_traj.update_spline()
965
+
966
+ camera_traj = CameraTrajectory(
967
+ server,
968
+ duration_number,
969
+ scene_node_prefix=scene_node_prefix,
970
+ **kwargs,
971
+ )
972
+ camera_traj.default_fov = fov_degree_slider.value / 180.0 * np.pi
973
+ camera_traj.default_transition_sec = transition_sec_number.value
974
+
975
+ return gui_state
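
For orientation, a minimal usage sketch of the GUI defined above (not part of the commit): it assumes a running viser server, that `define_gui` is importable from `seva.gui`, and that any extra keyword arguments accepted by `CameraTrajectory` (e.g. `scene_scale`) keep their defaults; the polling loop is illustrative only.

    import time
    import viser
    from seva.gui import define_gui  # assumed import path

    server = viser.ViserServer(port=8080)
    # Build the trajectory editor; extra kwargs are forwarded to CameraTrajectory.
    gui_state = define_gui(server, init_fov=60.0, img_wh=(576, 576))

    # Block until the user clicks "Set camera trajectory" in the browser UI.
    while gui_state.camera_traj_list is None:
        time.sleep(0.5)

    # Each entry carries a flattened 4x4 w2c matrix, a flattened 3x3 K, and (W, H).
    print(f"Collected {len(gui_state.camera_traj_list)} camera poses.")
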
seva/model.py ADDED
@@ -0,0 +1,234 @@
1
+ from dataclasses import dataclass, field
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from seva.modules.layers import (
7
+ Downsample,
8
+ GroupNorm32,
9
+ ResBlock,
10
+ TimestepEmbedSequential,
11
+ Upsample,
12
+ timestep_embedding,
13
+ )
14
+ from seva.modules.transformer import MultiviewTransformer
15
+
16
+
17
+ @dataclass
18
+ class SevaParams(object):
19
+ in_channels: int = 11
20
+ model_channels: int = 320
21
+ out_channels: int = 4
22
+ num_frames: int = 21
23
+ num_res_blocks: int = 2
24
+ attention_resolutions: list[int] = field(default_factory=lambda: [4, 2, 1])
25
+ channel_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
26
+ num_head_channels: int = 64
27
+ transformer_depth: list[int] = field(default_factory=lambda: [1, 1, 1, 1])
28
+ context_dim: int = 1024
29
+ dense_in_channels: int = 6
30
+ dropout: float = 0.0
31
+ unflatten_names: list[str] = field(
32
+ default_factory=lambda: ["middle_ds8", "output_ds4", "output_ds2"]
33
+ )
34
+
35
+ def __post_init__(self):
36
+ assert len(self.channel_mult) == len(self.transformer_depth)
37
+
38
+
39
+ class Seva(nn.Module):
40
+ def __init__(self, params: SevaParams) -> None:
41
+ super().__init__()
42
+ self.params = params
43
+ self.model_channels = params.model_channels
44
+ self.out_channels = params.out_channels
45
+ self.num_head_channels = params.num_head_channels
46
+
47
+ time_embed_dim = params.model_channels * 4
48
+ self.time_embed = nn.Sequential(
49
+ nn.Linear(params.model_channels, time_embed_dim),
50
+ nn.SiLU(),
51
+ nn.Linear(time_embed_dim, time_embed_dim),
52
+ )
53
+
54
+ self.input_blocks = nn.ModuleList(
55
+ [
56
+ TimestepEmbedSequential(
57
+ nn.Conv2d(params.in_channels, params.model_channels, 3, padding=1)
58
+ )
59
+ ]
60
+ )
61
+ self._feature_size = params.model_channels
62
+ input_block_chans = [params.model_channels]
63
+ ch = params.model_channels
64
+ ds = 1
65
+ for level, mult in enumerate(params.channel_mult):
66
+ for _ in range(params.num_res_blocks):
67
+ input_layers: list[ResBlock | MultiviewTransformer | Downsample] = [
68
+ ResBlock(
69
+ channels=ch,
70
+ emb_channels=time_embed_dim,
71
+ out_channels=mult * params.model_channels,
72
+ dense_in_channels=params.dense_in_channels,
73
+ dropout=params.dropout,
74
+ )
75
+ ]
76
+ ch = mult * params.model_channels
77
+ if ds in params.attention_resolutions:
78
+ num_heads = ch // params.num_head_channels
79
+ dim_head = params.num_head_channels
80
+ input_layers.append(
81
+ MultiviewTransformer(
82
+ ch,
83
+ num_heads,
84
+ dim_head,
85
+ name=f"input_ds{ds}",
86
+ depth=params.transformer_depth[level],
87
+ context_dim=params.context_dim,
88
+ unflatten_names=params.unflatten_names,
89
+ )
90
+ )
91
+ self.input_blocks.append(TimestepEmbedSequential(*input_layers))
92
+ self._feature_size += ch
93
+ input_block_chans.append(ch)
94
+ if level != len(params.channel_mult) - 1:
95
+ ds *= 2
96
+ out_ch = ch
97
+ self.input_blocks.append(
98
+ TimestepEmbedSequential(Downsample(ch, out_channels=out_ch))
99
+ )
100
+ ch = out_ch
101
+ input_block_chans.append(ch)
102
+ self._feature_size += ch
103
+
104
+ num_heads = ch // params.num_head_channels
105
+ dim_head = params.num_head_channels
106
+
107
+ self.middle_block = TimestepEmbedSequential(
108
+ ResBlock(
109
+ channels=ch,
110
+ emb_channels=time_embed_dim,
111
+ out_channels=None,
112
+ dense_in_channels=params.dense_in_channels,
113
+ dropout=params.dropout,
114
+ ),
115
+ MultiviewTransformer(
116
+ ch,
117
+ num_heads,
118
+ dim_head,
119
+ name=f"middle_ds{ds}",
120
+ depth=params.transformer_depth[-1],
121
+ context_dim=params.context_dim,
122
+ unflatten_names=params.unflatten_names,
123
+ ),
124
+ ResBlock(
125
+ channels=ch,
126
+ emb_channels=time_embed_dim,
127
+ out_channels=None,
128
+ dense_in_channels=params.dense_in_channels,
129
+ dropout=params.dropout,
130
+ ),
131
+ )
132
+ self._feature_size += ch
133
+
134
+ self.output_blocks = nn.ModuleList([])
135
+ for level, mult in list(enumerate(params.channel_mult))[::-1]:
136
+ for i in range(params.num_res_blocks + 1):
137
+ ich = input_block_chans.pop()
138
+ output_layers: list[ResBlock | MultiviewTransformer | Upsample] = [
139
+ ResBlock(
140
+ channels=ch + ich,
141
+ emb_channels=time_embed_dim,
142
+ out_channels=params.model_channels * mult,
143
+ dense_in_channels=params.dense_in_channels,
144
+ dropout=params.dropout,
145
+ )
146
+ ]
147
+ ch = params.model_channels * mult
148
+ if ds in params.attention_resolutions:
149
+ num_heads = ch // params.num_head_channels
150
+ dim_head = params.num_head_channels
151
+
152
+ output_layers.append(
153
+ MultiviewTransformer(
154
+ ch,
155
+ num_heads,
156
+ dim_head,
157
+ name=f"output_ds{ds}",
158
+ depth=params.transformer_depth[level],
159
+ context_dim=params.context_dim,
160
+ unflatten_names=params.unflatten_names,
161
+ )
162
+ )
163
+ if level and i == params.num_res_blocks:
164
+ out_ch = ch
165
+ ds //= 2
166
+ output_layers.append(Upsample(ch, out_ch))
167
+ self.output_blocks.append(TimestepEmbedSequential(*output_layers))
168
+ self._feature_size += ch
169
+
170
+ self.out = nn.Sequential(
171
+ GroupNorm32(32, ch),
172
+ nn.SiLU(),
173
+ nn.Conv2d(self.model_channels, params.out_channels, 3, padding=1),
174
+ )
175
+
176
+ def forward(
177
+ self,
178
+ x: torch.Tensor,
179
+ t: torch.Tensor,
180
+ y: torch.Tensor,
181
+ dense_y: torch.Tensor,
182
+ num_frames: int | None = None,
183
+ ) -> torch.Tensor:
184
+ num_frames = num_frames or self.params.num_frames
185
+ t_emb = timestep_embedding(t, self.model_channels)
186
+ t_emb = self.time_embed(t_emb)
187
+
188
+ hs = []
189
+ h = x
190
+ for module in self.input_blocks:
191
+ h = module(
192
+ h,
193
+ emb=t_emb,
194
+ context=y,
195
+ dense_emb=dense_y,
196
+ num_frames=num_frames,
197
+ )
198
+ hs.append(h)
199
+ h = self.middle_block(
200
+ h,
201
+ emb=t_emb,
202
+ context=y,
203
+ dense_emb=dense_y,
204
+ num_frames=num_frames,
205
+ )
206
+ for module in self.output_blocks:
207
+ h = torch.cat([h, hs.pop()], dim=1)
208
+ h = module(
209
+ h,
210
+ emb=t_emb,
211
+ context=y,
212
+ dense_emb=dense_y,
213
+ num_frames=num_frames,
214
+ )
215
+ h = h.type(x.dtype)
216
+ return self.out(h)
217
+
218
+
219
+ class SGMWrapper(nn.Module):
220
+ def __init__(self, module: Seva):
221
+ super().__init__()
222
+ self.module = module
223
+
224
+ def forward(
225
+ self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs
226
+ ) -> torch.Tensor:
227
+ x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1)
228
+ return self.module(
229
+ x,
230
+ t=t,
231
+ y=c["crossattn"],
232
+ dense_y=c["dense_vector"],
233
+ **kwargs,
234
+ )
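
A brief, hedged sketch of instantiating the denoiser defined above; it only constructs the module and reports its size (the import path follows this file's location, and no pretrained weights are loaded here):

    from seva.model import Seva, SevaParams, SGMWrapper

    params = SevaParams()            # dataclass defaults from above
    model = Seva(params).eval()
    wrapped = SGMWrapper(model)      # adds the channel-concat conditioning path

    num_params = sum(p.numel() for p in model.parameters())
    print(f"Seva UNet: {num_params / 1e6:.1f}M parameters")
    # Seva.forward(x, t, y, dense_y) expects `in_channels`-channel latents, timesteps,
    # CLIP context of width `context_dim`, and dense pose embeddings (see SevaParams).
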
seva/modules/__init__.py ADDED
File without changes
seva/modules/autoencoder.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ from diffusers.models import AutoencoderKL # type: ignore
3
+ from torch import nn
4
+
5
+
6
+ class AutoEncoder(nn.Module):
7
+ scale_factor: float = 0.18215
8
+ downsample: int = 8
9
+
10
+ def __init__(self, chunk_size: int | None = None):
11
+ super().__init__()
12
+ self.module = AutoencoderKL.from_pretrained(
13
+ "stabilityai/stable-diffusion-2-1-base",
14
+ subfolder="vae",
15
+ force_download=False,
16
+ low_cpu_mem_usage=False,
17
+ )
18
+ self.module.eval().requires_grad_(False) # type: ignore
19
+ self.chunk_size = chunk_size
20
+
21
+ def _encode(self, x: torch.Tensor) -> torch.Tensor:
22
+ return (
23
+ self.module.encode(x).latent_dist.mean # type: ignore
24
+ * self.scale_factor
25
+ )
26
+
27
+ def encode(self, x: torch.Tensor, chunk_size: int | None = None) -> torch.Tensor:
28
+ chunk_size = chunk_size or self.chunk_size
29
+ if chunk_size is not None:
30
+ return torch.cat(
31
+ [self._encode(x_chunk) for x_chunk in x.split(chunk_size)],
32
+ dim=0,
33
+ )
34
+ else:
35
+ return self._encode(x)
36
+
37
+ def _decode(self, z: torch.Tensor) -> torch.Tensor:
38
+ return self.module.decode(z / self.scale_factor).sample # type: ignore
39
+
40
+ def decode(self, z: torch.Tensor, chunk_size: int | None = None) -> torch.Tensor:
41
+ chunk_size = chunk_size or self.chunk_size
42
+ if chunk_size is not None:
43
+ return torch.cat(
44
+ [self._decode(z_chunk) for z_chunk in z.split(chunk_size)],
45
+ dim=0,
46
+ )
47
+ else:
48
+ return self._decode(z)
49
+
50
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
51
+ return self.decode(self.encode(x))
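
A hedged round-trip sketch of the VAE wrapper above (it downloads the Stable Diffusion 2.1 VAE weights on first use; the input range and image size are assumptions consistent with the 8x downsample noted in the class):

    import torch
    from seva.modules.autoencoder import AutoEncoder

    vae = AutoEncoder(chunk_size=2).to("cuda")                   # chunking bounds peak memory
    images = torch.rand(4, 3, 576, 576, device="cuda") * 2 - 1   # assumed [-1, 1] inputs
    with torch.no_grad():
        latents = vae.encode(images)   # -> (4, 4, 72, 72) with downsample == 8
        recon = vae.decode(latents)    # -> (4, 3, 576, 576)
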
seva/modules/conditioner.py ADDED
@@ -0,0 +1,39 @@
1
+ import kornia
2
+ import open_clip
3
+ import torch
4
+ from torch import nn
5
+
6
+
7
+ class CLIPConditioner(nn.Module):
8
+ mean: torch.Tensor
9
+ std: torch.Tensor
10
+
11
+ def __init__(self):
12
+ super().__init__()
13
+ self.module = open_clip.create_model_and_transforms(
14
+ "ViT-H-14", pretrained="laion2b_s32b_b79k"
15
+ )[0]
16
+ self.module.eval().requires_grad_(False) # type: ignore
17
+ self.register_buffer(
18
+ "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
19
+ )
20
+ self.register_buffer(
21
+ "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
22
+ )
23
+
24
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
25
+ x = kornia.geometry.resize(
26
+ x,
27
+ (224, 224),
28
+ interpolation="bicubic",
29
+ align_corners=True,
30
+ antialias=True,
31
+ )
32
+ x = (x + 1.0) / 2.0
33
+ x = kornia.enhance.normalize(x, self.mean, self.std)
34
+ return x
35
+
36
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
37
+ x = self.preprocess(x)
38
+ x = self.module.encode_image(x)
39
+ return x
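
A short, hedged sketch of the CLIP conditioner above; inputs are assumed to be in [-1, 1] (the preprocess step rescales to [0, 1] before normalization), and the 1024-dim output matches `context_dim` in the model config:

    import torch
    from seva.modules.conditioner import CLIPConditioner

    clip = CLIPConditioner().to("cuda")
    frames = torch.rand(2, 3, 576, 576, device="cuda") * 2 - 1
    with torch.no_grad():
        emb = clip(frames)
    print(emb.shape)  # expected: torch.Size([2, 1024]) for ViT-H-14
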
seva/modules/layers.py ADDED
@@ -0,0 +1,139 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from einops import repeat
6
+ from torch import nn
7
+
8
+ from .transformer import MultiviewTransformer
9
+
10
+
11
+ def timestep_embedding(
12
+ timesteps: torch.Tensor,
13
+ dim: int,
14
+ max_period: int = 10000,
15
+ repeat_only: bool = False,
16
+ ) -> torch.Tensor:
17
+ if not repeat_only:
18
+ half = dim // 2
19
+ freqs = torch.exp(
20
+ -math.log(max_period)
21
+ * torch.arange(start=0, end=half, dtype=torch.float32)
22
+ / half
23
+ ).to(device=timesteps.device)
24
+ args = timesteps[:, None].float() * freqs[None]
25
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
26
+ if dim % 2:
27
+ embedding = torch.cat(
28
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
29
+ )
30
+ else:
31
+ embedding = repeat(timesteps, "b -> b d", d=dim)
32
+ return embedding
33
+
34
+
35
+ class Upsample(nn.Module):
36
+ def __init__(self, channels: int, out_channels: int | None = None):
37
+ super().__init__()
38
+ self.channels = channels
39
+ self.out_channels = out_channels or channels
40
+ self.conv = nn.Conv2d(self.channels, self.out_channels, 3, 1, 1)
41
+
42
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
43
+ assert x.shape[1] == self.channels
44
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
45
+ x = self.conv(x)
46
+ return x
47
+
48
+
49
+ class Downsample(nn.Module):
50
+ def __init__(self, channels: int, out_channels: int | None = None):
51
+ super().__init__()
52
+ self.channels = channels
53
+ self.out_channels = out_channels or channels
54
+ self.op = nn.Conv2d(self.channels, self.out_channels, 3, 2, 1)
55
+
56
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
57
+ assert x.shape[1] == self.channels
58
+ return self.op(x)
59
+
60
+
61
+ class GroupNorm32(nn.GroupNorm):
62
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
63
+ return super().forward(input.float()).type(input.dtype)
64
+
65
+
66
+ class TimestepEmbedSequential(nn.Sequential):
67
+ def forward( # type: ignore[override]
68
+ self,
69
+ x: torch.Tensor,
70
+ emb: torch.Tensor,
71
+ context: torch.Tensor,
72
+ dense_emb: torch.Tensor,
73
+ num_frames: int,
74
+ ) -> torch.Tensor:
75
+ for layer in self:
76
+ if isinstance(layer, MultiviewTransformer):
77
+ assert num_frames is not None
78
+ x = layer(x, context, num_frames)
79
+ elif isinstance(layer, ResBlock):
80
+ x = layer(x, emb, dense_emb)
81
+ else:
82
+ x = layer(x)
83
+ return x
84
+
85
+
86
+ class ResBlock(nn.Module):
87
+ def __init__(
88
+ self,
89
+ channels: int,
90
+ emb_channels: int,
91
+ out_channels: int | None,
92
+ dense_in_channels: int,
93
+ dropout: float,
94
+ ):
95
+ super().__init__()
96
+ out_channels = out_channels or channels
97
+
98
+ self.in_layers = nn.Sequential(
99
+ GroupNorm32(32, channels),
100
+ nn.SiLU(),
101
+ nn.Conv2d(channels, out_channels, 3, 1, 1),
102
+ )
103
+ self.emb_layers = nn.Sequential(
104
+ nn.SiLU(), nn.Linear(emb_channels, out_channels)
105
+ )
106
+ self.dense_emb_layers = nn.Sequential(
107
+ nn.Conv2d(dense_in_channels, 2 * channels, 1, 1, 0)
108
+ )
109
+ self.out_layers = nn.Sequential(
110
+ GroupNorm32(32, out_channels),
111
+ nn.SiLU(),
112
+ nn.Dropout(dropout),
113
+ nn.Conv2d(out_channels, out_channels, 3, 1, 1),
114
+ )
115
+ if out_channels == channels:
116
+ self.skip_connection = nn.Identity()
117
+ else:
118
+ self.skip_connection = nn.Conv2d(channels, out_channels, 1, 1, 0)
119
+
120
+ def forward(
121
+ self, x: torch.Tensor, emb: torch.Tensor, dense_emb: torch.Tensor
122
+ ) -> torch.Tensor:
123
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
124
+ h = in_rest(x)
125
+ dense = self.dense_emb_layers(
126
+ F.interpolate(
127
+ dense_emb, size=h.shape[2:], mode="bilinear", align_corners=True
128
+ )
129
+ ).type(h.dtype)
130
+ dense_scale, dense_shift = torch.chunk(dense, 2, dim=1)
131
+ h = h * (1 + dense_scale) + dense_shift
132
+ h = in_conv(h)
133
+ emb_out = self.emb_layers(emb).type(h.dtype)
134
+ while len(emb_out.shape) < len(h.shape):
135
+ emb_out = emb_out[..., None]
136
+ h = h + emb_out
137
+ h = self.out_layers(h)
138
+ h = self.skip_connection(x) + h
139
+ return h
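
To make the sinusoidal `timestep_embedding` above concrete, a tiny example (the import assumes this file's location in the package):

    import torch
    from seva.modules.layers import timestep_embedding

    t = torch.tensor([0.0, 250.0, 999.0])
    emb = timestep_embedding(t, dim=320)  # concatenated cos/sin features, shape (3, 320)
    print(emb.shape)
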
seva/modules/preprocessor.py ADDED
@@ -0,0 +1,116 @@
1
+ import contextlib
2
+ import os
3
+ import os.path as osp
4
+ import sys
5
+ from typing import cast
6
+
7
+ import imageio.v3 as iio
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class Dust3rPipeline(object):
13
+ def __init__(self, device: str | torch.device = "cuda"):
14
+ submodule_path = osp.realpath(
15
+ osp.join(osp.dirname(__file__), "../../third_party/dust3r/")
16
+ )
17
+ if submodule_path not in sys.path:
18
+ sys.path.insert(0, submodule_path)
19
+ try:
20
+ with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
21
+ from dust3r.cloud_opt import ( # type: ignore[import]
22
+ GlobalAlignerMode,
23
+ global_aligner,
24
+ )
25
+ from dust3r.image_pairs import make_pairs # type: ignore[import]
26
+ from dust3r.inference import inference # type: ignore[import]
27
+ from dust3r.model import AsymmetricCroCo3DStereo # type: ignore[import]
28
+ from dust3r.utils.image import load_images # type: ignore[import]
29
+ except ImportError:
30
+ raise ImportError(
31
+ "Missing required submodule: 'dust3r'. Please ensure that all submodules are properly set up.\n\n"
32
+ "To initialize them, run the following command in the project root:\n"
33
+ " git submodule update --init --recursive"
34
+ )
35
+
36
+ self.device = torch.device(device)
37
+ self.model = AsymmetricCroCo3DStereo.from_pretrained(
38
+ "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
39
+ ).to(self.device)
40
+
41
+ self._GlobalAlignerMode = GlobalAlignerMode
42
+ self._global_aligner = global_aligner
43
+ self._make_pairs = make_pairs
44
+ self._inference = inference
45
+ self._load_images = load_images
46
+
47
+ def infer_cameras_and_points(
48
+ self,
49
+ img_paths: list[str],
50
+ Ks: list[list] = None,
51
+ c2ws: list[list] = None,
52
+ batch_size: int = 16,
53
+ schedule: str = "cosine",
54
+ lr: float = 0.01,
55
+ niter: int = 500,
56
+ min_conf_thr: int = 3,
57
+ ) -> tuple[
58
+ list[np.ndarray], np.ndarray, np.ndarray, list[np.ndarray], list[np.ndarray]
59
+ ]:
60
+ num_img = len(img_paths)
61
+ if num_img == 1:
62
+ print("Only one image found, duplicating it to create a stereo pair.")
63
+ img_paths = img_paths * 2
64
+
65
+ images = self._load_images(img_paths, size=512)
66
+ pairs = self._make_pairs(
67
+ images,
68
+ scene_graph="complete",
69
+ prefilter=None,
70
+ symmetrize=True,
71
+ )
72
+ output = self._inference(pairs, self.model, self.device, batch_size=batch_size)
73
+
74
+ ori_imgs = [iio.imread(p) for p in img_paths]
75
+ ori_img_whs = np.array([img.shape[1::-1] for img in ori_imgs])
76
+ img_whs = np.concatenate([image["true_shape"][:, ::-1] for image in images], 0)
77
+
78
+ scene = self._global_aligner(
79
+ output,
80
+ device=self.device,
81
+ mode=self._GlobalAlignerMode.PointCloudOptimizer,
82
+ same_focals=True,
83
+ optimize_pp=False, # True,
84
+ min_conf_thr=min_conf_thr,
85
+ )
86
+
87
+ # if Ks is not None:
88
+ # scene.preset_focal(
89
+ # torch.tensor([[K[0, 0], K[1, 1]] for K in Ks])
90
+ # )
91
+
92
+ if c2ws is not None:
93
+ scene.preset_pose(c2ws)
94
+
95
+ _ = scene.compute_global_alignment(
96
+ init="msp", niter=niter, schedule=schedule, lr=lr
97
+ )
98
+
99
+ imgs = cast(list, scene.imgs)
100
+ Ks = scene.get_intrinsics().detach().cpu().numpy().copy()
101
+ c2ws = scene.get_im_poses().detach().cpu().numpy() # type: ignore
102
+ pts3d = [x.detach().cpu().numpy() for x in scene.get_pts3d()] # type: ignore
103
+ if num_img > 1:
104
+ masks = [x.detach().cpu().numpy() for x in scene.get_masks()]
105
+ points = [p[m] for p, m in zip(pts3d, masks)]
106
+ point_colors = [img[m] for img, m in zip(imgs, masks)]
107
+ else:
108
+ points = [p.reshape(-1, 3) for p in pts3d]
109
+ point_colors = [img.reshape(-1, 3) for img in imgs]
110
+
111
+ # Convert back to the original image size.
112
+ imgs = ori_imgs
113
+ Ks[:, :2, -1] *= ori_img_whs / img_whs
114
+ Ks[:, :2, :2] *= (ori_img_whs / img_whs).mean(axis=1, keepdims=True)[..., None]
115
+
116
+ return imgs, Ks, c2ws, points, point_colors
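
Finally, a hedged sketch of driving the DUSt3R-based preprocessor above; the asset path is taken from the repository listing, the checkpoint is downloaded on first use, and the dust3r submodule must be initialized as described in the import error message:

    from seva.modules.preprocessor import Dust3rPipeline

    pipeline = Dust3rPipeline(device="cuda")
    imgs, Ks, c2ws, points, point_colors = pipeline.infer_cameras_and_points(
        ["assets/basic/llff-room.jpg"]        # a single image is duplicated internally
    )
    print(Ks.shape, c2ws.shape)               # per-view 3x3 intrinsics and camera-to-world poses
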